def _cur_tag_X_and_y(self, tag, test_random=False, all_ground_truth_binary=True):
    """
    Assemble the regression design matrix X and response vector y for one
    tag and store them in self.X[tag] and self.y[tag].

    y comes from self.ground_truth[tag]; if it has fewer than
    self.min_tag_count nonzero entries the tag is removed instead and
    nothing is stored.  X gets one column per entry of
    self.sorted_sources[tag]: an optional all-ones intercept, optionally
    the standardized log-scrobble counts, and one standardized column per
    remaining feature source ("*_interaction" sources are the pointwise
    product of the main feature with the scrobble counts).

    Parameters:
        tag: the tag whose data to assemble.
        test_random: if True (and this is not a production run), replace
            X and y with synthetic random data for testing.
        all_ground_truth_binary: if True -- or when self.regtype is a
            logistic type -- binarize y to 0.0/1.0.
    """
    sorted_sources = self.sorted_sources[tag]
    cur_song_list = self.song_lists[tag]
    features_dict = self.features[tag]
    y = rc.matrix(self._dict_to_vec(self.ground_truth[tag], self.song_lists[tag]))
    if util.num_nonzeros(y) < self.min_tag_count:
        self._remove_tag(tag, verbosity=0)
        return
    # Possibly add intercept.
    ncol = 0
    x_vec = []
    if "intercept" in sorted_sources:
        ncol += 1
        x_vec.extend([1.0 for songid in cur_song_list])
    # Scrobble counts: also needed below for the interaction features,
    # so compute them unconditionally.
    scrobble_vec = numpy.array([self.log_scrobble_counts.get(songid, self.avg_scrobble) for songid in cur_song_list])
    if "scrobble" in sorted_sources:
        ncol += 1
        x_vec.extend(self._standardize(scrobble_vec))
    # Add remaining features.
    for source in sorted_sources:
        if source.endswith("_interaction"):
            main_source = util.remove_trailing_string(source, "_interaction")
            main_vec = self._dict_to_vec(features_dict.get(main_source, None), cur_song_list)
            feature_vec = numpy.multiply(main_vec, scrobble_vec)  # pointwise product
        elif source not in ["intercept", "scrobble"]:
            feature_vec = self._dict_to_vec(features_dict.get(source, None), cur_song_list)
        else:
            continue  # intercept/scrobble were already handled above
        x_vec.extend(self._standardize(feature_vec))
        ncol += 1
    try:
        X = rc.matrix(x_vec, ncol=ncol)
    except Exception:
        # BUGFIX: was a bare "except:" that dropped into pdb.set_trace()
        # (a leftover debug hook marked "rmme!").  If the matrix cannot
        # be built, just drop the tag as before.
        self._remove_tag(tag)
        return
    if test_random and not self.production_run:  # Erase all of the above and use random numbers for testing.
        n_songs = len(X)
        X = numpy.random.standard_normal((n_songs, ncol))
        if ncol == 1:
            y = 3 * X[:, 0] + 0.5 * numpy.random.standard_normal((1, n_songs))
        else:
            y = 3 * X[:, 0] + X[:, 1] + 0.5 * numpy.random.standard_normal((1, n_songs))
        y = y.transpose()
    if all_ground_truth_binary or self.regtype == "Independent Logistic" or self.regtype == "Hierarchical Logistic":
        # Convert y to 0's and 1's (multiply by 1.0 to make Float).
        y = 1.0 * (numpy.array(y) > 0)
    self.X[tag] = X
    self.y[tag] = y
    def plotNumLegend(self, colVecL, breakL, nb_breaks, filename=None, legendDir=None, type='png', int_labels=False):
        """Render a horizontal color-bar legend image via R.

        Draws breakL as a one-column image colored with colVecL and
        labels the x-axis with nb_breaks evenly spaced tick values
        spanning [min(breakL), max(breakL)].

        Parameters:
            colVecL: color vector passed to R's image().
            breakL: break values defining the legend scale.
            nb_breaks: number of axis ticks to draw.
            filename: output file name; derived from the data if None.
            legendDir: output directory; defaults to self.legendDir.
            type: plot device type (e.g. 'png').
            int_labels: if True, format tick labels as integers.
        """
        if filename is None:
            filename = 'legend_%i_%i_%i' % (len(colVecL), min(breakL), max(breakL))

        if legendDir is None:
            legendDir = self.legendDir

        full_filename = os.path.join(legendDir, filename)

        max_break = max(breakL)
        min_break = min(breakL)
        # BUGFIX: linear interpolation from min_break to max_break must
        # ADD min_break; the previous "- min_break" shifted every tick
        # by -2 * min_break whenever min_break != 0.
        tickBreakL = [float(x) / (nb_breaks - 1) * (max_break - min_break) + min_break for x in range(nb_breaks)]
        if int_labels:
            labels = ['%i' % int(x) for x in tickBreakL]
        else:
            labels = ['%.3f' % x for x in tickBreakL]

        rdev = plot_utilities.RDevice(name=full_filename, title='', plotType=type,
            width=640, height=120)
        r("par(mar=c(3,1,0,1), las=2)")
        legendA = r.matrix(breakL, byrow=False, ncol=1)
        r.image(legendA, 1, legendA, col=colVecL, axes=False, ann=False)
        r.box()
        r.axis(1, at=tickBreakL, labels=labels, tick=True, line=0, cex=0.8, cex_axis=0.8)
        rdev.close()

        return
# Esempio n. 3 (example separator from source scrape)
# 0
def matrix_regression(input_vals, output_vals, funcs, categorical = False):
    '''Linear regression on a matrix of input properties and input values.

    Produces a linear model for factors influencing output values.
    funcs must be tuples (title, func), each func mapping an input VALUE
    to a numeric feature.  input_vals and output_vals must be the same
    length.

    Returns (coefficients, regression_func, model_vals,
    least_squares_quality, linear_quality).
    '''

    assert(len(input_vals) == len(output_vals))

    # BUGFIX: work on a copy so the caller's funcs list is not mutated
    # by the intercept addition below.
    funcs = list(funcs)

    # Add a constant function for the intercept if none was supplied.
    if 'Intercept' not in map(lambda x: x[0], funcs):
        funcs.append(('Intercept', lambda x: 1))

    # Build up a matrix of each input function on each input val.
    m = r.matrix(1.0, nrow=len(input_vals), ncol=len(funcs))

    for i in xrange(len(input_vals)):
        for j in xrange(len(funcs)):
            # BUGFIX: apply each function to the input VALUE, not its
            # index -- regression_func below applies funcs to values, so
            # the fit must be computed from the same quantities.
            m[i][j] = funcs[j][1](input_vals[i])
            if categorical and m[i][j]: m[i][j] = 1

    # Regression, done in R.
    fit = r.lsfit(m, output_vals, intercept=False)

    # Extract coefficients in column order (R names them "X1", "X2", ...).
    coefficients = map(lambda x: x[1], sorted(fit['coefficients'].items(), key=lambda x: int(x[0][1:])))
    regression_func = lambda x, c=coefficients: sum([c[j] * funcs[j][1](x) for j in xrange(len(funcs))])
    model_vals = map(regression_func, input_vals)
    least_squares_quality = least_squares_error(model_vals, output_vals)
    linear_quality = linear_error(model_vals, output_vals)

    return coefficients, regression_func, model_vals, least_squares_quality, linear_quality
# Esempio n. 4 (example separator from source scrape)
# 0
def MK_fisher_pvalue(win_snp, win_div, AR_snp, AR_div):
    """Fisher exact test p-value for a 2x2 McDonald-Kreitman-style table.

    An all-zero table carries no information, so p = 1.0 is returned
    directly without calling R.
    """
    counts = [win_snp, win_div, AR_snp, AR_div]
    if not any(counts):
        return 1.0

    table = r.matrix(r.c(counts), nr=2)
    return r.fisher_test(table)['p.value']
# Esempio n. 5 (example separator from source scrape)
# 0
	def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix, prior_prob, bit_string='11111'):
		"""
		Fit an rpart classification tree on known_data, then use the fit
		to predict class labels for all_data and for known_data itself.

		Parameters:
			all_data: rows of [p_value, recurrence, connectivity,
				cluster_size, gradient, ..., is_correct] (last column is
				the label); predictions are made for every row.
			known_data: rows with the same layout, used to fit the model.
			rpart_cp: complexity parameter passed to rpart.control.
			loss_matrix: loss matrix handed to rpart's parms argument.
			prior_prob: prior probability of the first class; if falsy,
				rpart's default priors are used.
			bit_string: '0'/'1' mask selecting which of the five
				coefficient columns enter the model formula.

		Returns:
			(pred, pred_training): class predictions for all_data and for
			known_data respectively.

		History: 11-09-05 fit on known_data, predict on all_data; add
		rpart_cp.  11-17-05 add loss_matrix and prior_prob; return both
		predictions.
		"""
		sys.stderr.write("rpart fitting and predicting...\n")
		r.library("rpart")
		# The model formula uses only the columns enabled in bit_string.
		coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
		formula_list = []
		for i in range(len(bit_string)):
			if bit_string[i] == '1':
				formula_list.append(coeff_name_list[i])
		#11-17-05 transform into array
		all_data = array(all_data)
		known_data = array(known_data)
		
		# Keep R objects unconverted while the data frame / fit are alive.
		set_default_mode(NO_CONVERSION)
		data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \
			"cluster_size":known_data[:,3], "gradient":known_data[:,4], "is_correct":known_data[:,-1]})
		if prior_prob:
			prior_prob = [prior_prob, 1-prior_prob]	#get the full list
			fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
				parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix) ) )
		else:
			fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
				parms=r.list(loss=r.matrix(loss_matrix) ) )
		
		# Switch back so predictions are converted to Python values.
		set_default_mode(BASIC_CONVERSION)
		pred_training = r.predict(fit, data_frame, type=["class"])
		del data_frame
		
		# Same NO_CONVERSION/BASIC_CONVERSION dance for the full data set.
		set_default_mode(NO_CONVERSION)
		all_data_frame = r.as_data_frame({"p_value":all_data[:,0], "recurrence":all_data[:,1], "connectivity":all_data[:,2], \
			"cluster_size":all_data[:,3], "gradient":all_data[:,4], "is_correct":all_data[:,-1]})
		set_default_mode(BASIC_CONVERSION)
		pred = r.predict(fit, all_data_frame, type=["class"])	#11-17-05 type=c("class")
		del all_data_frame
		sys.stderr.write("Done rpart fitting and predicting.\n")
		return pred, pred_training
    def prepareData(self, resD, getLabel, expL, flip_axis=False):
        """Turn per-experiment results into an R matrix of labels.

        Each id in expL is looked up in resD and mapped through getLabel;
        ids missing from resD get label 0.  The flat label list is shaped
        into an R matrix using self.nb_col (column-major) when flip_axis
        is set, else self.nb_row (row-major).
        """
        # One label per experiment, in expL order.
        hitValueL = [getLabel(resD[exp_id]) if resD.has_key(exp_id) else 0
                     for exp_id in expL]

        if flip_axis:
            return r.matrix(hitValueL, nrow=self.nb_col, byrow=False)
        return r.matrix(hitValueL, nrow=self.nb_row, byrow=True)
    def prepareData(self, resD, getLabel, expL):
        """Turn per-experiment results into a row-major R matrix of labels.

        Each id in expL is looked up in resD and mapped through getLabel;
        ids missing from resD get label 0.  The labels are shaped into an
        R matrix with self.nb_row rows, filled by row.
        """
        labels = []
        for exp_id in expL:
            if resD.has_key(exp_id):
                labels.append(getLabel(resD[exp_id]))
            else:
                labels.append(0)

        return r.matrix(labels, nrow=self.nb_row, byrow=True)
# Esempio n. 8 (example separator from source scrape)
# 0
    def rpart_fit(self, known_data, parameter_list, bit_string="11111"):
        """
        Fit an rpart classification tree on known_data and return the fit.

        Parameters:
            known_data: rows of [p_value, recurrence, connectivity,
                cluster_size, gradient, ..., is_correct] (last column is
                the label).
            parameter_list: (rpart_cp, loss_matrix, prior_prob) --
                rpart_cp is the complexity parameter for rpart.control,
                loss_matrix goes into rpart's parms argument, and
                prior_prob (if truthy) is the prior probability of the
                first class.
            bit_string: '0'/'1' mask selecting which of the five
                coefficient columns enter the model formula.

        Returns:
            the (unconverted) R rpart fit object.

        History: 11-23-05 split out of rpart_fit_and_predict();
        11-27-05 r cleanup; 03-17-06 take parameter_list instead of
        separate arguments.
        """
        if self.debug:
            sys.stderr.write("Doing rpart_fit...\n")
            # 03-17-06
        rpart_cp, loss_matrix, prior_prob = parameter_list

        # 11-27-05 r cleanup
        from rpy import r

        r.library("rpart")

        # The model formula uses only the columns enabled in bit_string.
        coeff_name_list = ["p_value", "recurrence", "connectivity", "cluster_size", "gradient"]
        formula_list = []
        for i in range(len(bit_string)):
            if bit_string[i] == "1":
                formula_list.append(coeff_name_list[i])
                # 11-17-05 transform into array
        known_data = array(known_data)

        # Keep R objects unconverted so the returned fit stays usable by R.
        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": known_data[:, 0],
                "recurrence": known_data[:, 1],
                "connectivity": known_data[:, 2],
                "cluster_size": known_data[:, 3],
                "gradient": known_data[:, 4],
                "is_correct": known_data[:, -1],
            }
        )
        if prior_prob:
            prior_prob = [prior_prob, 1 - prior_prob]  # get the full list
            fit = r.rpart(
                r("is_correct~%s" % "+".join(formula_list)),
                data=data_frame,
                method="class",
                control=r.rpart_control(cp=rpart_cp),
                parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix)),
            )
        else:
            fit = r.rpart(
                r("is_correct~%s" % "+".join(formula_list)),
                data=data_frame,
                method="class",
                control=r.rpart_control(cp=rpart_cp),
                parms=r.list(loss=r.matrix(loss_matrix)),
            )
        del data_frame
        if self.debug:
            sys.stderr.write("Done rpart_fit.\n")
        return fit
# Esempio n. 9 (example separator from source scrape)
# 0
def MK_chi_pvalue(win_snp, win_div, AR_snp, AR_div):
    """Chi-square test p-value for a 2x2 McDonald-Kreitman-style table.

    Consistent with MK_fisher_pvalue: an all-zero table is uninformative
    (and would make R's chisq.test raise an error), so p = 1.0 is
    returned directly without calling R.
    """
    if win_snp == 0 and win_div == 0 and AR_snp == 0 and AR_div == 0:
        return 1.0

    chi_result = r.chisq_test(
        r.matrix(r.c([win_snp, win_div, AR_snp, AR_div]), nr=2))

    return chi_result['p.value']