def _cur_tag_X_and_y(self, tag, test_random=False, all_ground_truth_binary=True):
    """Build the regression design matrix X and response vector y for one tag.

    Columns are assembled in the order implied by ``self.sorted_sources[tag]``:
    an optional all-ones intercept column, a standardized log-scrobble column,
    then the remaining feature columns (``*_interaction`` sources are the
    pointwise product of the main feature with the raw scrobble vector).
    Results are stored in ``self.X[tag]`` and ``self.y[tag]``.

    Side effects: calls ``self._remove_tag(tag)`` and returns early when the
    tag has fewer than ``self.min_tag_count`` nonzero ground-truth entries or
    when matrix construction fails.

    Parameters:
        tag: the tag whose data should be prepared.
        test_random: if True (and not a production run), replace X and y with
            synthetic Gaussian data with a known linear relationship, for
            testing the regression machinery.
        all_ground_truth_binary: if True, threshold y to 0/1 (also done for
            the logistic regression types).
    """
    sorted_sources = self.sorted_sources[tag]
    cur_song_list = self.song_lists[tag]
    features_dict = self.features[tag]
    y = rc.matrix(self._dict_to_vec(self.ground_truth[tag], self.song_lists[tag]))
    if util.num_nonzeros(y) < self.min_tag_count:
        self._remove_tag(tag, verbosity=0)
        return
    # Possibly add intercept.
    ncol = 0
    x_vec = []
    if "intercept" in sorted_sources:
        ncol += 1
        x_vec.extend([1.0 for songid in cur_song_list])
    # Add scrobble counts.  The raw (unstandardized) vector is kept around
    # because the "_interaction" features below multiply against it.
    scrobble_vec = numpy.array([self.log_scrobble_counts.get(songid, self.avg_scrobble)
                                for songid in cur_song_list])
    if "scrobble" in sorted_sources:
        ncol += 1
        x_vec.extend(self._standardize(scrobble_vec))
    # Add remaining features.
    for source in sorted_sources:
        if source.endswith("_interaction"):
            main_source = util.remove_trailing_string(source, "_interaction")
            main_vec = self._dict_to_vec(features_dict.get(main_source, None), cur_song_list)
            feature_vec = numpy.multiply(main_vec, scrobble_vec)  # pointwise product
        elif source not in ["intercept", "scrobble"]:
            feature_vec = self._dict_to_vec(features_dict.get(source, None), cur_song_list)
        else:
            continue  # intercept/scrobble were already handled above
        x_vec.extend(self._standardize(feature_vec))
        ncol += 1
    try:
        X = rc.matrix(x_vec, ncol=ncol)  # dimnames=[[],x_sources])
    except Exception:
        # Matrix construction failed; drop the tag rather than crash.
        # (Replaced a bare "except:" holding a leftover pdb.set_trace()
        # debugging hook, which would hang any non-interactive run.)
        self._remove_tag(tag)
        return
    if test_random and not self.production_run:
        # Erase all of the above and do some random numbers for testing.
        n_songs = len(X)
        X = numpy.random.standard_normal((n_songs, ncol))
        if ncol == 1:
            y = 3 * X[:, 0] + 0.5 * numpy.random.standard_normal((1, n_songs))
        else:
            y = 3 * X[:, 0] + X[:, 1] + 0.5 * numpy.random.standard_normal((1, n_songs))
        y = y.transpose()
    if all_ground_truth_binary or self.regtype == "Independent Logistic" \
            or self.regtype == "Hierarchical Logistic":
        # Convert y to 0's and 1's.
        y = 1.0 * (numpy.array(y) > 0)  # multiply by 1.0 to make Float
    self.X[tag] = X
    self.y[tag] = y
def plotNumLegend(self, colVecL, breakL, nb_breaks, filename=None, legendDir=None, type='png', int_labels=False):
    """Render a horizontal color-bar legend image for a set of break values.

    Draws ``breakL`` as a 1-row R image colored by ``colVecL``, with
    ``nb_breaks`` evenly spaced, labeled tick marks along the bottom axis,
    and writes the plot to ``legendDir/filename``.

    Parameters:
        colVecL: color vector passed to R's image().
        breakL: the break values; their min/max define the tick range.
        nb_breaks: number of tick marks to draw.
        filename: output name; defaults to 'legend_<ncolors>_<min>_<max>'.
        legendDir: output directory; defaults to self.legendDir.
        type: plot device type (e.g. 'png'), forwarded to RDevice.
        int_labels: if True, format tick labels as integers, else '%.3f'.
    """
    if filename is None:
        filename = 'legend_%i_%i_%i' % (len(colVecL), min(breakL), max(breakL))
    if legendDir is None:
        legendDir = self.legendDir
    full_filename = os.path.join(legendDir, filename)
    max_break = max(breakL)
    min_break = min(breakL)
    # Evenly spaced tick positions spanning [min_break, max_break].
    # BUG FIX: the offset was previously subtracted ("- min_break"), which
    # shifted every tick off the image's coordinate range whenever
    # min(breakL) != 0.
    tickBreakL = [float(x) / (nb_breaks - 1) * (max_break - min_break) + min_break
                  for x in range(nb_breaks)]
    if int_labels:
        labels = ['%i' % int(x) for x in tickBreakL]
    else:
        labels = ['%.3f' % x for x in tickBreakL]
    rdev = plot_utilities.RDevice(name=full_filename, title='', plotType=type,
                                  width=640, height=120)
    r("par(mar=c(3,1,0,1), las=2)")
    #legendA = rpy.reshape(breakL, (len(breakL), 1))
    legendA = r.matrix(breakL, byrow=False, ncol=1)
    r.image(legendA, 1, legendA, col=colVecL, axes=False, ann=False)
    r.box()
    r.axis(1, at=tickBreakL, labels=labels, tick=True, line=0, cex=0.8, cex_axis=0.8)
    rdev.close()
    return
def matrix_regression(input_vals, output_vals, funcs, categorical=False):
    '''Linear regression on a matrix of input properties and input values.

    Produces a linear model for factors influencing output values.
    Functions must be tuples (title, func).
    Input_vals and output_vals must be the same size.

    Returns a 5-tuple: (coefficients, regression_func, model_vals,
    least_squares_quality, linear_quality).
    '''
    assert(len(input_vals) == len(output_vals))
    # BUG FIX: work on a copy so the caller's list is not mutated when the
    # intercept term is appended below.
    funcs = list(funcs)
    # Add a constant function for the intercept.
    if 'Intercept' not in [title for title, _ in funcs]:
        funcs.append(('Intercept', lambda x: 1))
    # Build up a matrix of each input function on each input val.
    m = r.matrix(1.0, nrow=len(input_vals), ncol=len(funcs))
    for i in xrange(len(input_vals)):
        for j in xrange(len(funcs)):
            # NOTE(review): the matrix applies each func to the row index i,
            # while regression_func below applies funcs to the input values
            # themselves.  These only agree when input_vals is
            # range(len(input_vals)) -- confirm intent with callers.
            m[i][j] = funcs[j][1](i)
            if categorical and m[i][j]:
                m[i][j] = 1
    # Regression, done in R.
    fit = r.lsfit(m, output_vals, intercept=False)
    # Extract coefficients, ordered by their R name suffix ("X1", "X2", ...).
    coefficients = map(lambda x: x[1],
                       sorted(fit['coefficients'].items(),
                              key=lambda x: int(x[0][1:])))
    regression_func = lambda x, c=coefficients: sum(
        [c[j] * funcs[j][1](x) for j in xrange(len(funcs))])
    model_vals = map(regression_func, input_vals)
    least_squares_quality = least_squares_error(model_vals, output_vals)
    linear_quality = linear_error(model_vals, output_vals)
    return coefficients, regression_func, model_vals, least_squares_quality, linear_quality
def MK_fisher_pvalue(win_snp, win_div, AR_snp, AR_div):
    """Fisher exact test p-value for a 2x2 McDonald-Kreitman count table.

    The four counts fill a 2-row R matrix column-wise. An all-zero table
    carries no information, so p = 1.0 is returned without calling R.
    """
    counts = [win_snp, win_div, AR_snp, AR_div]
    if not any(counts):
        return 1.0
    table = r.matrix(r.c(counts), nr=2)
    return r.fisher_test(table)['p.value']
def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix, prior_prob, bit_string='11111'):
    """Fit an R rpart classification tree on known_data and predict classes.

    First uses known_data to fit the model, then uses the fitted model to
    predict on both known_data (training) and all_data.  Columns 0-4 of both
    arrays are the features named in coeff_name_list; the last column is the
    is_correct response.  bit_string selects which features enter the formula
    (a '1' at position i includes coeff_name_list[i]).

    Parameters:
        all_data: rows to predict on (feature columns + is_correct).
        known_data: training rows (same layout).
        rpart_cp: complexity parameter passed to rpart.control.
        loss_matrix: misclassification loss matrix for rpart's parms.
        prior_prob: if truthy, a single prior p expanded to [p, 1-p].
        bit_string: feature-selection mask, default '11111' (all five).

    Returns:
        (pred, pred_training): predicted classes for all_data and known_data.

    History:
        11-09-05 1st use known_data to get the fit model;
                 2nd use the fit model to do prediction on all_data,
                 result is prob for each class.
        11-09-05 add rpart_cp.
        11-17-05 add loss_matrix, prior_prob; return two pred.
    """
    sys.stderr.write("rpart fitting and predicting...\n")
    r.library("rpart")
    coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
    formula_list = []
    # Select the features whose bit is set in bit_string.
    for i in range(len(bit_string)):
        if bit_string[i] == '1':
            formula_list.append(coeff_name_list[i])
    #11-17-05 transform into array
    all_data = array(all_data)
    known_data = array(known_data)
    # NOTE: the NO_CONVERSION/BASIC_CONVERSION toggles below are rpy
    # conversion modes; their exact ordering matters (R objects like the
    # data frame and fit must NOT be converted to Python before r.predict).
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \
        "cluster_size":known_data[:,3], "gradient":known_data[:,4], "is_correct":known_data[:,-1]})
    if prior_prob:
        prior_prob = [prior_prob, 1-prior_prob]  #get the full list
        fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
            parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix) ) )
    else:
        fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
            parms=r.list(loss=r.matrix(loss_matrix) ) )
    set_default_mode(BASIC_CONVERSION)
    pred_training = r.predict(fit, data_frame, type=["class"])
    del data_frame
    set_default_mode(NO_CONVERSION)
    all_data_frame = r.as_data_frame({"p_value":all_data[:,0], "recurrence":all_data[:,1], "connectivity":all_data[:,2], \
        "cluster_size":all_data[:,3], "gradient":all_data[:,4], "is_correct":all_data[:,-1]})
    set_default_mode(BASIC_CONVERSION)
    pred = r.predict(fit, all_data_frame, type=["class"])  #11-17-05 type=c("class")
    del all_data_frame
    sys.stderr.write("Done rpart fitting and predicting.\n")
    return pred, pred_training
def prepareData(self, resD, getLabel, expL, flip_axis=False):
    """Collect a label per experiment id and shape the result as an R matrix.

    Ids missing from resD get label 0.  The matrix is self.nb_row rows
    filled row-wise by default; with flip_axis=True it is self.nb_col rows
    filled column-wise instead.
    """
    hitValueL = [getLabel(resD[exp_id]) if exp_id in resD else 0
                 for exp_id in expL]
    if flip_axis:
        hitData = r.matrix(hitValueL, nrow=self.nb_col, byrow=False)
    else:
        hitData = r.matrix(hitValueL, nrow=self.nb_row, byrow=True)
    return hitData
def prepareData(self, resD, getLabel, expL):
    """Collect a label per experiment id and shape the result as an R matrix.

    Ids missing from resD get label 0.  The labels fill a matrix of
    self.nb_row rows, row-wise.
    """
    hitValueL = [getLabel(resD[exp_id]) if exp_id in resD else 0
                 for exp_id in expL]
    return r.matrix(hitValueL, nrow=self.nb_row, byrow=True)
def rpart_fit(self, known_data, parameter_list, bit_string="11111"):
    """Fit an R rpart classification tree on known_data and return the fit.

    Columns 0-4 of known_data are the features in coeff_name_list; the last
    column is the is_correct response.  bit_string selects which features
    enter the formula (a '1' at position i includes coeff_name_list[i]).

    Parameters:
        known_data: training rows (feature columns + is_correct).
        parameter_list: [rpart_cp, loss_matrix, prior_prob] -- the rpart
            complexity parameter, misclassification loss matrix, and (if
            truthy) a single prior p expanded to [p, 1-p].
        bit_string: feature-selection mask, default '11111' (all five).

    Returns:
        The (unconverted) R rpart fit object, suitable for rpart_predict.

    History:
        11-09-05 1st use known_data to get the fit model;
                 2nd use the fit model to do prediction on all_data,
                 result is prob for each class.
        11-09-05 add rpart_cp.
        11-17-05 add loss_matrix, prior_prob; return two pred.
        11-23-05 split fit and predict. rpart_fit_and_predict() is split
                 into rpart_fit() and rpart_predict().
        11-27-05 r cleanup.
        03-17-06 use parameter_list instead.
    """
    if self.debug:
        sys.stderr.write("Doing rpart_fit...\n")
    # 03-17-06
    rpart_cp, loss_matrix, prior_prob = parameter_list
    # 11-27-05 r cleanup
    from rpy import r

    r.library("rpart")
    coeff_name_list = ["p_value", "recurrence", "connectivity", "cluster_size", "gradient"]
    formula_list = []
    # Select the features whose bit is set in bit_string.
    for i in range(len(bit_string)):
        if bit_string[i] == "1":
            formula_list.append(coeff_name_list[i])
    # 11-17-05 transform into array
    known_data = array(known_data)
    # NO_CONVERSION keeps the data frame (and the fit below) as live R
    # objects rather than converting them to Python structures.
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame(
        {
            "p_value": known_data[:, 0],
            "recurrence": known_data[:, 1],
            "connectivity": known_data[:, 2],
            "cluster_size": known_data[:, 3],
            "gradient": known_data[:, 4],
            "is_correct": known_data[:, -1],
        }
    )
    if prior_prob:
        prior_prob = [prior_prob, 1 - prior_prob]  # get the full list
        fit = r.rpart(
            r("is_correct~%s" % "+".join(formula_list)),
            data=data_frame,
            method="class",
            control=r.rpart_control(cp=rpart_cp),
            parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix)),
        )
    else:
        fit = r.rpart(
            r("is_correct~%s" % "+".join(formula_list)),
            data=data_frame,
            method="class",
            control=r.rpart_control(cp=rpart_cp),
            parms=r.list(loss=r.matrix(loss_matrix)),
        )
    del data_frame
    if self.debug:
        sys.stderr.write("Done rpart_fit.\n")
    return fit
def MK_chi_pvalue(win_snp, win_div, AR_snp, AR_div):
    """Chi-square test p-value for a 2x2 McDonald-Kreitman count table.

    The four counts fill a 2-row R matrix column-wise before calling R's
    chisq.test.
    """
    table = r.matrix(r.c([win_snp, win_div, AR_snp, AR_div]), nr=2)
    result = r.chisq_test(table)
    return result['p.value']