def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix, prior_prob, bit_string='11111'):
    """
    Fit an rpart classification tree on known_data, then predict with it.

    11-09-05
        1st use known_data to get the fit model
        2nd use the fit model to do prediction on all_data
    11-09-05 add rpart_cp
    11-17-05
        add loss_matrix, prior_prob
        return two pred

    Arguments:
        all_data, known_data: row sequences (converted with array()). Columns
            0-4 are read as p_value, recurrence, connectivity, cluster_size
            and gradient; the last column is read as is_correct.
        rpart_cp: passed to rpart.control as cp.
        loss_matrix: passed through r.matrix() as the 'loss' entry of parms.
        prior_prob: if true-ish, [prior_prob, 1-prior_prob] is passed as the
            'prior' entry of parms; otherwise no prior is given.
        bit_string: character i == '1' puts predictor i (in the column order
            above) on the right-hand side of the model formula.

    Returns (pred, pred_training): the results of r.predict(..., type=["class"])
    on all_data and on known_data respectively.

    The rpy default conversion mode is left at BASIC_CONVERSION on exit, also
    when an R call made under NO_CONVERSION raises.
    """
    def build_data_frame(rows):
        # Call only while the default mode is NO_CONVERSION, as the original
        # code did for both frames.
        return r.as_data_frame({"p_value":rows[:,0], "recurrence":rows[:,1], "connectivity":rows[:,2], \
            "cluster_size":rows[:,3], "gradient":rows[:,4], "is_correct":rows[:,-1]})

    sys.stderr.write("rpart fitting and predicting...\n")
    r.library("rpart")
    coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
    # Only positions flagged '1' are looked up, exactly as before.
    formula_list = [coeff_name_list[i] for i, bit in enumerate(bit_string) if bit == '1']
    #11-17-05 transform into array
    all_data = array(all_data)
    known_data = array(known_data)

    set_default_mode(NO_CONVERSION)
    try:
        data_frame = build_data_frame(known_data)
        if prior_prob:
            prior_prob = [prior_prob, 1-prior_prob]    #get the full list
            fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
                parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix) ) )
        else:
            fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
                parms=r.list(loss=r.matrix(loss_matrix) ) )
    finally:
        # Previously a failing R call left the process-wide mode stuck at
        # NO_CONVERSION; restore the same mode the success path ends in.
        set_default_mode(BASIC_CONVERSION)
    pred_training = r.predict(fit, data_frame, type=["class"])
    del data_frame

    set_default_mode(NO_CONVERSION)
    try:
        all_data_frame = build_data_frame(all_data)
    finally:
        set_default_mode(BASIC_CONVERSION)
    pred = r.predict(fit, all_data_frame, type=["class"])    #11-17-05 type=c("class")
    del all_data_frame

    sys.stderr.write("Done rpart fitting and predicting.\n")
    return pred, pred_training
def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3):
    """
    Fit one independent regression per tag and store its coefficients.

    For each tag in tag_list, self.y[tag] is regressed on self.X[tag] with
    R's lm (self.regtype == "Independent Linear") or a logit glm
    (self.regtype == "Independent Logistic"). The R summary is passed to
    self._record_regression_stats, and row i of its "coefficients" table is
    stored for self.sorted_sources[tag][i] in self.beta[tag] as a dict with
    keys "beta", "stderr", "tstat", "pval" and, when it can be computed,
    "-log10(pval)".

    If row i cannot be indexed (IndexError), a warning is logged. Then either
    the whole tag is removed via self._remove_tag (when
    remove_tags_when_bad_regression is true) and gets no self.beta entry, or
    just that source is skipped.

    At most n_times_show_summary summaries are printed via
    self._print_regression_summary, to keep the output manageable.

    Raises ValueError if self.regtype is neither independent type. Errors
    from the R fit propagate to the caller; the rpy conversion mode is
    restored to BASIC_CONVERSION first.
    """
    times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output.
    SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"]
    for tag in tag_list:
        self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false
        model = "y~X-1" # Use -1 because X has an intercept already
        rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj.
        try:
            data = rc.list(y=self.y[tag],X=self.X[tag])
            if self.regtype=="Independent Linear":
                # The old bare except here dropped into pdb and then hit an
                # unbound 'result'; let the real error surface instead.
                result = rc.lm(model,data=data)
            elif self.regtype=="Independent Logistic":
                result = rc.glm(model,family=rc.binomial("logit"),data=data)
            else:
                raise ValueError("Unsupported regtype for independent regression: %r" % (self.regtype,))
        finally:
            # Return to normal conversion mode even if the fit failed, so
            # later R calls are not left returning raw Robj values.
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
        summary = rc.summary(result,correlation=rc.TRUE)
        self._record_regression_stats(tag, summary)
        beta_dict = dict()
        sorted_sources = self.sorted_sources[tag]
        coeff_matrix = summary["coefficients"]
        for i, source in enumerate(sorted_sources):
            try:
                cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:]))
            except IndexError:
                util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag)
                if remove_tags_when_bad_regression:
                    self._remove_tag(tag)
                    # Leave the per-source loop only; the per-tag loop is
                    # skipped below once the tag is no longer in self.features.
                    break
                continue
            try:
                cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10)
            except (OverflowError, ValueError):
                # log(0) raises ValueError on current Pythons (OverflowError
                # on some old ones); leave the key out for a zero p-value.
                pass
            beta_dict[source] = cur_source_dict
        if tag not in self.features: # We've removed this tag a few lines above, so skip it.
            continue
        self.beta[tag] = beta_dict
        if times_showed_summary < n_times_show_summary:
            self._print_regression_summary(tag, summary)
            times_showed_summary += 1
def _mcmc_betas_same_sources(self, tag_list):
    """
    Fit one hierarchical (bayesm) regression across all tags in tag_list.

    The given tag_list contains tags that all have the same features
    available. For each tag, self.X[tag] and self.y[tag] are bundled into an
    R list. The sampler is chosen by self.regtype:
        "Hierarchical Linear"   -> rhierLinearModel
        "Hierarchical Logistic" -> rhierBinLogit
        "Hierarchical Mixture"  -> rhierLinearMixture (with ncomp=self.ncomp)
    and is run for self.mcmc_reps draws. When self.production_run is false,
    self.mcmc_reps is overwritten with 75 to save time.

    On success, the 'betadraw' output is averaged over its third axis and
    row k is stored for the k-th tag as
    self.beta[tag] = {source: {'beta': coeff}}, with sources taken from
    self.sorted_sources[tag]; self.stats[tag] is set to an empty dict.

    If the sampler raises, self._info_about_r_error(tag_list) is called and
    the method returns without touching self.beta / self.stats.
    Raises ValueError if self.regtype is not one of the three types above.

    NOTE(review): the earlier docstring said training uses only the songs in
    self.only_these_songs; this method never reads that attribute, so any such
    filtering must already be reflected in self.X / self.y -- confirm.
    """
    if not self.production_run:
        self.mcmc_reps = 75 # save time
    rc.library("bayesm")
    data = [rc.list(X=self.X[tag],y=self.y[tag]) for tag in tag_list]
    rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that the R calls below return Robj.
    try:
        data = rc.list(*data)
        if self.regtype in ["Hierarchical Linear", "Hierarchical Mixture"]:
            Data = rc.list(regdata=data)
        elif self.regtype=="Hierarchical Logistic":
            Data = rc.list(lgtdata=data)
        else:
            raise ValueError("Unsupported regtype for hierarchical regression: %r" % (self.regtype,))
        if self.regtype=="Hierarchical Mixture":
            Prior = rc.list(ncomp=self.ncomp)
        Mcmc=rc.list(R=self.mcmc_reps)
    finally:
        # Restore normal conversion even if building the R inputs failed.
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
    try:
        if self.regtype=="Hierarchical Linear":
            output = rc.rhierLinearModel(Data=Data,Mcmc=Mcmc)
        elif self.regtype=="Hierarchical Logistic":
            output = rc.rhierBinLogit(Data=Data,Mcmc=Mcmc)
        elif self.regtype=="Hierarchical Mixture":
            output = rc.rhierLinearMixture(Data=Data,Prior=Prior,Mcmc=Mcmc)
    except Exception:
        # Was a bare except, which also swallowed KeyboardInterrupt and
        # SystemExit raised during the (long) sampler run.
        self._info_about_r_error(tag_list)
        return
    beta_matrix = output['betadraw'].mean(axis=2) # nregressions x ncoeffs, averaged along third dim
    for matrix_index, tag in enumerate(tag_list):
        cur_tag_beta_vec = beta_matrix[matrix_index,:]
        beta_dict_list = [{'beta': coeff} for coeff in cur_tag_beta_vec]
        self.beta[tag] = dict(zip(self.sorted_sources[tag],beta_dict_list))
        self.stats[tag] = dict() # I'm not currently storing any stats for hierarchical regressions.
def rpart_fit(self, known_data, parameter_list, bit_string="11111"):
    """
    Fit an rpart classification tree on known_data and return the fit object.

    History:
        11-09-05 first version (fit + predict in one function), add rpart_cp
        11-17-05 add loss_matrix, prior_prob
        11-23-05 rpart_fit_and_predict() split into rpart_fit() and rpart_predict()
        11-27-05 r cleanup (import r locally)
        03-17-06 use parameter_list instead of separate arguments

    Arguments:
        known_data: row sequence (converted with array()). Columns 0-4 are
            read as p_value, recurrence, connectivity, cluster_size and
            gradient; the last column is read as is_correct.
        parameter_list: unpacked as (rpart_cp, loss_matrix, prior_prob).
            rpart_cp goes to rpart.control as cp, loss_matrix goes through
            r.matrix() as the 'loss' entry of parms, and a true-ish
            prior_prob adds [prior_prob, 1 - prior_prob] as the 'prior' entry.
        bit_string: character i == "1" puts predictor i (in the column order
            above) on the right-hand side of the model formula.

    Returns whatever r.rpart returns under NO_CONVERSION. Note that the rpy
    default mode is set to NO_CONVERSION here and is not changed back.
    Progress messages go to stderr only when self.debug is true.
    """
    if self.debug:
        sys.stderr.write("Doing rpart_fit...\n")
    # 03-17-06: the three tuning values arrive bundled together
    rpart_cp, loss_matrix, prior_prob = parameter_list
    # 11-27-05 r cleanup
    from rpy import r

    r.library("rpart")
    predictor_names = ("p_value", "recurrence", "connectivity", "cluster_size", "gradient")
    # A name is looked up only for positions whose flag is "1".
    chosen = [predictor_names[pos] for pos, flag in enumerate(bit_string) if flag == "1"]

    # 11-17-05 transform into array so columns can be sliced
    rows = array(known_data)
    set_default_mode(NO_CONVERSION)
    training_frame = r.as_data_frame(
        {
            "p_value": rows[:, 0],
            "recurrence": rows[:, 1],
            "connectivity": rows[:, 2],
            "cluster_size": rows[:, 3],
            "gradient": rows[:, 4],
            "is_correct": rows[:, -1],
        }
    )

    full_prior = None
    if prior_prob:
        full_prior = [prior_prob, 1 - prior_prob]  # get the full list

    formula = r("is_correct~%s" % "+".join(chosen))
    control = r.rpart_control(cp=rpart_cp)
    loss = r.matrix(loss_matrix)
    if full_prior is None:
        parms = r.list(loss=loss)
    else:
        parms = r.list(prior=full_prior, loss=loss)
    fit = r.rpart(formula, data=training_frame, method="class", control=control, parms=parms)

    del training_frame
    if self.debug:
        sys.stderr.write("Done rpart_fit.\n")
    return fit