Ejemplo n.º 1
0
	def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix, prior_prob, bit_string='11111'):
		"""
		Fit an rpart classification tree on known_data, then predict classes for
		both known_data and all_data.

		Arguments:
			all_data -- 2D data to predict on; columns 0-4 are read as p_value,
				recurrence, connectivity, cluster_size, gradient and the last
				column as is_correct
			known_data -- 2D training data with the same column layout
			rpart_cp -- passed to rpart.control(cp=...)
			loss_matrix -- passed through r.matrix() as the 'loss' entry of parms
			prior_prob -- if truthy, [prior_prob, 1-prior_prob] is passed as the
				'prior' entry of parms; otherwise no prior is given
			bit_string -- position i is '1' to include the i-th predictor (in the
				column order above) in the formula

		Returns (pred, pred_training): the results of R's predict(fit, ...,
		type="class") on all_data and on known_data respectively, converted
		under BASIC_CONVERSION.

		History:
			11-09-05 fit on known_data, predict on all_data; add rpart_cp
			11-17-05 add loss_matrix, prior_prob; return two predictions
		"""
		sys.stderr.write("rpart fitting and predicting...\n")
		r.library("rpart")
		coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
		# Keep only the predictors whose bit is switched on.
		formula_list = [coeff_name_list[i] for i, bit in enumerate(bit_string) if bit == '1']
		#11-17-05 transform into array
		all_data = array(all_data)
		known_data = array(known_data)

		# The conversion mode is global rpy state, so make sure it is put back to
		# BASIC_CONVERSION even when one of the R calls below raises.
		try:
			# Build the frame and the model without conversion so they stay R objects.
			set_default_mode(NO_CONVERSION)
			data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2],
				"cluster_size":known_data[:,3], "gradient":known_data[:,4], "is_correct":known_data[:,-1]})
			parms = {"loss": r.matrix(loss_matrix)}
			if prior_prob:
				parms["prior"] = [prior_prob, 1-prior_prob]	#get the full list
			fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class",
				control=r.rpart_control(cp=rpart_cp), parms=r.list(**parms))

			# Predictions are wanted as Python values, so switch conversion back on.
			set_default_mode(BASIC_CONVERSION)
			pred_training = r.predict(fit, data_frame, type=["class"])
			del data_frame

			set_default_mode(NO_CONVERSION)
			all_data_frame = r.as_data_frame({"p_value":all_data[:,0], "recurrence":all_data[:,1], "connectivity":all_data[:,2],
				"cluster_size":all_data[:,3], "gradient":all_data[:,4], "is_correct":all_data[:,-1]})
			set_default_mode(BASIC_CONVERSION)
			pred = r.predict(fit, all_data_frame, type=["class"])	#11-17-05 type=c("class")
			del all_data_frame
		finally:
			set_default_mode(BASIC_CONVERSION)
		sys.stderr.write("Done rpart fitting and predicting.\n")
		return pred, pred_training
 def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3):
     """
     Fit one independent regression per tag in R and store the coefficient
     statistics for each source in self.beta[tag].

     For every tag the model "y~X-1" is fit on self.y[tag] / self.X[tag]:
     rc.lm when self.regtype is "Independent Linear", rc.glm with a
     binomial("logit") family when it is "Independent Logistic". Rows of the
     summary's "coefficients" matrix are paired, in order, with
     self.sorted_sources[tag]; each source gets a dict with the keys
     "beta", "stderr", "tstat", "pval" and, when it can be computed,
     "-log10(pval)".

     tag_list -- tags to fit.
     remove_tags_when_bad_regression -- when true, a tag whose coefficient
         matrix cannot be indexed for every source (IndexError) is removed
         with self._remove_tag and gets no entry in self.beta; when false the
         missing sources are simply skipped.
     n_times_show_summary -- at most this many regression summaries are
         printed through self._print_regression_summary.

     Raises ValueError if self.regtype is neither of the two supported values.
     """
     times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output.
     SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"]
     for tag in tag_list:
         self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false
         rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj.
         try:
             data = rc.list(y=self.y[tag],X=self.X[tag])
             model = "y~X-1" # Use -1 because X has an intercept already
             if self.regtype=="Independent Linear":
                 try:
                     result = rc.lm(model,data=data)
                 except Exception:
                     # Keep the interactive debugging hook, but re-raise afterwards:
                     # otherwise `result` would be undefined or, worse, still hold
                     # the previous tag's model.
                     pdb.set_trace()
                     raise
             elif self.regtype=="Independent Logistic":
                 result = rc.glm(model,family=rc.binomial("logit"),data=data)
             else:
                 # Without this, an unexpected regtype would reuse a stale `result`.
                 raise ValueError("Unsupported regtype for independent regression: %r" % (self.regtype,))
         finally:
             # Conversion mode is global rpy state; restore it even when R fails.
             rpy.set_default_mode(rpy.BASIC_CONVERSION) # Return to normal conversion mode.
         summary = rc.summary(result,correlation=rc.TRUE)
         self._record_regression_stats(tag, summary)
         beta_dict = {}
         sorted_sources = self.sorted_sources[tag]
         coeff_matrix = summary["coefficients"]
         for i, source in enumerate(sorted_sources):
             try:
                 cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:]))
             except IndexError:
                 util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag)
                 if remove_tags_when_bad_regression:
                     self._remove_tag(tag)
                     break # break from for-loop over sorted_sources; we don't continue out of the per-tag for loop until later when we check if tag is in self.features....
                 continue
             try:
                 cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10)
             except (OverflowError, ValueError):
                 # Assuming `log` is math.log: a p-value of exactly 0 raises
                 # ValueError (math domain error), so skip the entry in that case too.
                 pass
             beta_dict[source] = cur_source_dict
         if tag not in self.features: # We've removed this tag a few lines above, so skip it.
             continue
         self.beta[tag] = beta_dict
         if times_showed_summary < n_times_show_summary:
             self._print_regression_summary(tag, summary)
             times_showed_summary += 1
 def _mcmc_betas_same_sources(self, tag_list):
     """
     Fit a hierarchical regression over all tags in tag_list with R's bayesm
     package and store the posterior-mean coefficients in self.beta.

     The given tag_list contains tags that all have the same features
     available. One (X, y) pair per tag is taken from self.X / self.y.
     NOTE(review): an earlier version of this docstring said training is
     restricted to self.only_these_songs; no such filtering happens in this
     method, so presumably it is applied when self.X / self.y are built.

     self.regtype selects the sampler: "Hierarchical Linear" ->
     rhierLinearModel, "Hierarchical Logistic" -> rhierBinLogit,
     "Hierarchical Mixture" -> rhierLinearMixture (with ncomp=self.ncomp).

     Side effects: when self.production_run is false, self.mcmc_reps is
     overwritten with 75. For each tag, self.beta[tag] maps every source in
     self.sorted_sources[tag] to {'beta': coefficient} and self.stats[tag] is
     reset to an empty dict.

     If the R sampler raises, self._info_about_r_error(tag_list) is called and
     the method returns None without storing anything.

     Raises ValueError if self.regtype is not one of the three values above.
     """
     if not self.production_run:
         self.mcmc_reps = 75 # save time
     rc.library("bayesm")
     # NOTE(review): as before, the per-tag lists are built before conversion is
     # switched off; confirm that is intended.
     data = [rc.list(X=self.X[tag],y=self.y[tag]) for tag in tag_list]
     rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that the argument lists below stay R objects.
     try:
         data = rc.list(*data)
         if self.regtype in ["Hierarchical Linear", "Hierarchical Mixture"]:
             Data = rc.list(regdata=data)
         elif self.regtype=="Hierarchical Logistic":
             Data = rc.list(lgtdata=data)
         else:
             # Previously this only surfaced later as a NameError on `output`.
             raise ValueError("Unsupported regtype for hierarchical regression: %r" % (self.regtype,))
         if self.regtype=="Hierarchical Mixture":
             Prior = rc.list(ncomp=self.ncomp)
         Mcmc=rc.list(R=self.mcmc_reps)
     finally:
         # Conversion mode is global rpy state; restore it even on failure.
         rpy.set_default_mode(rpy.BASIC_CONVERSION)
     try:
         if self.regtype=="Hierarchical Linear":
             output = rc.rhierLinearModel(Data=Data,Mcmc=Mcmc)
         elif self.regtype=="Hierarchical Logistic":
             output = rc.rhierBinLogit(Data=Data,Mcmc=Mcmc)
         else: # "Hierarchical Mixture" -- regtype was validated above
             output = rc.rhierLinearMixture(Data=Data,Prior=Prior,Mcmc=Mcmc)
     except Exception:
         # Only real errors are reported; KeyboardInterrupt/SystemExit now
         # propagate instead of being logged as an R failure.
         #pdb.set_trace()
         self._info_about_r_error(tag_list)
         return
     beta_matrix = output['betadraw'].mean(axis=2) # nregressions x ncoeffs, averaged along third dim
     for matrix_index, tag in enumerate(tag_list):
         cur_tag_beta_vec = beta_matrix[matrix_index,:]
         beta_dict_list = [{'beta': coeff} for coeff in cur_tag_beta_vec]
         self.beta[tag] = dict(zip(self.sorted_sources[tag],beta_dict_list))
         self.stats[tag] = {} # I'm not currently storing any stats for hierarchical regressions.
Ejemplo n.º 4
0
    def rpart_fit(self, known_data, parameter_list, bit_string="11111"):
        """
        Fit an rpart classification tree on known_data and return the fit.

        known_data -- 2D training data; columns 0-4 are read as p_value,
            recurrence, connectivity, cluster_size, gradient and the last
            column as is_correct.
        parameter_list -- (rpart_cp, loss_matrix, prior_prob). rpart_cp goes to
            rpart.control(cp=...), loss_matrix is wrapped with r.matrix() as
            the 'loss' entry of parms, and a truthy prior_prob is expanded to
            [prior_prob, 1 - prior_prob] as the 'prior' entry.
        bit_string -- position i is "1" to include the i-th predictor (in the
            column order above) in the model formula.

        Returns the object produced by r.rpart() under NO_CONVERSION.
        Note: rpy's default mode is left at NO_CONVERSION when this returns;
        it is not restored here.

        History:
            11-23-05 split out of rpart_fit_and_predict()
            11-27-05 r cleanup
            03-17-06 take parameter_list instead of separate arguments
        """
        if self.debug:
            sys.stderr.write("Doing rpart_fit...\n")
        # 03-17-06
        rpart_cp, loss_matrix, prior_prob = parameter_list

        # 11-27-05 r cleanup
        from rpy import r

        r.library("rpart")

        predictor_names = ("p_value", "recurrence", "connectivity", "cluster_size", "gradient")
        # Predictors whose bit is switched on, in column order.
        chosen = [predictor_names[pos] for pos, flag in enumerate(bit_string) if flag == "1"]
        # 11-17-05 transform into array
        training = array(known_data)

        set_default_mode(NO_CONVERSION)
        # One named column per predictor, plus the label taken from the last column.
        columns = {}
        for col, col_name in enumerate(predictor_names):
            columns[col_name] = training[:, col]
        columns["is_correct"] = training[:, -1]
        training_frame = r.as_data_frame(columns)

        tree_formula = r("is_correct~%s" % "+".join(chosen))
        tree_control = r.rpart_control(cp=rpart_cp)
        if prior_prob:
            # get the full list
            tree_parms = r.list(prior=[prior_prob, 1 - prior_prob], loss=r.matrix(loss_matrix))
        else:
            tree_parms = r.list(loss=r.matrix(loss_matrix))
        fit = r.rpart(
            tree_formula,
            data=training_frame,
            method="class",
            control=tree_control,
            parms=tree_parms,
        )
        del training_frame
        if self.debug:
            sys.stderr.write("Done rpart_fit.\n")
        return fit