def run(self):
    """
    2009-2-2
        Read principal components and eigenvalues from the two input files and
        hand them to putPCDataIntoDB() / putEigenValuesIntoDB().

        A session transaction is begun before anything is read; it is flushed and
        committed only when self.commit is true.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # gather the connection settings first, then open the database in one call
    connection_settings = dict(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
        schema=self.schema,
    )
    db = Stock_250kDB.Stock_250kDB(**connection_settings)
    db.setup(create_tables=False)

    session = db.session
    session.begin()

    PC_data = Association.getPCFromFile(self.eigen_vector_fname)
    raw_eigen_values = Association.getEigenValueFromFile(self.eigen_value_fname)

    # fraction of the eigenvalue total carried by each eigenvalue
    eigen_values = numpy.array(raw_eigen_values)
    eigen_value_total = numpy.sum(eigen_values)
    explained_var = eigen_values / eigen_value_total

    self.putPCDataIntoDB(db, PC_data, self.call_method_id)
    self.putEigenValuesIntoDB(db, eigen_values, explained_var, self.call_method_id)

    if not self.commit:
        return
    session.flush()
    session.commit()
def make_association_to_part(part_no, filename):
    """
    Build the Association object for a given part number.

        part_no == 1 -> Association(SentenceAssociationStrategy, filename)
        part_no == 2 -> Association(WindowAssociationStrategy, filename, arg=2)
        anything else -> Association(DependencyEdgeAssocaition, filename)

    The strategy names are looked up only in the branch that uses them.
    """
    if part_no == 1:
        return Association(SentenceAssociationStrategy, filename)
    if part_no == 2:
        return Association(WindowAssociationStrategy, filename, arg=2)
    # NOTE(review): "DependencyEdgeAssocaition" is spelled this way at the
    # point of use; presumably it matches the definition elsewhere -- verify.
    return Association(DependencyEdgeAssocaition, filename)
def IntLMOnTwoSNPs(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2,
                   phenotype_index, phenotype_ls, min_data_point=3):
    """
    2009-2-8
        interaction detection linear model
            y = b + SNP1xSNP2 + SNP1 + SNP2 + e

        The predictors passed to Association.linear_model() are three columns, in this
        order: the element-wise product of the two genotype columns, SNP1, SNP2.
        Per the original author's note, the pvalue returned by linear_model() is the
        one for the 1st (interaction) term.

        Returns a list: empty when linear_model() yields a false value, otherwise
        holding a single PassingData built from that result.
    """
    # turn each genotype sequence into an (n, 1) column
    snp1_column = numpy.resize(genotype_ls1, [len(genotype_ls1), 1])
    snp2_column = numpy.resize(genotype_ls2, [len(genotype_ls2), 1])
    interaction_column = snp1_column * snp2_column

    # interaction variable is the 1st position
    predictor_matrix = numpy.hstack((interaction_column, snp1_column, snp2_column))

    lm_result = Association.linear_model(predictor_matrix, phenotype_ls, min_data_point,
                                         snp_index=snp1_id + snp2_id)
    if not lm_result:
        return []

    interaction_data = PassingData(
        snp1_id=snp1_id,
        gene1_id=gene1_id,
        snp2_id=snp2_id,
        gene2_id=gene2_id,
        pvalue=lm_result.pvalue,
        count1=lm_result.count_ls[0],
        count2=lm_result.count_ls[1],
        bool_type=None,
        phenotype_index=phenotype_index,
        var_perc=lm_result.var_perc,
        coeff_list=lm_result.coeff_list,
        coeff_p_value_list=lm_result.coeff_p_value_list,
    )
    return [interaction_data]
def registerCustomExecutables(self, workflow=None):
    """
    2012.6.5
        Register the Association, Results2DB_250k and OutputPhenotype scripts
        (all located under self.variationSrcPath) as executables, after letting
        AbstractVariationWorkflow register its own.
    """
    AbstractVariationWorkflow.registerCustomExecutables(self, workflow=workflow)

    namespace = self.namespace
    version = self.version
    operatingSystem = self.operatingSystem
    architecture = self.architecture
    clusters_size = self.clusters_size
    site_handler = self.site_handler

    # (executable name, script path relative to self.variationSrcPath)
    script_table = (
        ("Association", "association/Association.py"),
        ("Results2DB_250k", "db/Results2DB_250k.py"),
        ("OutputPhenotype", "db/output/OutputPhenotype.py"),
    )

    # 2012.8.7 each cell is a tuple of (executable, clusterSizeMultiplier);
    # per the original note, 0 means no clustering is needed
    executableClusterSizeMultiplierList = []
    for executable_name, relative_path in script_table:
        executable = Executable(namespace=namespace, name=executable_name, version=version,
                                os=operatingSystem, arch=architecture, installed=True)
        script_url = "file://" + os.path.join(self.variationSrcPath, relative_path)
        executable.addPFN(PFN(script_url, site_handler))
        executableClusterSizeMultiplierList.append((executable, 0))

    self.addExecutableAndAssignProperClusterSize(executableClusterSizeMultiplierList,
                                                 defaultClustersSize=clusters_size)
def IntLMOnTwoSNPs(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2,
                   phenotype_index, phenotype_ls, min_data_point=3):
    """
    2009-2-8
        interaction detection linear model
            y = b + SNP1xSNP2 + SNP1 + SNP2 + e

        Three predictor columns are given to Association.linear_model(), ordered as
        (SNP1 * SNP2, SNP1, SNP2). Per the original author's note, the pvalue that
        linear_model() returns directly belongs to the 1st (interaction) term.

        Returns a list with one PassingData when linear_model() produced a true
        value, else an empty list.
    """
    # reshape both genotype sequences into single-column matrices
    first_snp = numpy.resize(genotype_ls1, [len(genotype_ls1), 1])
    second_snp = numpy.resize(genotype_ls2, [len(genotype_ls2), 1])

    # interaction variable is the 1st position
    columns = (first_snp * second_snp, first_snp, second_snp)
    genotype_matrix = numpy.hstack(columns)

    fit = Association.linear_model(genotype_matrix, phenotype_ls, min_data_point,
                                   snp_index=snp1_id + snp2_id)
    if not fit:
        return []

    return [
        PassingData(
            snp1_id=snp1_id,
            gene1_id=gene1_id,
            snp2_id=snp2_id,
            gene2_id=gene2_id,
            pvalue=fit.pvalue,
            count1=fit.count_ls[0],
            count2=fit.count_ls[1],
            bool_type=None,
            phenotype_index=phenotype_index,
            var_perc=fit.var_perc,
            coeff_list=fit.coeff_list,
            coeff_p_value_list=fit.coeff_p_value_list,
        )
    ]
def subplotLatLonPhenVsPC(self, ecotype_info, StrainID2PCAPosInfo, phenData, phenotype_col_index,
                          phenotype_cmap, phenotype_norm, which_figure=0, sub_title='',
                          ydata='latitude', which_PC_index=0, no_of_rows=2, dot_size=10, alpha=0.7):
    """
    2008-12-08
        one single subplot in plotLatLonPhenVsPC()

        Scatter-plots one point per strain (coloured by phenotype), then fits
        Association.pure_linear_model(x, y) on the plotted points and overlays the
        fitted line. The x-axis label reports the regression result.

        ecotype_info: object whose ecotype_id2ecotype_obj maps int ecotype id -> ecotype object
        StrainID2PCAPosInfo: provides strain_id_ls, strain_id2pca_x, strain_id2pca_y
        phenData: provides row_id2row_index and data_matrix
        phenotype_col_index: column of phenData.data_matrix used for the colour
        phenotype_cmap, phenotype_norm: colour = phenotype_cmap(phenotype_norm(phenotype))
        which_figure: subplot index handed to pylab.subplot(no_of_rows, 2, which_figure)
            NOTE(review): matplotlib subplot indices start at 1; the default of 0 is
            presumably always overridden by the caller -- confirm.
        sub_title: not used by this method
        ydata: 'phenotype', or the name of an attribute of the ecotype object (e.g. 'latitude')
        which_PC_index: selects the x value:
            1 -> strain_id2pca_x, 0 -> strain_id2pca_y,
            'longitude' / 'latitude' -> that attribute of the ecotype object
        dot_size, alpha: marker size and transparency

        Strains are skipped when the ecotype is unknown (a warning goes to stderr) or
        when the y value is None/NaN. If no strain is left, a warning is written and
        the regression is skipped.

        Raises ValueError for an unsupported which_PC_index.
    """
    # Resolve the x-axis label once, up front. An unsupported selector used to
    # surface only inside the loop as an unbound local variable.
    if which_PC_index == 1 or which_PC_index == 0:
        xlabel = 'PC%s' % (which_PC_index + 1)
    elif which_PC_index == 'longitude' or which_PC_index == 'latitude':
        xlabel = which_PC_index
    else:
        raise ValueError("Unsupported which_PC_index: %r (expected 0, 1, 'longitude' or 'latitude')" %
                         (which_PC_index,))

    ax = pylab.subplot(no_of_rows, 2, which_figure, frameon=False)
    pylab.ylabel(ydata)
    pylab.grid(True, alpha=0.3)

    x_ls = []
    y_ls = []
    for strain_id in StrainID2PCAPosInfo.strain_id_ls:
        ecotype_id = int(strain_id)
        ecotype_obj = ecotype_info.ecotype_id2ecotype_obj.get(ecotype_id)
        if not ecotype_obj:
            sys.stderr.write("Warning: Ecotype %s not in ecotype_info (fetched from stock db).\n" % ecotype_id)
            continue
        lat, lon = ecotype_obj.latitude, ecotype_obj.longitude
        y_value = getattr(ecotype_obj, ydata, None)

        if which_PC_index == 1:
            x_value = StrainID2PCAPosInfo.strain_id2pca_x[strain_id]
        elif which_PC_index == 0:
            x_value = StrainID2PCAPosInfo.strain_id2pca_y[strain_id]
        elif which_PC_index == 'longitude':
            x_value = lon
        else:   # 'latitude' (validated above)
            x_value = lat

        # strain color according to phenotype
        phenotype_row_index = phenData.row_id2row_index[strain_id]
        phenotype = phenData.data_matrix[phenotype_row_index][phenotype_col_index]
        strain_fc = phenotype_cmap(phenotype_norm(phenotype))
        phenotype_missing = numpy.isnan(phenotype)
        if phenotype_missing:
            # no phenotype: white face, fully transparent
            linewidth = 0.5
            strain_fc = 'w'
            _alpha = 0
        else:
            linewidth = 0
            _alpha = alpha

        if ydata == 'phenotype':
            if phenotype_missing:   # can't do regression or plot
                continue
            y_value = phenotype
        if y_value is None or numpy.isnan(y_value):
            continue

        pylab.scatter([x_value], [y_value], s=dot_size, linewidth=linewidth, facecolor=strain_fc,
                      alpha=_alpha, zorder=10)
        x_ls.append(x_value)
        y_ls.append(y_value)

    if not x_ls:
        # nothing was plotted, so there is nothing to regress on (min()/max() below would fail)
        sys.stderr.write("Warning: no data point available for %s vs %s. Regression skipped.\n" %
                         (ydata, xlabel))
        return

    # remember the limits chosen for the scatter so the trend line does not rescale the axes
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    lm_result = Association.pure_linear_model(x_ls, y_ls)
    try:
        # the label shows -log10(pvalue); log10 rejects 0/negative (ValueError) and None (TypeError)
        pvalue = '%.2f' % -math.log10(lm_result.pvalue)
    except (TypeError, ValueError):
        pvalue = '%s' % lm_result.pvalue
    intercept = lm_result.coeff_list[0]
    beta = lm_result.coeff_list[1]

    # draw a line showing the trend
    x_lm_ls = [min(x_ls), max(x_ls)]
    y_lm_ls = [intercept + beta * x for x in x_lm_ls]
    ax.plot(x_lm_ls, y_lm_ls, alpha=0.5)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    pylab.xlabel('%s, pvalue=%s, beta=%.3f' % (xlabel, pvalue, beta))
def __init__(self, **keywords):
    """
    2009-3-20
        Constructor that forwards every keyword argument, unchanged, to
        Association.__init__().
    """
    Association.__init__(self, **keywords)