def readInData(cls, phenotype_fname, input_fname, eigen_vector_fname, phenotype_method_id_ls, test_type=1, report=0):
    """
    Read the SNP matrix, the phenotype matrix and (optionally) principal
    components, and bundle them into a PassingData object.

    Arguments:
        phenotype_fname: phenotype file, read via read_data(turn_into_integer=0).
        input_fname: SNP data file, read via read_data().
        eigen_vector_fname: if given, PCs are loaded from it via getPCFromFile().
        phenotype_method_id_ls: phenotype method ids to analyse; if empty,
            every phenotype column is selected.
        test_type: when it is 4 and no eigen_vector_fname is given, PCs are
            computed from the SNP matrix with pca_module.PCA_svd().
        report: passed on to SNPData.convertSNPAllele2Index().

    Returns a PassingData with attributes snpData (allele-index-converted),
    phenData, PC_matrix (None if no PCs were loaded or computed),
    which_phenotype_ls and phenotype_method_id_ls.

    2010-2-25
        call removeUnPhenotypedSNPData() to shrink the snp dataset by removing un-phenotyped ecotypes
    2009-3-20
        refactored out of run(), easy for MpiAssociation.py to call
    """
    header, strain_acc_list, category_list, data_matrix = read_data(input_fname)
    snpData = SNPData(
        header=header,
        strain_acc_list=strain_acc_list,
        category_list=category_list,
        data_matrix=data_matrix,
    )
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        phenotype_fname, turn_into_integer=0
    )

    snpData = cls.removeUnPhenotypedSNPData(
        snpData, header_phen, strain_acc_list_phen, data_matrix_phen, phenotype_method_id_ls
    )

    # 0 (NA) or -2 (untouched) is all converted to -2 as 0 is used to denote allele
    newSnpData, allele2index_ls = snpData.convertSNPAllele2Index(report)
    newSnpData.header = snpData.header

    # Order the phenotype rows by the strain list of the (possibly shrunk)
    # snpData rather than by the list originally read from input_fname:
    # phenData below is labelled with snpData.strain_acc_list, so the matrix
    # rows have to follow that same list to stay aligned with their labels.
    data_matrix_phen = cls.get_phenotype_matrix_in_data_matrix_order(
        snpData.strain_acc_list, strain_acc_list_phen, data_matrix_phen
    )
    phenData = SNPData(
        header=header_phen,
        strain_acc_list=snpData.strain_acc_list,
        data_matrix=data_matrix_phen,
    )

    if eigen_vector_fname:
        PC_data = cls.getPCFromFile(eigen_vector_fname)
        PC_matrix = PC_data.PC_matrix
    elif test_type == 4:
        # eigen_vector_fname not given for this test_type. calculate PCs.
        import pca_module
        T, P, explained_var = pca_module.PCA_svd(newSnpData.data_matrix, standardize=False)
        PC_matrix = T
    else:
        PC_matrix = None

    # only the converted copy (newSnpData) is needed from here on
    del snpData

    if phenotype_method_id_ls:
        which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))
    else:
        # if not available, take all phenotypes
        which_phenotype_ls = list(range(len(phenData.col_id_ls)))

    pdata = PassingData(
        snpData=newSnpData,
        phenData=phenData,
        PC_matrix=PC_matrix,
        which_phenotype_ls=which_phenotype_ls,
        phenotype_method_id_ls=phenotype_method_id_ls,
    )
    return pdata
def run(self):
    """
    Draw the strain map + strain PCA figure coloured by one phenotype and
    save it as <output_fname_prefix>.png and <output_fname_prefix>.svg, then
    hand over to plotLatLonPhenVsPC().

    Steps:
        1. connect to the Stock_250k database and load SNP data from
           self.input_fname;
        2. either load eigen values/vectors from the given files, or
           (if either file is missing) randomly down-sample to at most
           10000 SNPs, convert alleles to indices and leave the PCA inputs
           as None for getStrainID2PCAPosInfo();
        3. read the phenotype file and re-order it to the SNP row order;
        4. lay out the axes, draw via drawStrainPCA() and save the figure.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
        schema=self.schema,
    )
    db.setup(create_tables=False)
    session = db.session  # not referenced again in this method

    snpData = SNPData(
        input_fname=self.input_fname,
        turn_into_integer=1,
        turn_into_array=1,
        ignore_2nd_column=1,
    )

    if not (self.eigen_vector_fname and self.eigen_value_fname):
        snp_count_cap = 10000
        if len(snpData.col_id_ls) > snp_count_cap:
            # 2008-12-01 randomly pick snp_count_cap SNPs
            sampled_col_indices = random.sample(range(len(snpData.col_id_ls)), snp_count_cap)
            sampled_col_ids = [snpData.col_id_ls[col_index] for col_index in sampled_col_indices]
            sampledSnpData = SNPData(
                row_id_ls=snpData.row_id_ls,
                col_id_ls=sampled_col_ids,
                strain_acc_list=snpData.strain_acc_list,
                category_list=snpData.category_list,
            )
            sampledSnpData.data_matrix = snpData.data_matrix[:, sampled_col_indices]
            snpData = sampledSnpData
        # NOTE(review): placed at else-level (runs whether or not sampling
        # happened); the collapsed source does not show the nesting - confirm.
        snpData, _allele_index2allele_ls = snpData.convertSNPAllele2Index()
        explained_var = None
        PC_matrix = None
    else:
        eigen_values = numpy.array(self.getEigenValueFromFile(self.eigen_value_fname))
        explained_var = eigen_values / numpy.sum(eigen_values)
        PC_matrix = self.getPCFromFile(self.eigen_vector_fname).PC_matrix

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0
    )
    # row label is that of the SNP matrix, because the phenotype matrix is
    # going to be re-ordered that way right below
    phenData = SNPData(
        header=header_phen,
        strain_acc_list=snpData.strain_acc_list,
        data_matrix=data_matrix_phen,
    )
    # tricky: the lookup side uses strain_acc_list_phen
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix
    )

    matching_columns = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))
    phenotype_col_index = matching_columns[0]

    ecotype_info = getEcotypeInfo(db, self.country_order_type)

    # The offset decides where the label of strains/snps should start in
    # axe_snp_matrix. 2008-11-14 only for PlotGroupOfSNPs.py; it could be 1
    # because axe_snp_matrix is not drawn here.
    StrainID2PCAPosInfo = self.getStrainID2PCAPosInfo(
        snpData,
        pca_range=[0, 1],
        snp_id_label_y_offset=0.95,
        explained_var=explained_var,
        T=PC_matrix,
    )

    # ---- figure layout (fractions of the figure) ----
    pca_bottom = 0.03
    pca_height = 0.45
    map_bottom = pca_bottom + pca_height
    map_height = 0.5  # height of axe_strain_map
    main_left = 0.05
    main_width = 0.8  # width of axe_strain_pca / axe_strain_map
    legend_left = main_left + 0.02 + main_width
    legend_width = 0.05

    phenotype_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id)

    # ---- phenotype colour scale ----
    phenotype_cmap = mpl.cm.jet
    phenotype_values = phenData.data_matrix[:, phenotype_col_index]
    max_phenotype = numpy.nanmax(phenotype_values)  # nanmax ignores the nan elements
    min_phenotype = numpy.nanmin(phenotype_values)  # nanmin ignores the nan elements
    phenotype_jitter = (max_phenotype - min_phenotype) / 10.
    phenotype_norm = mpl.colors.Normalize(
        vmin=min_phenotype - phenotype_jitter,
        vmax=max_phenotype + phenotype_jitter,
    )

    axe_map_phenotype_legend = pylab.axes([legend_left, pca_bottom, legend_width, 0.3], frameon=False)
    cb = mpl.colorbar.ColorbarBase(
        axe_map_phenotype_legend,
        cmap=phenotype_cmap,
        norm=phenotype_norm,
        orientation='vertical',
    )
    cb.set_label('Legend Of Phenotype %s %s' % (phenotype_method.id, phenotype_method.short_name))

    # ---- map / pca axes ----
    axe_strain_map = pylab.axes([main_left, map_bottom, main_width, map_height], frameon=False)
    axe_strain_pca = pylab.axes([main_left, pca_bottom, main_width, pca_height], frameon=False)
    # cover both axe_strain_map and axe_strain_pca
    axe_strain_map_pca_cover = pylab.axes(
        [main_left, pca_bottom, main_width, pca_height + map_height],
        frameon=False,
        sharex=axe_strain_pca,
    )
    axe_strain_map_pca_cover.set_yticks([])

    pca_xlim = [-0.05, 1.05]
    pca_ylim = [0, 1.05]
    axe_strain_pca.set_xlim(pca_xlim)
    axe_strain_pca.set_ylim(pca_ylim)
    # scaled so the pca part of the cover matches axe_strain_pca
    cover_ylim = [0, (pca_height + map_height) / pca_height]
    axe_strain_map_pca_cover.set_ylim(cover_ylim)

    axe_strain_pca.grid(True, alpha=0.3)
    axe_strain_pca.set_xticks([])
    axe_strain_pca.set_yticks([])
    axe_strain_pca_legend = None  # no pca legend

    # customize a couple of things
    self.drawStrainPCA(
        axe_strain_pca,
        axe_strain_map,
        axe_strain_map_pca_cover,
        axe_strain_pca_legend,
        StrainID2PCAPosInfo,
        ecotype_info,
        phenData,
        phenotype_col_index,
        phenotype_cmap,
        phenotype_norm,
        rightmost_x_value=pca_xlim[1],
        strain_color_type=2,
        pca2map_line_color=None,
        ecotype_width_on_map=10,
        draw_lines_to_axe_snp_matrix=False,
        strain_size_on_axe_strain_pca=14,
        pic_area=self.pic_area,
        map_pca_line_alpha=0.2,
        map_pca_linewidth=0.2,
    )

    # apply the limits again after drawing
    axe_strain_pca.set_xlim(pca_xlim)
    axe_strain_pca.set_ylim(pca_ylim)
    axe_strain_map_pca_cover.set_ylim(cover_ylim)

    pylab.savefig('%s.png' % self.output_fname_prefix, dpi=400)
    pylab.savefig('%s.svg' % self.output_fname_prefix)

    self.plotLatLonPhenVsPC(
        ecotype_info,
        StrainID2PCAPosInfo,
        phenData,
        phenotype_col_index,
        phenotype_cmap,
        phenotype_norm,
        self.output_fname_prefix,
        commit=self.commit,
    )