Beispiel #1
0
    def readInData(
        cls, phenotype_fname, input_fname, eigen_vector_fname, phenotype_method_id_ls, test_type=1, report=0
    ):
        """
        Load the SNP matrix, the phenotype matrix and (optionally) the principal components.

        Arguments:
            phenotype_fname: phenotype file, read through read_data() with turn_into_integer=0.
            input_fname: SNP file, read through read_data().
            eigen_vector_fname: when non-empty, the PC matrix is taken from this file via getPCFromFile().
            phenotype_method_id_ls: phenotype method ids of interest; when empty, every phenotype
                column of the phenotype matrix is selected.
            test_type: if it is 4 and no eigen vector file is given, PCs are computed from the SNP
                matrix with pca_module.PCA_svd(); for any other value the PC matrix is None.
            report: handed to SNPData.convertSNPAllele2Index().

        Returns:
            a PassingData carrying snpData (the allele-index version), phenData, PC_matrix,
            which_phenotype_ls and phenotype_method_id_ls.

        History:
            2010-2-25 call removeUnPhenotypedSNPData() to shrink the snp dataset by removing
                un-phenotyped ecotypes
            2009-3-20 refactored out of run(), easy for MpiAssociation.py to call
        """
        snp_header, snp_strain_ls, snp_category_ls, snp_matrix = read_data(input_fname)
        rawSnpData = SNPData(
            header=snp_header, strain_acc_list=snp_strain_ls, category_list=snp_category_ls, data_matrix=snp_matrix
        )

        phen_header, phen_strain_ls, _phen_category_ls, phen_matrix = read_data(phenotype_fname, turn_into_integer=0)
        # drop the ecotypes without phenotype before any further processing
        rawSnpData = cls.removeUnPhenotypedSNPData(
            rawSnpData, phen_header, phen_strain_ls, phen_matrix, phenotype_method_id_ls
        )

        # 0 (NA) or -2 (untouched) is all converted to -2 as 0 is used to denote allele
        indexedSnpData, _allele2index_ls = rawSnpData.convertSNPAllele2Index(report)
        indexedSnpData.header = rawSnpData.header

        # NOTE(review): the phenotype rows are ordered by the strain list read from input_fname
        # *before* removeUnPhenotypedSNPData() ran, whereas phenData is labelled with the strain
        # list of the shrunk SNP data. Confirm the two lists still agree (or whether the shrunk
        # list should be the ordering key here).
        phen_matrix = cls.get_phenotype_matrix_in_data_matrix_order(snp_strain_ls, phen_strain_ls, phen_matrix)
        phenData = SNPData(header=phen_header, strain_acc_list=rawSnpData.strain_acc_list, data_matrix=phen_matrix)

        if eigen_vector_fname:
            PC_matrix = cls.getPCFromFile(eigen_vector_fname).PC_matrix
        elif test_type == 4:
            # no eigen vector file for this test type: derive the PCs from the SNP matrix
            import pca_module

            PC_matrix, _P, _explained_var = pca_module.PCA_svd(indexedSnpData.data_matrix, standardize=False)
        else:
            PC_matrix = None

        # only the allele-index version is passed on; drop the local reference to the raw one
        del rawSnpData

        which_phenotype_ls = (
            PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))
            if phenotype_method_id_ls
            else range(len(phenData.col_id_ls))  # no ids given: take all phenotypes
        )
        return PassingData(
            snpData=indexedSnpData,
            phenData=phenData,
            PC_matrix=PC_matrix,
            which_phenotype_ls=which_phenotype_ls,
            phenotype_method_id_ls=phenotype_method_id_ls,
        )
Beispiel #2
0
	def run(self):
		"""
		Build the strain-PCA / strain-map figure for one phenotype and write it to disk.

		Steps, as visible in this method:
			1. connect to the Stock_250kDB database (create_tables=False) and load the SNP matrix
				from self.input_fname;
			2. when both self.eigen_vector_fname and self.eigen_value_fname are set, read the PCs and
				eigenvalues from file; otherwise keep at most 10000 randomly chosen SNP columns and
				convert the alleles to indices;
			3. read the phenotype matrix, re-order its rows to follow the SNP matrix and locate the
				column of self.phenotype_method_id;
			4. lay out the axes, draw the phenotype colorbar and call drawStrainPCA();
			5. save '<self.output_fname_prefix>.png' (dpi=400) and '.svg', then call plotLatLonPhenVsPC().

		Nothing is returned; the results are the files written and whatever the called helpers do.
		"""
		if self.debug:
			# drop into the interactive debugger before anything else happens
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		# NOTE(review): 'session' is not referenced again in the visible code; the attribute access itself
		# may still matter (e.g. lazy session creation) -- confirm before removing.
		session = db.session
		
		snpData = SNPData(input_fname=self.input_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1)
		
		
		if self.eigen_vector_fname and self.eigen_value_fname:
			# pre-computed PCA: both files are required to take this branch
			eigen_value_ls = self.getEigenValueFromFile(self.eigen_value_fname)
			eigen_value_ls = numpy.array(eigen_value_ls)
			# each eigenvalue's share of the eigenvalue sum
			explained_var = eigen_value_ls/numpy.sum(eigen_value_ls)
			PC_data = self.getPCFromFile(self.eigen_vector_fname)
			PC_matrix = PC_data.PC_matrix
		else:
			max_no_of_snps = 10000
			if len(snpData.col_id_ls)>max_no_of_snps:	#2008-12-01 randomly pick max_no_of_snps SNPs
				# random.sample() returns the picked column indices in random (not ascending) order
				picked_col_index_ls = random.sample(range(len(snpData.col_id_ls)), max_no_of_snps)
				new_col_id_ls = [snpData.col_id_ls[i] for i in picked_col_index_ls]
				newSnpData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=new_col_id_ls, strain_acc_list=snpData.strain_acc_list,\
								category_list=snpData.category_list)
				newSnpData.data_matrix = snpData.data_matrix[:, picked_col_index_ls]
				snpData = newSnpData
		
			snpData, allele_index2allele_ls = snpData.convertSNPAllele2Index()
			# nothing pre-computed: both are handed on as None to getStrainID2PCAPosInfo() below
			# (presumably it then runs the PCA itself -- TODO confirm)
			explained_var = None
			PC_matrix = None
		
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)	#row label is that of the SNP matrix, because the phenotype matrix is going to be re-ordered in that way
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)	#tricky, using strain_acc_list_phen
		
		# a one-element set is passed in, and the first element of the result is used as the column index
		phenotype_col_index = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]
		
		
		ecotype_info = getEcotypeInfo(db, self.country_order_type)
		
		#the offset below decides where the label of strains/snps should start in axe_snp_matrix
		#2008-11-14 only for PlotGroupOfSNPs.py. you can set it to 1 because we don't draw axe_snp_matrix here.
		snp_id_label_y_offset = 0.95
		StrainID2PCAPosInfo = self.getStrainID2PCAPosInfo(snpData, pca_range=[0,1], snp_id_label_y_offset=snp_id_label_y_offset, explained_var=explained_var, T=PC_matrix)
		
		# figure layout: offsets/sizes below are passed to pylab.axes() as [left, bottom, width, height]
		# NOTE(review): axe_y_offset3, axe_x_offset3 and axe_width3 are not referenced again in the visible code.
		axe_y_offset1 = 0.03
		axe_height1 = 0.45	#height of axe_chromosome, twice height of axe_map_phenotype_legend
		axe_y_offset2 = axe_y_offset1+axe_height1
		axe_height2 = 0.5	#height of axe_strain_pca, axe_snp_matrix, axe_map
		axe_y_offset3 = axe_y_offset2+axe_height2
		
		axe_x_offset1 = 0.05
		axe_width1 = 0.8	#width of axe_strain_pca
		axe_x_offset2 = axe_x_offset1 + 0.02 + axe_width1
		axe_width2 = 0.05	#width of axe_chromosome, axe_snp_matrix, axe_snp_pca
		axe_x_offset3 = axe_x_offset2 + axe_width2
		axe_width3 = 0.02	#width of axe_phenotype
		
		# looked up only for the id and short_name used in the colorbar label below
		phenotype_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id)
		
		phenotype_cmap = mpl.cm.jet
		max_phenotype = numpy.nanmax(phenData.data_matrix[:,phenotype_col_index])	#nanmax ignores the nan elements
		min_phenotype = numpy.nanmin(phenData.data_matrix[:,phenotype_col_index])	#nanmin ignores the nan elements
		phenotype_gap = max_phenotype - min_phenotype
		# widen the colour normalisation range by a tenth of the observed phenotype range on each side
		phenotype_jitter = phenotype_gap/10.
		phenotype_norm = mpl.colors.Normalize(vmin=min_phenotype-phenotype_jitter, vmax=max_phenotype+phenotype_jitter)
		axe_map_phenotype_legend = pylab.axes([axe_x_offset2, axe_y_offset1, axe_width2, 0.3], frameon=False)
		cb = mpl.colorbar.ColorbarBase(axe_map_phenotype_legend, cmap=phenotype_cmap,
									norm=phenotype_norm,
									orientation='vertical')
		cb.set_label('Legend Of Phenotype %s %s'%(phenotype_method.id, phenotype_method.short_name))
		
		# map axes sit directly above the pca axes (same left edge and width)
		axe_strain_map = pylab.axes([axe_x_offset1, axe_y_offset2, axe_width1, axe_height2], frameon=False)
		axe_strain_pca = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1], frameon=False)
		axe_strain_map_pca_cover = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1+axe_height2], frameon=False, \
											sharex=axe_strain_pca)	#cover both axe_strain_map and axe_strain_pca
		axe_strain_map_pca_cover.set_yticks([])
		axe_strain_pca_xlim = [-0.05,1.05]
		axe_strain_pca_ylim = [0, 1.05]
		axe_strain_pca.set_xlim(axe_strain_pca_xlim)
		axe_strain_pca.set_ylim(axe_strain_pca_ylim)
		# upper y-limit is the ratio of the cover axes' height to axe_strain_pca's height
		axe_strain_map_pca_cover_ylim = [0, (axe_height1+axe_height2)/axe_height1]	#set it accordingly
		axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)
				
		axe_strain_pca.grid(True, alpha=0.3)
		axe_strain_pca.set_xticks([])
		axe_strain_pca.set_yticks([])
		axe_strain_pca_legend = None	#no pca legend
		self.drawStrainPCA(axe_strain_pca, axe_strain_map, axe_strain_map_pca_cover, axe_strain_pca_legend, StrainID2PCAPosInfo, \
						ecotype_info, phenData, \
					phenotype_col_index, phenotype_cmap, phenotype_norm, rightmost_x_value=axe_strain_pca_xlim[1],\
					strain_color_type=2, pca2map_line_color=None, ecotype_width_on_map=10,\
					draw_lines_to_axe_snp_matrix = False, strain_size_on_axe_strain_pca=14, pic_area=self.pic_area,\
					map_pca_line_alpha=0.2, map_pca_linewidth=0.2)	#customize a couple of things
		
		# re-apply the same limits after drawing (presumably drawStrainPCA() can alter them -- TODO confirm)
		axe_strain_pca.set_xlim(axe_strain_pca_xlim)
		axe_strain_pca.set_ylim(axe_strain_pca_ylim)
		axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)
		
		# the figure is saved twice: a 400-dpi png and an svg, both named after self.output_fname_prefix
		png_output_fname = '%s.png'%self.output_fname_prefix
		pylab.savefig(png_output_fname, dpi=400)
		pylab.savefig('%s.svg'%self.output_fname_prefix)
		
		# follow-up plotting helper; it receives the same output prefix and self.commit
		self.plotLatLonPhenVsPC(ecotype_info, StrainID2PCAPosInfo, phenData, phenotype_col_index, phenotype_cmap, phenotype_norm, 
						self.output_fname_prefix, commit=self.commit)