Esempio n. 1
0
    def inputNodePrepare(self, snp_info=None):
        """
        Load every input the run needs and pickle it for the other nodes.

        2009-2-16
            get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
        2009-2-11
            refactored out of run()

        snp_info is not inspected here; it is only stored in the PassingData
        objects given to generate_params() and pickled inside other_data.

        Side effect: sets self.phenotype_index_ls.

        Returns a PassingData with snpData_pickle, other_data_pickle,
        output_node_data_pickle (cPickle strings, protocol -1) and params_ls.
        """
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        # category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header,
                          strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix,
                          turn_into_array=1)

        # Binary mode + context manager: pickles must be read as bytes and the
        # handle is closed even if loading fails.
        # NOTE(review): unpickling runs code from the file; only feed trusted files.
        with open(self.snps_context_fname, 'rb') as picklef:
            snps_context_wrapper = cPickle.load(picklef)
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        del snps_context_wrapper  # not needed once the gene->SNP map is built
        # sorted() also works when keys() returns a view instead of a list
        gene_id_ls = sorted(gene_id2snps_id_ls.keys())

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)
        # re-order the phenotype rows by the row ids of the SNP matrix
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
        phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(
            phenData)  #2009-2-16

        self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set(self.phenotype_method_id_ls))

        # nothing selected => fall back to every phenotype column
        if not self.phenotype_index_ls:
            self.phenotype_index_ls = list(range(len(phenData.col_id_ls)))

        pdata = PassingData(gene_id_ls=gene_id_ls,
                            gene_id2snps_id_ls=gene_id2snps_id_ls,
                            phenotype_index_ls=self.phenotype_index_ls,
                            snp_info=snp_info)
        params_ls = self.generate_params(self.gene_id_fname, pdata,
                                         self.block_size)

        # gene_id_ls is read back from pdata, after generate_params() has seen it
        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                 gene_id_ls=pdata.gene_id_ls,
                                 phenData=phenData,
                                 phenotype_index_ls=self.phenotype_index_ls,
                                 snp_info=snp_info)
        other_data_pickle = cPickle.dumps(other_data, -1)
        del other_data

        output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                       phenotype_index_ls=self.phenotype_index_ls)
        output_node_data_pickle = cPickle.dumps(output_node_data, -1)

        snpData_pickle = cPickle.dumps(snpData, -1)
        # drop the unpickled copies; only the pickled strings are returned
        del snpData, data_matrix
        return_data = PassingData(snpData_pickle=snpData_pickle,
                                  other_data_pickle=other_data_pickle,
                                  output_node_data_pickle=output_node_data_pickle,
                                  params_ls=params_ls)
        return return_data
Esempio n. 2
0
	def inputNodePrepare(self, snp_info=None):
		"""
		Load every input the run needs and pickle it for the other nodes.

		2009-2-16
			get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
		2009-2-11
			refactored out of run()

		snp_info is not inspected here; it is only stored in the PassingData
		objects given to generate_params() and pickled inside other_data.

		Side effect: sets self.phenotype_index_ls.

		Returns a PassingData with snpData_pickle, other_data_pickle,
		output_node_data_pickle (cPickle strings, protocol -1) and params_ls.
		"""
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
		#category_list is not used to facilitate row-id matching
		snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
						data_matrix=data_matrix, turn_into_array=1)
		
		# Binary mode + context manager: pickles must be read as bytes and the
		# handle is closed even if loading fails.
		# NOTE(review): unpickling runs code from the file; only feed trusted files.
		with open(self.snps_context_fname, 'rb') as picklef:
			snps_context_wrapper = cPickle.load(picklef)
		gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
		del snps_context_wrapper	#not needed once the gene->SNP map is built
		# sorted() also works when keys() returns a view instead of a list
		gene_id_ls = sorted(gene_id2snps_id_ls.keys())
		
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
		# re-order the phenotype rows by the row ids of the SNP matrix
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
		phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)	#2009-2-16
		
		self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(self.phenotype_method_id_ls))
		
		# nothing selected => fall back to every phenotype column
		if not self.phenotype_index_ls:
			self.phenotype_index_ls = list(range(len(phenData.col_id_ls)))
		
		pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
						phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
		params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)
		
		# gene_id_ls is read back from pdata, after generate_params() has seen it
		other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls, phenData=phenData,
								phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
		other_data_pickle = cPickle.dumps(other_data, -1)
		del other_data
		
		output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
								phenotype_index_ls=self.phenotype_index_ls)
		output_node_data_pickle = cPickle.dumps(output_node_data, -1)
		
		snpData_pickle = cPickle.dumps(snpData, -1)
		# drop the unpickled copies; only the pickled strings are returned
		del snpData, data_matrix
		return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
								output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
		return return_data
Esempio n. 3
0
	def KWOnBooleanSNP(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2, phenotype_index, phenotype_ls, \
					min_data_point=3):
		"""
		Kruskal-Wallis test on every boolean merge of two SNPs.

		2009-2-8
			refactor out of computing_node_handler()

		For each bool_type in bool_type2merge_oper the two genotype lists are
		merged with that operator matrix and tested against phenotype_ls.
		Returns a list of PassingData (ids, pvalue, count1, count2, bool_type,
		phenotype_index), one per merge whose test result is true-ish.
		"""
		result_ls = []
		for bool_type in bool_type2merge_oper:
			merged_genotype_ls = cls.mergeTwoGenotypeLs(
				genotype_ls1, genotype_ls2, bool_type2merge_oper[bool_type])
			kw_result = Kruskal_Wallis._kruskal_wallis(merged_genotype_ls, phenotype_ls, min_data_point)
			if not kw_result:
				# no usable test result for this boolean type
				continue
			result_ls.append(PassingData(
				snp1_id=snp1_id,
				gene1_id=gene1_id,
				snp2_id=snp2_id,
				gene2_id=gene2_id,
				pvalue=kw_result.pvalue,
				count1=kw_result.count_ls[0],
				count2=kw_result.count_ls[1],
				bool_type=bool_type,
				phenotype_index=phenotype_index))
		return result_ls
Esempio n. 4
0
    def KWOnBooleanSNP(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2, phenotype_index, phenotype_ls, \
        min_data_point=3):
        """
        Kruskal-Wallis test on every boolean merge of two SNPs.

        2009-2-8
            refactor out of computing_node_handler()

        For each bool_type in bool_type2merge_oper the two genotype lists are
        merged with that operator matrix and tested against phenotype_ls.
        Returns a list of PassingData (ids, pvalue, count1, count2, bool_type,
        phenotype_index), one per merge whose test result is true-ish.
        """
        result_ls = []
        for bool_type in bool_type2merge_oper:
            merged_genotype_ls = cls.mergeTwoGenotypeLs(
                genotype_ls1, genotype_ls2, bool_type2merge_oper[bool_type])
            kw_result = Kruskal_Wallis._kruskal_wallis(
                merged_genotype_ls, phenotype_ls, min_data_point)
            if not kw_result:
                # no usable test result for this boolean type
                continue
            result_ls.append(PassingData(
                snp1_id=snp1_id,
                gene1_id=gene1_id,
                snp2_id=snp2_id,
                gene2_id=gene2_id,
                pvalue=kw_result.pvalue,
                count1=kw_result.count_ls[0],
                count2=kw_result.count_ls[1],
                bool_type=bool_type,
                phenotype_index=phenotype_index))
        return result_ls
Esempio n. 5
0
	def run(self):
		"""
		Scatter-plot one phenotype against another and save the figure.

		Reads the phenotype matrix from self.phenotype_fname, keeps the rows in
		which both phenotypes (self.phenotype_method_id1 and
		self.phenotype_method_id2) are not NaN, and writes
		<output_fname_prefix>.png (dpi=400) and <output_fname_prefix>.svg.
		A green diagonal is drawn across the value range shared by both
		phenotypes; it is skipped, with a warning on stderr, when no row has
		both values.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		# NOTE(review): unused below; kept in case reading db.session has a side effect -- confirm
		session = db.session
		
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		#row labels come from the phenotype file itself here
		phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)	#tricky, using strain_acc_list_phen
		
		phenotype_col_index1 = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id1]))[0]
		phenotype_col_index2 = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id2]))[0]
		
		# keep only the rows where both phenotypes are available
		x_ls = []
		y_ls = []
		for i in range(phenData.data_matrix.shape[0]):
			row = phenData.data_matrix[i]
			x_value = row[phenotype_col_index1]
			y_value = row[phenotype_col_index2]
			if numpy.isnan(x_value) or numpy.isnan(y_value):
				continue
			x_ls.append(x_value)
			y_ls.append(y_value)
		
		pylab.clf()
		pylab.title('Phenotype Contrast')
		pylab.plot(x_ls, y_ls, '.', alpha=0.6)
		pylab.grid(alpha=0.3)
		phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id1)
		phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id2)
		pylab.xlabel(phenotype_method1.short_name)
		pylab.ylabel(phenotype_method2.short_name)
		
		#draw diagonal line to show perfect correlation
		if x_ls:
			max_min_value = max(min(x_ls), min(y_ls))
			min_max_value = min(max(x_ls), max(y_ls))
			pylab.plot([max_min_value, min_max_value], [max_min_value, min_max_value], c='g', alpha=0.7)
		else:
			# min()/max() raise ValueError on empty lists; still save the (empty) figure
			import sys
			sys.stderr.write("Warning: no row has both phenotype %s and %s; diagonal skipped.\n"%\
				(self.phenotype_method_id1, self.phenotype_method_id2))
		
		png_output_fname = '%s.png'%self.output_fname_prefix
		pylab.savefig(png_output_fname, dpi=400)
		pylab.savefig('%s.svg'%self.output_fname_prefix)
Esempio n. 6
0
	def run(self):
		"""
		2008-09-06
		
		MPI entry point.
		
		Rank layout (from the code below): rank 0 is the input node, the last
		rank (size-1) is the output node, and ranks 1..size-2 are computing
		nodes. Rank 0 prepares the data via inputNodePrepare() and sends the
		pickled pieces to the other ranks; after a synchronize() each rank
		enters its role-specific loop.
		
		With self.debug set, a single-node dry run of the data preparation is
		done under pdb and the process exits with status 2 before any MPI call.
		"""
		if self.debug:
			#for one-node testing purpose
			import pdb
			pdb.set_trace()
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix, turn_into_array=1)	#category_list is not used to facilitate row-id matching
			
			# NOTE(review): file is opened in text mode and only del'ed, never closed explicitly
			picklef = open(self.snps_context_fname)
			snps_context_wrapper = cPickle.load(picklef)
			del picklef
			gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
			gene_id_ls = gene_id2snps_id_ls.keys()
			gene_id_ls.sort()
			
			header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
			# NOTE(review): row labels are strain_acc_list from the SNP file, not strain_acc_list_phen -- confirm intended
			phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list, data_matrix=data_matrix_phen)	#row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
			
			# the pickles are built only to exercise serialization; nothing is sent
			other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=gene_id_ls, phenData=phenData)
			other_data_pickle = cPickle.dumps(other_data, -1)
			phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
			snpData_pickle = cPickle.dumps(snpData, -1)
			sys.exit(2)
		
		self.communicator = MPI.world.duplicate()
		node_rank = self.communicator.rank
		free_computing_nodes = range(1, self.communicator.size-1)	#exclude the 1st and last node
		free_computing_node_set = Set(free_computing_nodes)
		output_node_rank = self.communicator.size-1
		
		# Phase 1: distribute the initial data. The receive order on each rank
		# must match the send order on rank 0.
		if node_rank == 0:
			dstruc = self.inputNodePrepare()
			params_ls = dstruc.params_ls
			#send the output node the phenotype_label_ls
			self.communicator.send(dstruc.output_node_data_pickle, output_node_rank, 0)
			del dstruc.output_node_data_pickle
			
			for node in free_computing_nodes:	#send it to the computing_node
				sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
				# first the SNP data, then the rest; computing nodes receive in this order
				self.communicator.send(dstruc.snpData_pickle, node, 0)
				self.communicator.send(dstruc.other_data_pickle, node, 0)
				sys.stderr.write(".\n")
			del dstruc
			
		elif node_rank in free_computing_node_set:
			# 1st message: pickled snpData; 2nd message: pickled other_data
			data, source, tag = self.communicator.receiveString(0, 0)
			snpData =  cPickle.loads(data)
			del data
			data, source, tag = self.communicator.receiveString(0, 0)
			other_data = cPickle.loads(data)
			del data
			self.phenotype_index_ls = other_data.phenotype_index_ls
		else:
			# output node; despite its name, output_node_data_pickle holds the unpickled object
			data, source, tag = self.communicator.receiveString(0, 0)
			output_node_data_pickle = cPickle.loads(data)
			phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
			self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls
			
		self.synchronize()
		# Phase 2: each rank runs its role.
		if node_rank == 0:
			param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
			self.inputNode(param_obj, free_computing_nodes, param_generator = params_ls)
			#self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
		elif node_rank in free_computing_node_set:
			computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
												gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
												phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
												test_type=self.test_type)
			self.computing_node(computing_parameter_obj, self.computing_node_handler)
		else:
			self.general_output_node(self.output_dir, self.phenotype_index_ls, phenotype_label_ls, free_computing_nodes)
		self.synchronize()	#to avoid some node early exits
Esempio n. 7
0
	def run(self):
		"""
		Build the strain PCA / map figure for one phenotype and save it.
		
		Steps, as done below: connect to the database, load the SNP matrix,
		obtain PCA input (either from the eigen-vector/eigen-value files or
		from a possibly down-sampled SNP matrix), load and re-order the
		phenotype matrix, create the matplotlib axes, call self.drawStrainPCA(),
		save <output_fname_prefix>.png and .svg, and finally call
		self.plotLatLonPhenVsPC().
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		# NOTE(review): session is not used anywhere below
		session = db.session
		
		snpData = SNPData(input_fname=self.input_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1)
		
		
		if self.eigen_vector_fname and self.eigen_value_fname:
			# pre-computed PCA: explained variance is each eigenvalue over their sum
			eigen_value_ls = self.getEigenValueFromFile(self.eigen_value_fname)
			eigen_value_ls = numpy.array(eigen_value_ls)
			explained_var = eigen_value_ls/numpy.sum(eigen_value_ls)
			PC_data = self.getPCFromFile(self.eigen_vector_fname)
			PC_matrix = PC_data.PC_matrix
		else:
			# no pre-computed PCA: pass None for explained_var and T further down
			# (presumably getStrainID2PCAPosInfo then computes the PCA itself -- TODO confirm)
			max_no_of_snps = 10000
			if len(snpData.col_id_ls)>max_no_of_snps:	#2008-12-01 randomly pick max_no_of_snps SNPs
				picked_col_index_ls = random.sample(range(len(snpData.col_id_ls)), max_no_of_snps)
				new_col_id_ls = [snpData.col_id_ls[i] for i in picked_col_index_ls]
				newSnpData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=new_col_id_ls, strain_acc_list=snpData.strain_acc_list,\
								category_list=snpData.category_list)
				newSnpData.data_matrix = snpData.data_matrix[:, picked_col_index_ls]
				snpData = newSnpData
		
			# NOTE(review): allele_index2allele_ls is not used afterwards
			snpData, allele_index2allele_ls = snpData.convertSNPAllele2Index()
			explained_var = None
			PC_matrix = None
		
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)	#row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)	#tricky, using strain_acc_list_phen
		
		# [0]: take the first matching column; raises IndexError if none is returned
		phenotype_col_index = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]
		
		
		ecotype_info = getEcotypeInfo(db, self.country_order_type)
		
		#the offset below decides where the label of strains/snps should start in axe_snp_matrix
		#2008-11-14 only for PlotGroupOfSNPs.py. you can set it to 1 cuz we dont' draw axe_snp_matrix here.
		snp_id_label_y_offset = 0.95
		StrainID2PCAPosInfo = self.getStrainID2PCAPosInfo(snpData, pca_range=[0,1], snp_id_label_y_offset=snp_id_label_y_offset, explained_var=explained_var, T=PC_matrix)
		
		# figure layout constants, passed to pylab.axes() as [left, bottom, width, height]
		axe_y_offset1 = 0.03
		axe_height1 = 0.45	#height of axe_chromosome, twice height of axe_map_phenotype_legend
		axe_y_offset2 = axe_y_offset1+axe_height1
		axe_height2 = 0.5	#height of axe_strain_pca, axe_snp_matrix, axe_map
		axe_y_offset3 = axe_y_offset2+axe_height2
		
		axe_x_offset1 = 0.05
		axe_width1 = 0.8	#width of axe_strain_pca
		axe_x_offset2 = axe_x_offset1 + 0.02 + axe_width1
		axe_width2 = 0.05	#width of axe_chromosome, axe_snp_matrix, axe_snp_pca
		axe_x_offset3 = axe_x_offset2 + axe_width2
		axe_width3 = 0.02	#width of axe_phenotype
		
		phenotype_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id)
		
		# colour scale: jet colormap over [min - gap/10, max + gap/10] of the chosen phenotype column
		phenotype_cmap = mpl.cm.jet
		max_phenotype = numpy.nanmax(phenData.data_matrix[:,phenotype_col_index])	#nanmax ignores the nan elements
		min_phenotype = numpy.nanmin(phenData.data_matrix[:,phenotype_col_index])	#nanmin ignores the nan elements
		phenotype_gap = max_phenotype - min_phenotype
		phenotype_jitter = phenotype_gap/10.
		phenotype_norm = mpl.colors.Normalize(vmin=min_phenotype-phenotype_jitter, vmax=max_phenotype+phenotype_jitter)
		axe_map_phenotype_legend = pylab.axes([axe_x_offset2, axe_y_offset1, axe_width2, 0.3], frameon=False)
		cb = mpl.colorbar.ColorbarBase(axe_map_phenotype_legend, cmap=phenotype_cmap,
									norm=phenotype_norm,
									orientation='vertical')
		cb.set_label('Legend Of Phenotype %s %s'%(phenotype_method.id, phenotype_method.short_name))
		
		# map axes on top, PCA axes below, plus a cover axes spanning both
		axe_strain_map = pylab.axes([axe_x_offset1, axe_y_offset2, axe_width1, axe_height2], frameon=False)
		axe_strain_pca = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1], frameon=False)
		axe_strain_map_pca_cover = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1+axe_height2], frameon=False, \
											sharex=axe_strain_pca)	#cover both axe_strain_map and axe_strain_pca
		axe_strain_map_pca_cover.set_yticks([])
		axe_strain_pca_xlim = [-0.05,1.05]
		axe_strain_pca_ylim = [0, 1.05]
		axe_strain_pca.set_xlim(axe_strain_pca_xlim)
		axe_strain_pca.set_ylim(axe_strain_pca_ylim)
		axe_strain_map_pca_cover_ylim = [0, (axe_height1+axe_height2)/axe_height1]	#set it accordingly
		axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)
				
		axe_strain_pca.grid(True, alpha=0.3)
		axe_strain_pca.set_xticks([])
		axe_strain_pca.set_yticks([])
		axe_strain_pca_legend = None	#no pca legend
		self.drawStrainPCA(axe_strain_pca, axe_strain_map, axe_strain_map_pca_cover, axe_strain_pca_legend, StrainID2PCAPosInfo, \
						ecotype_info, phenData, \
					phenotype_col_index, phenotype_cmap, phenotype_norm, rightmost_x_value=axe_strain_pca_xlim[1],\
					strain_color_type=2, pca2map_line_color=None, ecotype_width_on_map=10,\
					draw_lines_to_axe_snp_matrix = False, strain_size_on_axe_strain_pca=14, pic_area=self.pic_area,\
					map_pca_line_alpha=0.2, map_pca_linewidth=0.2)	#customize a couple of things
		
		# limits are applied a second time after drawing
		# (presumably because drawing can change them -- TODO confirm)
		axe_strain_pca.set_xlim(axe_strain_pca_xlim)
		axe_strain_pca.set_ylim(axe_strain_pca_ylim)
		axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)
		
		png_output_fname = '%s.png'%self.output_fname_prefix
		pylab.savefig(png_output_fname, dpi=400)
		pylab.savefig('%s.svg'%self.output_fname_prefix)
		
		self.plotLatLonPhenVsPC(ecotype_info, StrainID2PCAPosInfo, phenData, phenotype_col_index, phenotype_cmap, phenotype_norm, 
						self.output_fname_prefix, commit=self.commit)
Esempio n. 8
0
    def run(self):
        """
        2008-12-02

        Convert a SNP matrix into <prefix>.geno, <prefix>.ind and <prefix>.snp
        files, plus <prefix>_<phenotype label>.pheno when both
        self.phenotype_fname and self.phenotype_method_id are set.

        Output written by this method:
          .pheno  one value per line; NaN becomes 9 (binary phenotype) or
                  -100.0 (otherwise); binary values are cast to int
          .geno   one line per SNP, one character per accession:
                  allele 0 -> '0', allele 1 -> '2', anything else -> '9'
          .ind    tab-delimited: accession id, 'U', 'Case'
          .snp    tab-delimited: snp id, chromosome, 0.0, position, allele1, allele2

        All output files are closed on exit, also when an error occurs.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, delimiter=delimiter)

        if self.array_id_2nd_column:
            snpData = SNPData(header=header,
                              strain_acc_list=strain_acc_list,
                              category_list=category_list,
                              data_matrix=data_matrix)
        else:
            # ignore category_list
            snpData = SNPData(header=header,
                              strain_acc_list=strain_acc_list,
                              data_matrix=data_matrix)

        newSnpData, allele_index2allele_ls = snpData.convert2Binary(
            self.report)

        if self.phenotype_fname and self.phenotype_method_id:
            header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
                self.phenotype_fname, turn_into_integer=0)
            # row label is that of the SNP matrix, because the phenotype matrix
            # is going to be re-ordered in that way
            phenData = SNPData(header=header_phen,
                               strain_acc_list=newSnpData.strain_acc_list,
                               data_matrix=data_matrix_phen)
            phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
                newSnpData.row_id_ls, strain_acc_list_phen,
                phenData.data_matrix)  #tricky, using strain_acc_list_phen

            phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
                phenData, Set([self.phenotype_method_id]))[0]
            phenotype_label = phenData.col_id_ls[phenotype_col_index]
            # '/' in the label would be read as a path separator
            phenotype_fname = '%s_%s.pheno' % (
                self.output_fname_prefix, phenotype_label.replace('/', '_'))
            with open(phenotype_fname, 'w') as phenotype_f:
                for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                    if numpy.isnan(phenotype_value):
                        # binary and non-binary have different NA designator
                        if self.phenotype_is_binary:
                            phenotype_value = 9
                        else:
                            phenotype_value = -100.0
                    elif self.phenotype_is_binary:
                        phenotype_value = int(phenotype_value)
                    phenotype_f.write('%s\n' % phenotype_value)

        # nested single-item 'with' blocks guarantee all three files get closed
        with open('%s.geno' % self.output_fname_prefix, 'w') as genotype_f:
            with open('%s.ind' % self.output_fname_prefix, 'w') as ind_f:
                with open('%s.snp' % self.output_fname_prefix, 'w') as snp_f:
                    ind_writer = csv.writer(ind_f, delimiter='\t')
                    snp_writer = csv.writer(snp_f, delimiter='\t')

                    #transpose it
                    newSnpData = transposeSNPData(newSnpData)

                    no_of_rows = len(newSnpData.data_matrix)
                    # an empty matrix has no first row to measure
                    if no_of_rows:
                        no_of_cols = len(newSnpData.data_matrix[0])
                    else:
                        no_of_cols = 0

                    # write out the accessions once (only when there is at least one SNP row)
                    if no_of_rows:
                        for j in range(no_of_cols):
                            ind_writer.writerow(
                                [newSnpData.col_id_ls[j], 'U', 'Case'])

                    for i in range(no_of_rows):
                        snp_id = newSnpData.row_id_ls[i]
                        chromosome, pos = snp_id.split('_')
                        allele1 = allele_index2allele_ls[i][0]  #major allele
                        allele2 = allele_index2allele_ls[i][1]  #minor allele
                        snp_writer.writerow(
                            [snp_id, chromosome, 0.0, pos, allele1, allele2])

                        row = newSnpData.data_matrix[i]
                        geno_code_ls = []
                        for j in range(no_of_cols):
                            allele = row[j]
                            if allele == 0:
                                geno_code_ls.append('0')
                            elif allele == 1:
                                geno_code_ls.append('2')
                            else:
                                geno_code_ls.append('9')
                        genotype_f.write(''.join(geno_code_ls) + '\n')
Esempio n. 9
0
	def run(self):
		"""
		2008-12-02
		
		Convert a SNP matrix into <prefix>.geno, <prefix>.ind and <prefix>.snp
		files, plus <prefix>_<phenotype label>.pheno when both
		self.phenotype_fname and self.phenotype_method_id are set.
		
		Output written by this method:
		  .pheno  one value per line; NaN becomes 9 (binary phenotype) or
		          -100.0 (otherwise); binary values are cast to int
		  .geno   one line per SNP, one character per accession:
		          allele 0 -> '0', allele 1 -> '2', anything else -> '9'
		  .ind    tab-delimited: accession id, 'U', 'Case'
		  .snp    tab-delimited: snp id, chromosome, 0.0, position, allele1, allele2
		
		All output files are closed on exit, also when an error occurs.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
		
		if self.array_id_2nd_column:
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
							data_matrix=data_matrix)
		else:
			#ignore category_list
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
							data_matrix=data_matrix)
		
		newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)
		
		if self.phenotype_fname and self.phenotype_method_id:
			header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
			#row label is that of the SNP matrix, because the phenotype matrix is going to be re-ordered in that way
			phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list, data_matrix=data_matrix_phen)
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)	#tricky, using strain_acc_list_phen
			
			phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]
			phenotype_label = phenData.col_id_ls[phenotype_col_index]
			# '/' in the label would be read as a path separator
			phenotype_fname = '%s_%s.pheno'%(self.output_fname_prefix, phenotype_label.replace('/', '_'))
			with open(phenotype_fname, 'w') as phenotype_f:
				for phenotype_value in phenData.data_matrix[:,phenotype_col_index]:
					if numpy.isnan(phenotype_value):
						#binary and non-binary have different NA designator
						if self.phenotype_is_binary:
							phenotype_value = 9
						else:
							phenotype_value = -100.0
					elif self.phenotype_is_binary:
						phenotype_value = int(phenotype_value)
					phenotype_f.write('%s\n'%phenotype_value)
		
		# nested single-item 'with' blocks guarantee all three files get closed
		with open('%s.geno'%self.output_fname_prefix, 'w') as genotype_f:
			with open('%s.ind'%self.output_fname_prefix, 'w') as ind_f:
				with open('%s.snp'%self.output_fname_prefix, 'w') as snp_f:
					ind_writer = csv.writer(ind_f, delimiter='\t')
					snp_writer = csv.writer(snp_f, delimiter='\t')
					
					#transpose it
					newSnpData = transposeSNPData(newSnpData)
					
					no_of_rows = len(newSnpData.data_matrix)
					# an empty matrix has no first row to measure
					if no_of_rows:
						no_of_cols = len(newSnpData.data_matrix[0])
					else:
						no_of_cols = 0
					
					#write out the accessions once (only when there is at least one SNP row)
					if no_of_rows:
						for j in range(no_of_cols):
							ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])
					
					for i in range(no_of_rows):
						snp_id = newSnpData.row_id_ls[i]
						chromosome, pos = snp_id.split('_')
						allele1 = allele_index2allele_ls[i][0]	#major allele
						allele2 = allele_index2allele_ls[i][1]	#minor allele
						snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])
						
						row = newSnpData.data_matrix[i]
						geno_code_ls = []
						for j in range(no_of_cols):
							allele = row[j]
							if allele==0:
								geno_code_ls.append('0')
							elif allele==1:
								geno_code_ls.append('2')
							else:
								geno_code_ls.append('9')
						genotype_f.write(''.join(geno_code_ls) + '\n')
Esempio n. 10
0
    def run(self):
        """
        2008-09-06

        MPI entry point.

        Rank layout (from the code below): rank 0 is the input node, the last
        rank (size-1) is the output node, and ranks 1..size-2 are computing
        nodes. Rank 0 prepares the data via inputNodePrepare() and sends the
        pickled pieces to the other ranks; after a synchronize() each rank
        enters its role-specific loop.

        With self.debug set, a single-node dry run of the data preparation is
        done under pdb and the process exits with status 2 before any MPI call.
        """
        if self.debug:
            #for one-node testing purpose
            import pdb
            pdb.set_trace()
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix, turn_into_array=1) #category_list is not used to facilitate row-id matching

            # NOTE(review): file is opened in text mode and only del'ed, never closed explicitly
            picklef = open(self.snps_context_fname)
            snps_context_wrapper = cPickle.load(picklef)
            del picklef
            gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(
                snps_context_wrapper)
            gene_id_ls = gene_id2snps_id_ls.keys()
            gene_id_ls.sort()

            header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
                self.phenotype_fname, turn_into_integer=0)
            # NOTE(review): row labels are strain_acc_list from the SNP file, not strain_acc_list_phen -- confirm intended
            phenData = SNPData(
                header=header_phen,
                strain_acc_list=strain_acc_list,
                data_matrix=data_matrix_phen
            )  #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
            phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
                snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)

            # the pickles are built only to exercise serialization; nothing is sent
            other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                     gene_id_ls=gene_id_ls,
                                     phenData=phenData)
            other_data_pickle = cPickle.dumps(other_data, -1)
            phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
            snpData_pickle = cPickle.dumps(snpData, -1)
            sys.exit(2)

        self.communicator = MPI.world.duplicate()
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size -
                                     1)  #exclude the 1st and last node
        free_computing_node_set = Set(free_computing_nodes)
        output_node_rank = self.communicator.size - 1

        # Phase 1: distribute the initial data. The receive order on each rank
        # must match the send order on rank 0.
        if node_rank == 0:
            dstruc = self.inputNodePrepare()
            params_ls = dstruc.params_ls
            #send the output node the phenotype_label_ls
            self.communicator.send(dstruc.output_node_data_pickle,
                                   output_node_rank, 0)
            del dstruc.output_node_data_pickle

            for node in free_computing_nodes:  #send it to the computing_node
                sys.stderr.write(
                    "passing initial data to nodes from %s to %s ... " %
                    (node_rank, node))
                # first the SNP data, then the rest; computing nodes receive in this order
                self.communicator.send(dstruc.snpData_pickle, node, 0)
                self.communicator.send(dstruc.other_data_pickle, node, 0)
                sys.stderr.write(".\n")
            del dstruc

        elif node_rank in free_computing_node_set:
            # 1st message: pickled snpData; 2nd message: pickled other_data
            data, source, tag = self.communicator.receiveString(0, 0)
            snpData = cPickle.loads(data)
            del data
            data, source, tag = self.communicator.receiveString(0, 0)
            other_data = cPickle.loads(data)
            del data
            self.phenotype_index_ls = other_data.phenotype_index_ls
        else:
            # output node; despite its name, output_node_data_pickle holds the unpickled object
            data, source, tag = self.communicator.receiveString(0, 0)
            output_node_data_pickle = cPickle.loads(data)
            phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
            self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls

        self.synchronize()
        # Phase 2: each rank runs its role.
        if node_rank == 0:
            param_obj = PassingData(params_ls=params_ls,
                                    output_node_rank=output_node_rank,
                                    report=self.report,
                                    counter=0)
            self.inputNode(param_obj,
                           free_computing_nodes,
                           param_generator=params_ls)
            #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
        elif node_rank in free_computing_node_set:
            computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
                     gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
                     phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
                     test_type=self.test_type)
            self.computing_node(computing_parameter_obj,
                                self.computing_node_handler)
        else:
            self.general_output_node(self.output_dir, self.phenotype_index_ls,
                                     phenotype_label_ls, free_computing_nodes)
        self.synchronize()  #to avoid some node early exits
Esempio n. 11
0
    def run(self):
        """
        Scatter-plot one phenotype against another and save the figure.

        Reads the phenotype matrix from self.phenotype_fname, keeps the rows in
        which both phenotypes (self.phenotype_method_id1 and
        self.phenotype_method_id2) are not NaN, and writes
        <output_fname_prefix>.png (dpi=400) and <output_fname_prefix>.svg.
        A green diagonal is drawn across the value range shared by both
        phenotypes; it is skipped, with a warning on stderr, when no row has
        both values.
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.db_user,
                                       password=self.db_passwd,
                                       hostname=self.hostname,
                                       database=self.dbname,
                                       schema=self.schema)
        db.setup(create_tables=False)
        # NOTE(review): unused below; kept in case reading db.session has a side effect -- confirm
        session = db.session

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # row labels come from the phenotype file itself here
        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            phenData.row_id_ls, strain_acc_list_phen,
            phenData.data_matrix)  #tricky, using strain_acc_list_phen

        phenotype_col_index1 = self.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id1]))[0]
        phenotype_col_index2 = self.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id2]))[0]

        # keep only the rows where both phenotypes are available
        x_ls = []
        y_ls = []
        for i in range(phenData.data_matrix.shape[0]):
            row = phenData.data_matrix[i]
            x_value = row[phenotype_col_index1]
            y_value = row[phenotype_col_index2]
            if numpy.isnan(x_value) or numpy.isnan(y_value):
                continue
            x_ls.append(x_value)
            y_ls.append(y_value)

        pylab.clf()
        pylab.title('Phenotype Contrast')
        pylab.plot(x_ls, y_ls, '.', alpha=0.6)
        pylab.grid(alpha=0.3)
        phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(
            self.phenotype_method_id1)
        phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(
            self.phenotype_method_id2)
        pylab.xlabel(phenotype_method1.short_name)
        pylab.ylabel(phenotype_method2.short_name)

        #draw diagonal line to show perfect correlation
        if x_ls:
            max_min_value = max(min(x_ls), min(y_ls))
            min_max_value = min(max(x_ls), max(y_ls))
            pylab.plot([max_min_value, min_max_value],
                       [max_min_value, min_max_value],
                       c='g',
                       alpha=0.7)
        else:
            # min()/max() raise ValueError on empty lists; still save the (empty) figure
            import sys
            sys.stderr.write(
                "Warning: no row has both phenotype %s and %s; diagonal skipped.\n" %
                (self.phenotype_method_id1, self.phenotype_method_id2))

        png_output_fname = '%s.png' % self.output_fname_prefix
        pylab.savefig(png_output_fname, dpi=400)
        pylab.savefig('%s.svg' % self.output_fname_prefix)