def inputNodePrepare(self, snp_info=None):
    """
    Load the SNP matrix, the SNP context and the phenotype matrix, and pickle
    the pieces that are handed out by the caller.

    2009-2-16
        get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
    2009-2-11
        refactored out of run()

    Side effect:
        sets self.phenotype_index_ls (all phenotype columns if
        findOutWhichPhenotypeColumn() returned nothing).

    Argument:
        snp_info: stored untouched in the PassingData given to
            generate_params() and in the pickled other_data.

    Return a PassingData with these attributes:
        snpData_pickle: cPickle string (highest protocol) of the SNP data
        other_data_pickle: cPickle string of a PassingData holding
            gene_id2snps_id_ls, gene_id_ls, phenData, phenotype_index_ls, snp_info
        output_node_data_pickle: cPickle string of a PassingData holding
            phenotype_label_ls and phenotype_index_ls
        params_ls: return value of self.generate_params()
    """
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    # category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # NOTE: unpickling can execute arbitrary code; snps_context_fname has to
    # come from a trusted source.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        # close explicitly instead of relying on "del" + garbage collection
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper    # not needed anymore, free the memory
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # re-order the phenotype rows into the row order of the SNP matrix
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    # 2009-2-16
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # nothing selected: fall back to every phenotype column
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

    # gene_id_ls is read back from pdata, i.e. after generate_params() has seen it
    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData, phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    del snpData, data_matrix    # only the pickled copy is returned

    return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
                              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
    return return_data
def inputNodePrepare(self, snp_info=None):
    """
    Load the SNP matrix, the SNP context and the phenotype matrix, and pickle
    the pieces that are handed out by the caller.

    2009-2-16
        get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
    2009-2-11
        refactored out of run()

    Side effect:
        sets self.phenotype_index_ls (all phenotype columns if
        findOutWhichPhenotypeColumn() returned nothing).

    Argument:
        snp_info: stored untouched in the PassingData given to
            generate_params() and in the pickled other_data.

    Return a PassingData with these attributes:
        snpData_pickle: cPickle string (highest protocol) of the SNP data
        other_data_pickle: cPickle string of a PassingData holding
            gene_id2snps_id_ls, gene_id_ls, phenData, phenotype_index_ls, snp_info
        output_node_data_pickle: cPickle string of a PassingData holding
            phenotype_label_ls and phenotype_index_ls
        params_ls: return value of self.generate_params()
    """
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    # category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # NOTE: unpickling can execute arbitrary code; snps_context_fname has to
    # come from a trusted source.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        # close explicitly instead of relying on "del" + garbage collection
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper    # not needed anymore, free the memory
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # re-order the phenotype rows into the row order of the SNP matrix
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    # 2009-2-16
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # nothing selected: fall back to every phenotype column
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

    # gene_id_ls is read back from pdata, i.e. after generate_params() has seen it
    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData, phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    del snpData, data_matrix    # only the pickled copy is returned

    return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
                              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
    return return_data
def KWOnBooleanSNP(cls, snp1_id, gene1_id, genotype_ls1, snp2_id, gene2_id, genotype_ls2,
                   phenotype_index, phenotype_ls, min_data_point=3):
    """
    Kruskal-Wallis test on each boolean merge of two SNPs.

    2009-2-8
        refactor out of computing_node_handler()

    For every bool_type in bool_type2merge_oper the two genotype lists are
    merged with cls.mergeTwoGenotypeLs() and tested against phenotype_ls by
    Kruskal_Wallis._kruskal_wallis(). Boolean types for which the test
    returns a false value are left out of the result.

    Return a list of PassingData (snp1_id, gene1_id, snp2_id, gene2_id,
    pvalue, count1, count2, bool_type, phenotype_index).
    """
    result_ls = []
    for bool_type in bool_type2merge_oper:
        merged_genotype_ls = cls.mergeTwoGenotypeLs(genotype_ls1, genotype_ls2,
                                                    bool_type2merge_oper[bool_type])
        kw_result = Kruskal_Wallis._kruskal_wallis(merged_genotype_ls, phenotype_ls,
                                                   min_data_point)
        if not kw_result:
            # no usable test result for this boolean type
            continue
        result_ls.append(PassingData(snp1_id=snp1_id, gene1_id=gene1_id,
                                     snp2_id=snp2_id, gene2_id=gene2_id,
                                     pvalue=kw_result.pvalue,
                                     count1=kw_result.count_ls[0],
                                     count2=kw_result.count_ls[1],
                                     bool_type=bool_type,
                                     phenotype_index=phenotype_index))
    return result_ls
def run(self):
    """
    Scatter-plot one phenotype against another and save the figure.

    The phenotype matrix is read from self.phenotype_fname. The column of
    self.phenotype_method_id1 goes on the x-axis, the column of
    self.phenotype_method_id2 on the y-axis; rows where either value is NaN
    are dropped. Axis labels are the short_name of the two
    Stock_250kDB.PhenotypeMethod entries.

    A green diagonal marks perfect correlation over the value range shared by
    both phenotypes. When no row has both values, the diagonal is skipped
    (with a warning on stderr): min()/max() would raise ValueError on the
    empty lists and no figure would be written at all.

    Output files: '<output_fname_prefix>.png' (dpi=400) and
    '<output_fname_prefix>.svg'.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    import sys  # local import, only needed for the warning below

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    # not used below; the attribute access is kept in case it has a side effect
    session = db.session

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # tricky, using strain_acc_list_phen: the matrix is ordered against its own row ids.
    # The result is used as a 2-D array (.shape) below.
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

    phenotype_col_index1 = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id1]))[0]
    phenotype_col_index2 = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id2]))[0]

    # keep only the rows in which both phenotypes are available
    x_ls = []
    y_ls = []
    for i in range(phenData.data_matrix.shape[0]):
        row = phenData.data_matrix[i]
        x_value = row[phenotype_col_index1]
        y_value = row[phenotype_col_index2]
        if numpy.isnan(x_value) or numpy.isnan(y_value):
            continue
        x_ls.append(x_value)
        y_ls.append(y_value)

    pylab.clf()
    pylab.title('Phenotype Contrast')
    pylab.plot(x_ls, y_ls, '.', alpha=0.6)
    pylab.grid(alpha=0.3)
    phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id1)
    phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id2)
    pylab.xlabel(phenotype_method1.short_name)
    pylab.ylabel(phenotype_method2.short_name)

    # draw diagonal line to show perfect correlation
    if x_ls:
        max_min_value = max(min(x_ls), min(y_ls))
        min_max_value = min(max(x_ls), max(y_ls))
        pylab.plot([max_min_value, min_max_value], [max_min_value, min_max_value],
                   c='g', alpha=0.7)
    else:
        sys.stderr.write("Warning: no row has both phenotypes available. "
                         "Diagonal line is skipped.\n")

    png_output_fname = '%s.png' % self.output_fname_prefix
    pylab.savefig(png_output_fname, dpi=400)
    pylab.savefig('%s.svg' % self.output_fname_prefix)
def run(self):
    """
    2008-09-06

    Entry point executed by every MPI process. What a process does depends on
    its rank in the communicator:
        rank 0: input node. Calls inputNodePrepare(), sends the pickled data
            to the other nodes, then hands params_ls to inputNode().
        rank 1 .. size-2: computing nodes. Receive the pickled SNP data and
            other_data from rank 0 and run computing_node() with
            computing_node_handler.
        rank size-1: output node. Receives the phenotype labels and indices
            from rank 0 and runs general_output_node().

    If self.debug is set, the process drops into pdb, loads and pickles the
    input once and exits with status 2 before any MPI call is made.
    """
    # NOTE(review): the indentation of this block was reconstructed. The debug
    # branch is assumed to run through sys.exit(2), because everything after it
    # would otherwise be unreachable -- confirm against the original file.
    if self.debug:
        #for one-node testing purpose
        import pdb
        pdb.set_trace()
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, turn_into_array=1)    #category_list is not used to facilitate row-id matching

        picklef = open(self.snps_context_fname)
        snps_context_wrapper = cPickle.load(picklef)
        del picklef
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        gene_id_ls = gene_id2snps_id_ls.keys()
        gene_id_ls.sort()

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
        # NOTE(review): phenData gets the SNP file's strain_acc_list as row labels here,
        # whereas inputNodePrepare() builds it from strain_acc_list_phen -- confirm which one is intended.
        phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list, data_matrix=data_matrix_phen)    #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)

        # the pickles below are not used afterwards; the process exits right away
        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=gene_id_ls, phenData=phenData)
        other_data_pickle = cPickle.dumps(other_data, -1)
        phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
        snpData_pickle = cPickle.dumps(snpData, -1)
        sys.exit(2)

    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size-1)    #exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size-1

    # phase 1: distribute the initial data from rank 0
    if node_rank == 0:
        dstruc = self.inputNodePrepare()
        params_ls = dstruc.params_ls
        #send the output node the phenotype_label_ls
        self.communicator.send(dstruc.output_node_data_pickle, output_node_rank, 0)
        del dstruc.output_node_data_pickle

        for node in free_computing_nodes:    #send it to the computing_node
            sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
            # two messages per computing node; the receiving branch below reads them in this order
            self.communicator.send(dstruc.snpData_pickle, node, 0)
            self.communicator.send(dstruc.other_data_pickle, node, 0)
            sys.stderr.write(".\n")
        del dstruc
    elif node_rank in free_computing_node_set:
        # first message: SNP data
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
        # second message: other_data
        data, source, tag = self.communicator.receiveString(0, 0)
        other_data = cPickle.loads(data)
        del data
        self.phenotype_index_ls = other_data.phenotype_index_ls
    else:
        # output node; despite its name, output_node_data_pickle holds the unpickled object
        data, source, tag = self.communicator.receiveString(0, 0)
        output_node_data_pickle = cPickle.loads(data)
        phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
        self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls

    self.synchronize()
    # phase 2: each node takes up its role
    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
        self.inputNode(param_obj, free_computing_nodes, param_generator = params_ls)
        #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
            gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
            phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
            test_type=self.test_type)
        self.computing_node(computing_parameter_obj, self.computing_node_handler)
    else:
        self.general_output_node(self.output_dir, self.phenotype_index_ls, phenotype_label_ls, free_computing_nodes)
    self.synchronize()    #to avoid some node early exits
def run(self):
    """
    Draw the strains in PCA space together with their map positions, coloured
    by one phenotype, and save the figure.

    The principal components are either read from self.eigen_vector_fname and
    self.eigen_value_fname (both have to be set), or PC_matrix/explained_var
    are passed as None to getStrainID2PCAPosInfo(), after the SNP matrix has
    been cut down to at most 10000 randomly picked SNPs.

    Output files: '<output_fname_prefix>.png' (dpi=400) and
    '<output_fname_prefix>.svg'. plotLatLonPhenVsPC() is called afterwards
    with the same prefix and commit=self.commit.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                    password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    snpData = SNPData(input_fname=self.input_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1)

    if self.eigen_vector_fname and self.eigen_value_fname:
        eigen_value_ls = self.getEigenValueFromFile(self.eigen_value_fname)
        eigen_value_ls = numpy.array(eigen_value_ls)
        # each eigenvalue as a fraction of the sum of all eigenvalues
        explained_var = eigen_value_ls/numpy.sum(eigen_value_ls)
        PC_data = self.getPCFromFile(self.eigen_vector_fname)
        PC_matrix = PC_data.PC_matrix
    else:
        # NOTE(review): indentation was reconstructed. convertSNPAllele2Index() and the
        # two None assignments are assumed to belong to this else-branch, outside the
        # down-sampling "if" -- confirm against the original file.
        max_no_of_snps = 10000
        if len(snpData.col_id_ls)>max_no_of_snps:    #2008-12-01 randomly pick max_no_of_snps SNPs
            picked_col_index_ls = random.sample(range(len(snpData.col_id_ls)), max_no_of_snps)
            new_col_id_ls = [snpData.col_id_ls[i] for i in picked_col_index_ls]
            newSnpData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=new_col_id_ls, strain_acc_list=snpData.strain_acc_list,\
                category_list=snpData.category_list)
            newSnpData.data_matrix = snpData.data_matrix[:, picked_col_index_ls]
            snpData = newSnpData
        snpData, allele_index2allele_ls = snpData.convertSNPAllele2Index()
        explained_var = None
        PC_matrix = None

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)    #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)    #tricky, using strain_acc_list_phen

    phenotype_col_index = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]

    ecotype_info = getEcotypeInfo(db, self.country_order_type)

    #the offset below decides where the label of strains/snps should start in axe_snp_matrix
    #2008-11-14 only for PlotGroupOfSNPs.py. you can set it to 1 because we don't draw axe_snp_matrix here.
    snp_id_label_y_offset = 0.95
    StrainID2PCAPosInfo = self.getStrainID2PCAPosInfo(snpData, pca_range=[0,1], snp_id_label_y_offset=snp_id_label_y_offset, explained_var=explained_var, T=PC_matrix)

    # figure layout; every value below is a member of a [left, bottom, width, height] list passed to pylab.axes()
    axe_y_offset1 = 0.03
    axe_height1 = 0.45    #height of axe_chromosome, twice height of axe_map_phenotype_legend
    axe_y_offset2 = axe_y_offset1+axe_height1
    axe_height2 = 0.5    #height of axe_strain_pca, axe_snp_matrix, axe_map
    axe_y_offset3 = axe_y_offset2+axe_height2    # not used below

    axe_x_offset1 = 0.05
    axe_width1 = 0.8    #width of axe_strain_pca
    axe_x_offset2 = axe_x_offset1 + 0.02 + axe_width1
    axe_width2 = 0.05    #width of axe_chromosome, axe_snp_matrix, axe_snp_pca
    axe_x_offset3 = axe_x_offset2 + axe_width2    # not used below
    axe_width3 = 0.02    #width of axe_phenotype; not used below

    phenotype_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id)

    # colour scale of the phenotype: observed range widened by 10% of the range on both ends
    phenotype_cmap = mpl.cm.jet
    max_phenotype = numpy.nanmax(phenData.data_matrix[:,phenotype_col_index])    #nanmax ignores the nan elements
    min_phenotype = numpy.nanmin(phenData.data_matrix[:,phenotype_col_index])    #nanmin ignores the nan elements
    phenotype_gap = max_phenotype - min_phenotype
    phenotype_jitter = phenotype_gap/10.
    phenotype_norm = mpl.colors.Normalize(vmin=min_phenotype-phenotype_jitter, vmax=max_phenotype+phenotype_jitter)
    axe_map_phenotype_legend = pylab.axes([axe_x_offset2, axe_y_offset1, axe_width2, 0.3], frameon=False)
    cb = mpl.colorbar.ColorbarBase(axe_map_phenotype_legend, cmap=phenotype_cmap,
                                norm=phenotype_norm,
                                orientation='vertical')
    cb.set_label('Legend Of Phenotype %s %s'%(phenotype_method.id, phenotype_method.short_name))

    # the map sits above the PCA panel (its bottom is axe_y_offset2)
    axe_strain_map = pylab.axes([axe_x_offset1, axe_y_offset2, axe_width1, axe_height2], frameon=False)
    axe_strain_pca = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1], frameon=False)
    axe_strain_map_pca_cover = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1+axe_height2], frameon=False, \
        sharex=axe_strain_pca)    #cover both axe_strain_map and axe_strain_pca
    axe_strain_map_pca_cover.set_yticks([])
    axe_strain_pca_xlim = [-0.05,1.05]
    axe_strain_pca_ylim = [0, 1.05]
    axe_strain_pca.set_xlim(axe_strain_pca_xlim)
    axe_strain_pca.set_ylim(axe_strain_pca_ylim)
    axe_strain_map_pca_cover_ylim = [0, (axe_height1+axe_height2)/axe_height1]    #set it accordingly
    axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)

    axe_strain_pca.grid(True, alpha=0.3)
    axe_strain_pca.set_xticks([])
    axe_strain_pca.set_yticks([])
    axe_strain_pca_legend = None    #no pca legend
    self.drawStrainPCA(axe_strain_pca, axe_strain_map, axe_strain_map_pca_cover, axe_strain_pca_legend, StrainID2PCAPosInfo, \
        ecotype_info, phenData, \
        phenotype_col_index, phenotype_cmap, phenotype_norm, rightmost_x_value=axe_strain_pca_xlim[1],\
        strain_color_type=2, pca2map_line_color=None, ecotype_width_on_map=10,\
        draw_lines_to_axe_snp_matrix = False, strain_size_on_axe_strain_pca=14, pic_area=self.pic_area,\
        map_pca_line_alpha=0.2, map_pca_linewidth=0.2)    #customize a couple of things

    # the same limits as before drawStrainPCA() are applied once more
    axe_strain_pca.set_xlim(axe_strain_pca_xlim)
    axe_strain_pca.set_ylim(axe_strain_pca_ylim)
    axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)

    png_output_fname = '%s.png'%self.output_fname_prefix
    pylab.savefig(png_output_fname, dpi=400)
    pylab.savefig('%s.svg'%self.output_fname_prefix)

    self.plotLatLonPhenVsPC(ecotype_info, StrainID2PCAPosInfo, phenData, phenotype_col_index, phenotype_cmap, phenotype_norm,
        self.output_fname_prefix, commit=self.commit)
def run(self):
    """
    2008-12-02

    Convert a SNP matrix, and optionally one phenotype, into plain text files.

    Input:
        self.input_fname: SNP matrix; its delimiter is guessed by
            figureOutDelimiter(). The 2nd column is kept as category_list only
            if self.array_id_2nd_column is set.
        self.phenotype_fname + self.phenotype_method_id: if both are set, the
            matching phenotype column is written as well.

    Output (prefix is self.output_fname_prefix):
        <prefix>_<phenotype label>.pheno: one value per line ('/' in the label
            is replaced by '_'). NaN is written as 9 if
            self.phenotype_is_binary, otherwise as -100.0.
        <prefix>.geno: one line per SNP, one character per accession:
            '0' for allele 0, '2' for allele 1, '9' for anything else.
        <prefix>.snp: tab-delimited: snp_id, chromosome, 0.0, position,
            major allele, minor allele.
        <prefix>.ind: tab-delimited: accession id, 'U', 'Case'.

    All output files are closed explicitly, also when an exception is raised
    in the middle of writing.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                          data_matrix=data_matrix)
    else:
        # ignore category_list
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.phenotype_fname and self.phenotype_method_id:
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
        phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list,
                           data_matrix=data_matrix_phen)
        # tricky, using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)
        phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id]))[0]
        phenotype_label = phenData.col_id_ls[phenotype_col_index]
        phenotype_f = open('%s_%s.pheno' % (self.output_fname_prefix,
                                            phenotype_label.replace('/', '_')), 'w')
        try:
            for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                # binary and non-binary have different NA designator
                if self.phenotype_is_binary:
                    if numpy.isnan(phenotype_value):
                        phenotype_value = 9
                    else:
                        phenotype_value = int(phenotype_value)
                elif numpy.isnan(phenotype_value):
                    phenotype_value = -100.0
                phenotype_f.write('%s\n' % phenotype_value)
        finally:
            phenotype_f.close()

    # keep the file objects under their own names so that they can be closed
    genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
    ind_f = open('%s.ind' % self.output_fname_prefix, 'w')
    snp_f = open('%s.snp' % self.output_fname_prefix, 'w')
    try:
        ind_writer = csv.writer(ind_f, delimiter='\t')
        snp_writer = csv.writer(snp_f, delimiter='\t')

        # transpose it: rows become SNPs, columns become accessions
        newSnpData = transposeSNPData(newSnpData)
        no_of_rows = len(newSnpData.data_matrix)
        no_of_cols = len(newSnpData.data_matrix[0])

        # write out the accessions, once
        for j in range(no_of_cols):
            ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])

        for i in range(no_of_rows):
            snp_id = newSnpData.row_id_ls[i]
            chromosome, pos = snp_id.split('_')
            allele1 = allele_index2allele_ls[i][0]    # major allele
            allele2 = allele_index2allele_ls[i][1]    # minor allele
            snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])

            genotype_row = newSnpData.data_matrix[i]
            geno_code_ls = []
            for j in range(no_of_cols):
                allele = genotype_row[j]
                if allele == 0:
                    geno_code_ls.append('0')
                elif allele == 1:
                    geno_code_ls.append('2')
                else:
                    geno_code_ls.append('9')
            geno_code_ls.append('\n')
            genotype_f.write(''.join(geno_code_ls))
        del ind_writer, snp_writer
    finally:
        genotype_f.close()
        ind_f.close()
        snp_f.close()
def run(self):
    """
    2008-12-02

    Convert a SNP matrix, and optionally one phenotype, into plain text files.

    Input:
        self.input_fname: SNP matrix; its delimiter is guessed by
            figureOutDelimiter(). The 2nd column is kept as category_list only
            if self.array_id_2nd_column is set.
        self.phenotype_fname + self.phenotype_method_id: if both are set, the
            matching phenotype column is written as well.

    Output (prefix is self.output_fname_prefix):
        <prefix>_<phenotype label>.pheno: one value per line ('/' in the label
            is replaced by '_'). NaN is written as 9 if
            self.phenotype_is_binary, otherwise as -100.0.
        <prefix>.geno: one line per SNP, one character per accession:
            '0' for allele 0, '2' for allele 1, '9' for anything else.
        <prefix>.snp: tab-delimited: snp_id, chromosome, 0.0, position,
            major allele, minor allele.
        <prefix>.ind: tab-delimited: accession id, 'U', 'Case'.

    All output files are closed explicitly, also when an exception is raised
    in the middle of writing.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                          data_matrix=data_matrix)
    else:
        # ignore category_list
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.phenotype_fname and self.phenotype_method_id:
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
        phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list,
                           data_matrix=data_matrix_phen)
        # tricky, using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)
        phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id]))[0]
        phenotype_label = phenData.col_id_ls[phenotype_col_index]
        phenotype_f = open('%s_%s.pheno' % (self.output_fname_prefix,
                                            phenotype_label.replace('/', '_')), 'w')
        try:
            for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                # binary and non-binary have different NA designator
                if self.phenotype_is_binary:
                    if numpy.isnan(phenotype_value):
                        phenotype_value = 9
                    else:
                        phenotype_value = int(phenotype_value)
                elif numpy.isnan(phenotype_value):
                    phenotype_value = -100.0
                phenotype_f.write('%s\n' % phenotype_value)
        finally:
            phenotype_f.close()

    # keep the file objects under their own names so that they can be closed
    genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
    ind_f = open('%s.ind' % self.output_fname_prefix, 'w')
    snp_f = open('%s.snp' % self.output_fname_prefix, 'w')
    try:
        ind_writer = csv.writer(ind_f, delimiter='\t')
        snp_writer = csv.writer(snp_f, delimiter='\t')

        # transpose it: rows become SNPs, columns become accessions
        newSnpData = transposeSNPData(newSnpData)
        no_of_rows = len(newSnpData.data_matrix)
        no_of_cols = len(newSnpData.data_matrix[0])

        # write out the accessions, once
        for j in range(no_of_cols):
            ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])

        for i in range(no_of_rows):
            snp_id = newSnpData.row_id_ls[i]
            chromosome, pos = snp_id.split('_')
            allele1 = allele_index2allele_ls[i][0]    # major allele
            allele2 = allele_index2allele_ls[i][1]    # minor allele
            snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])

            genotype_row = newSnpData.data_matrix[i]
            geno_code_ls = []
            for j in range(no_of_cols):
                allele = genotype_row[j]
                if allele == 0:
                    geno_code_ls.append('0')
                elif allele == 1:
                    geno_code_ls.append('2')
                else:
                    geno_code_ls.append('9')
            geno_code_ls.append('\n')
            genotype_f.write(''.join(geno_code_ls))
        del ind_writer, snp_writer
    finally:
        genotype_f.close()
        ind_f.close()
        snp_f.close()
def run(self):
    """
    2008-09-06

    Entry point executed by every MPI process. What a process does depends on
    its rank in the communicator:
        rank 0: input node. Calls inputNodePrepare(), sends the pickled data
            to the other nodes, then hands params_ls to inputNode().
        rank 1 .. size-2: computing nodes. Receive the pickled SNP data and
            other_data from rank 0 and run computing_node() with
            computing_node_handler.
        rank size-1: output node. Receives the phenotype labels and indices
            from rank 0 and runs general_output_node().

    If self.debug is set, the process drops into pdb, loads and pickles the
    input once and exits with status 2 before any MPI call is made.
    """
    # NOTE(review): the indentation of this block was reconstructed. The debug
    # branch is assumed to run through sys.exit(2), because everything after it
    # would otherwise be unreachable -- confirm against the original file.
    if self.debug:
        #for one-node testing purpose
        import pdb
        pdb.set_trace()
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, turn_into_array=1)  #category_list is not used to facilitate row-id matching

        picklef = open(self.snps_context_fname)
        snps_context_wrapper = cPickle.load(picklef)
        del picklef
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(
            snps_context_wrapper)
        gene_id_ls = gene_id2snps_id_ls.keys()
        gene_id_ls.sort()

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # NOTE(review): phenData gets the SNP file's strain_acc_list as row labels here,
        # whereas inputNodePrepare() builds it from strain_acc_list_phen -- confirm which one is intended.
        phenData = SNPData(
            header=header_phen,
            strain_acc_list=strain_acc_list,
            data_matrix=data_matrix_phen
        )  #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)

        # the pickles below are not used afterwards; the process exits right away
        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                 gene_id_ls=gene_id_ls,
                                 phenData=phenData)
        other_data_pickle = cPickle.dumps(other_data, -1)
        phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
        snpData_pickle = cPickle.dumps(snpData, -1)
        sys.exit(2)

    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size - 1)  #exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1

    # phase 1: distribute the initial data from rank 0
    if node_rank == 0:
        dstruc = self.inputNodePrepare()
        params_ls = dstruc.params_ls
        #send the output node the phenotype_label_ls
        self.communicator.send(dstruc.output_node_data_pickle,
                               output_node_rank, 0)
        del dstruc.output_node_data_pickle

        for node in free_computing_nodes:  #send it to the computing_node
            sys.stderr.write(
                "passing initial data to nodes from %s to %s ... " % (node_rank, node))
            # two messages per computing node; the receiving branch below reads them in this order
            self.communicator.send(dstruc.snpData_pickle, node, 0)
            self.communicator.send(dstruc.other_data_pickle, node, 0)
            sys.stderr.write(".\n")
        del dstruc
    elif node_rank in free_computing_node_set:
        # first message: SNP data
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
        # second message: other_data
        data, source, tag = self.communicator.receiveString(0, 0)
        other_data = cPickle.loads(data)
        del data
        self.phenotype_index_ls = other_data.phenotype_index_ls
    else:
        # output node; despite its name, output_node_data_pickle holds the unpickled object
        data, source, tag = self.communicator.receiveString(0, 0)
        output_node_data_pickle = cPickle.loads(data)
        phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
        self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls

    self.synchronize()
    # phase 2: each node takes up its role
    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls,
                                output_node_rank=output_node_rank,
                                report=self.report,
                                counter=0)
        self.inputNode(param_obj,
                       free_computing_nodes,
                       param_generator=params_ls)
        #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
            gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
            phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
            test_type=self.test_type)
        self.computing_node(computing_parameter_obj,
                            self.computing_node_handler)
    else:
        self.general_output_node(self.output_dir, self.phenotype_index_ls,
                                 phenotype_label_ls, free_computing_nodes)
    self.synchronize()  #to avoid some node early exits
def run(self):
    """
    Scatter-plot one phenotype against another and save the figure.

    The phenotype matrix is read from self.phenotype_fname. The column of
    self.phenotype_method_id1 goes on the x-axis, the column of
    self.phenotype_method_id2 on the y-axis; rows where either value is NaN
    are dropped. Axis labels are the short_name of the two
    Stock_250kDB.PhenotypeMethod entries.

    A green diagonal marks perfect correlation over the value range shared by
    both phenotypes. When no row has both values, the diagonal is skipped
    (with a warning on stderr): min()/max() would raise ValueError on the
    empty lists and no figure would be written at all.

    Output files: '<output_fname_prefix>.png' (dpi=400) and
    '<output_fname_prefix>.svg'.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    import sys  # local import, only needed for the warning below

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.db_user,
                                   password=self.db_passwd,
                                   hostname=self.hostname,
                                   database=self.dbname,
                                   schema=self.schema)
    db.setup(create_tables=False)
    # not used below; the attribute access is kept in case it has a side effect
    session = db.session

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # tricky, using strain_acc_list_phen: the matrix is ordered against its own row ids.
    # The result is used as a 2-D array (.shape) below.
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

    phenotype_col_index1 = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id1]))[0]
    phenotype_col_index2 = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id2]))[0]

    # keep only the rows in which both phenotypes are available
    x_ls = []
    y_ls = []
    for i in range(phenData.data_matrix.shape[0]):
        row = phenData.data_matrix[i]
        x_value = row[phenotype_col_index1]
        y_value = row[phenotype_col_index2]
        if numpy.isnan(x_value) or numpy.isnan(y_value):
            continue
        x_ls.append(x_value)
        y_ls.append(y_value)

    pylab.clf()
    pylab.title('Phenotype Contrast')
    pylab.plot(x_ls, y_ls, '.', alpha=0.6)
    pylab.grid(alpha=0.3)
    phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(
        self.phenotype_method_id1)
    phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(
        self.phenotype_method_id2)
    pylab.xlabel(phenotype_method1.short_name)
    pylab.ylabel(phenotype_method2.short_name)

    # draw diagonal line to show perfect correlation
    if x_ls:
        max_min_value = max(min(x_ls), min(y_ls))
        min_max_value = min(max(x_ls), max(y_ls))
        pylab.plot([max_min_value, min_max_value],
                   [max_min_value, min_max_value],
                   c='g',
                   alpha=0.7)
    else:
        sys.stderr.write("Warning: no row has both phenotypes available. "
                         "Diagonal line is skipped.\n")

    png_output_fname = '%s.png' % self.output_fname_prefix
    pylab.savefig(png_output_fname, dpi=400)
    pylab.savefig('%s.svg' % self.output_fname_prefix)