def run(self):
	"""
	2008-08-19
	Run the candidate-gene top-SNP enrichment test for every result in
	self.results_id_ls, optionally writing each result row to a TSV file
	(self.output_fname) and saving it to the database (flushed only when
	self.commit is set).
	"""
	if self.debug:
		import pdb
		pdb.set_trace()
	# One DB connection for the whole run; tables are assumed to exist already.
	db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
					password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
	db.setup(create_tables=False)
	session = db.session
	# Universe of genes used as the background for the enrichment test.
	total_gene_id_ls = get_total_gene_ls(db.metadata.bind)
	no_of_total_genes = len(total_gene_id_ls)
	#no_of_total_genes = self.getNoOfTotalGenes(db, self.gene_table, self.tax_id)
	#if self.commit:
	#	session.begin()
	# Fetch (or create) the test-type row matching this parameter combination.
	_type = self.getTopSNPTestType(self.get_closest, self.min_MAF, \
									self.allow_two_sample_overlapping, self.results_type,\
									self.test_type_id, self.null_distribution_type_id)
	snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
	# pd bundles every per-run parameter; pd.results_id is filled in per iteration below.
	pd = PassingData(list_type_id=self.list_type_id, snps_context_wrapper=snps_context_wrapper, \
					no_of_total_genes=no_of_total_genes, results_directory=self.results_directory, \
					min_MAF=self.min_MAF, get_closest=self.get_closest, min_distance=self.min_distance,\
					no_of_top_snps=self.no_of_top_snps, min_sample_size=self.min_sample_size, test_type_id=self.test_type_id, \
					results_type=self.results_type, no_of_permutations=self.no_of_permutations,\
					no_of_min_breaks=self.no_of_min_breaks, type_id=_type.id,\
					null_distribution_type_id=self.null_distribution_type_id,\
					allow_two_sample_overlapping=self.allow_two_sample_overlapping, total_gene_id_ls=total_gene_id_ls,\
					min_score=self.min_score, commit=self.commit)
	if getattr(self, 'output_fname', None):
		# TSV output mirrors the columns of the CandidateGeneTopSNPTest table.
		writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
		header_row = []
		for column in CandidateGeneTopSNPTest.c.keys():
			header_row.append(column)
		writer.writerow(header_row)
	else:
		writer = None
	
	#2008-10-31 setting up list accordingly
	# Exactly one cutoff is used: a score threshold when min_score is given,
	# otherwise a top-N-SNPs count.
	if self.min_score:
		pd.min_score_ls = [self.min_score]
	else:
		pd.no_of_top_snps_ls = [self.no_of_top_snps]
	for results_id in self.results_id_ls:
		pd.results_id = results_id
		#self.runEnrichmentTestToGetNullData(session, pd)
		return_data = self.runHGTest(pd)
		# Only the first test result is persisted/printed for each results_id.
		result = return_data.result_ls[0]
		if result is not None:
			result.type = _type	#assign the type here
			row = []
			for column in result.c.keys():
				row.append(getattr(result, column))
				print '%s: %s'%(column, row[-1])
			if writer:
				writer.writerow(row)
			session.save(result)
			if self.commit:
				session.flush()
def run(self):
	"""
	2008-08-20
	MPI entry point. Node 0 prepares the shared data (gene universe +
	snps-context wrapper) and dispatches test parameters; the middle ranks
	run the enrichment tests; the last rank collects output and optionally
	writes a TSV file / commits to the database.
	
	Bug fix: the invalid-results_type branch used to format `pd.results_type`,
	but no `pd` exists in this scope, so an invalid value raised a NameError
	instead of printing the intended message. It now uses `self.results_type`.
	"""
	self.communicator = MPI.world.duplicate()
	node_rank = self.communicator.rank
	free_computing_nodes = range(1, self.communicator.size-1)	#exclude the 1st and last node
	free_computing_node_set = Set(free_computing_nodes)
	output_node_rank = self.communicator.size-1
	
	#2008-10-30 comment out because computing node is gonna save the stuff itself.
	if node_rank!=output_node_rank:	#to reduce the number of connections/queries to the master
		self.hostname = self.alter_hostname
	db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
					password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
	db.setup(create_tables=False)
	session = db.session
	
	# Pick the result-source class and the table class that stores test outcomes.
	if self.results_type==1:
		ResultsClass = Stock_250kDB.ResultsMethod
		TestResultClass = Stock_250kDB.CandidateGeneTopSNPTestRM
	elif self.results_type==2:
		ResultsClass = Stock_250kDB.ResultsByGene
		TestResultClass = Stock_250kDB.CandidateGeneTopSNPTest
	elif self.results_type==3:
		ResultsClass = Stock_250kDB.ResultsMethod
		TestResultClass = Stock_250kDB.CandidateGeneTopSNPTestRG
	else:
		sys.stderr.write("Invalid results type : %s.\n"%self.results_type)	#bug fix: was pd.results_type (pd undefined here => NameError)
		sys.exit(3)
	
	if node_rank == 0:
		# Master: build the payload every computing node needs.
		pdata_for_computing = PassingData()
		pdata_for_computing.total_gene_id_ls = get_total_gene_ls(db.metadata.bind)
		pdata_for_computing.no_of_total_genes = len(pdata_for_computing.total_gene_id_ls)
		param_obj = PassingData(call_method_id=self.call_method_id, \
								analysis_method_id=getattr(self, 'analysis_method_id', None),\
								analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),\
								phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),\
								list_type_id_ls=self.list_type_id_ls, \
								results_type=self.results_type)
		params_ls = self.generate_params(param_obj, self.min_no_of_genes)
		cutoff_ls = self.generate_cutoff_ls(self.no_of_top_snps, self.min_score, self.rank_gap, self.stop_rank, self.window_with_same_rank_gap)
		params_ls = self.addCutoffToParamsLs(cutoff_ls, params_ls)
		pdata_for_computing.snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
		if self.debug:
			params_ls = params_ls[:100]	#debug: only a small slice of the work
		pdata_for_computing_pickle = cPickle.dumps(pdata_for_computing, -1)
		for node in free_computing_nodes:	#send it to the computing_node
			sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
			self.communicator.send(pdata_for_computing_pickle, node, 0)
			sys.stderr.write(".\n")
		del pdata_for_computing_pickle
		del pdata_for_computing
	elif node_rank in free_computing_node_set:
		# Computing node: receive and unpickle the shared payload from the master.
		data, source, tag = self.communicator.receiveString(0, 0)
		data = cPickle.loads(data)
		sys.stderr.write(".\n")
	else:
		pass	#output node needs no initial payload
	
	_type = self.getTopSNPTestType(self.get_closest, self.min_MAF, \
									self.allow_two_sample_overlapping, self.results_type,\
									self.test_type_id, self.null_distribution_type_id)
	self.synchronize()
	if node_rank == 0:
		parameter_list = [params_ls]
		self.input_node(parameter_list, free_computing_nodes, input_handler=self.input_handler, message_size=self.message_size)
	elif node_rank in free_computing_node_set:
		comp_param_obj = PassingData(snps_context_wrapper=data.snps_context_wrapper, \
									results_directory=self.results_directory, min_MAF=self.min_MAF,\
									no_of_total_genes=data.no_of_total_genes, \
									total_gene_id_ls=data.total_gene_id_ls,\
									type_id=_type.id,	#_type is placeholder. output_node decides on this.
									session=session)
		self.computing_node(comp_param_obj, self.computing_node_handler)
	else:
		# Output node: optional TSV with one column per TestResultClass field.
		if getattr(self, 'output_fname', None):
			writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
			header_row = []
			for column in TestResultClass.c.keys():
				header_row.append(column)
			writer.writerow(header_row)
		else:
			writer = None
		output_param_obj = PassingData(writer=writer, session=session, commit=self.commit, TestResultClass=TestResultClass, _type=_type)
		self.output_node(free_computing_nodes, output_param_obj, self.output_node_handler)
		del writer
	self.synchronize()	#to avoid some node early exits
def pick_candidate_genes(self, pd):
	"""
	2009-4-10
		rank is now part of the whole unique constraint in table Stock_250kDB.ResultsGene.
	2008-11-12
		if entry already exists in ResultsGene, make sure it doesn't have the current type associated with it.
		upon failure of 'session.flush()', expunge the current entry and report error. avoid failure of the whole program.
	2008-10-28
	
	Associate candidate-gene SNP hits of one result (pd.results_id) with
	pd.type in table ResultsGene, creating rows as needed. Flushes per entry
	when pd.commit is set, tolerating individual DB failures.
	
	Bug fix: the duplicate-entry error message used a bare `results_id`,
	which is undefined in this scope (NameError); it now uses pd.results_id.
	"""
	sys.stderr.write("Picking candidates genes from results (id=%s) for list_type (id=%s) ..."%(pd.results_id, pd.list_type_id))
	rm = Stock_250kDB.ResultsMethod.get(pd.results_id)
	session = getattr(pd, 'session', None)
	if session is None:
		sys.stderr.write("session is None. no db connection.\n")
		return None
	# list_type_id==0 means use every gene as the candidate set.
	if pd.list_type_id==0:
		total_gene_id_ls = get_total_gene_ls(Stock_250kDB.ResultsMethod.table.bind)
		candidate_gene_set = Set(total_gene_id_ls)
	else:
		candidate_gene_set = self.dealWithCandidateGeneList(pd.list_type_id, return_set=True)	#internal cache
	pd.construct_data_obj_id2index = False	#default in getResultMethodContent is True
	pd.construct_chr_pos2index = False	#no need for this as well
	pd.need_candidate_association = True
	pd.need_snp_index = False
	pd.candidate_gene_set = candidate_gene_set
	return_data = self.prepareDataForPermutationRankTest(rm, pd.snps_context_wrapper, pd)
	counter = 0
	for snps_id, disp_pos, gene_id, score, rank in return_data.candidate_association_ls:
		# (snps_id, gene_id, results_id, rank) is the unique constraint of ResultsGene.
		rows = Stock_250kDB.ResultsGene.query.filter_by(snps_id=snps_id).\
			filter_by(gene_id=gene_id).\
			filter_by(results_id=pd.results_id).filter_by(rank=rank)
		if rows.count()==1:
			row = rows.first()
			already_in_db = 0
			if pd.type.id:	#2008-11-12 only an already-persisted type can be present in row.types
				for hist_type in row.types:
					if hist_type.id==pd.type.id:
						already_in_db = 1
						break
			if not already_in_db:	#2008-11-12
				row.types.append(pd.type)
				session.save_or_update(row)
				counter += 1
		elif rows.count()>1:
			#bug fix: `results_id` was undefined here; pd.results_id is the intended value.
			sys.stderr.write("Error: more than 1 db entries with snps_id=%s, gene_id=%s, results_id=%s.\n"%\
							(snps_id, gene_id, pd.results_id))
			continue
		else:
			row = Stock_250kDB.ResultsGene(snps_id=snps_id, gene_id=gene_id, disp_pos=disp_pos,\
										results_id=pd.results_id, score=score, rank=rank)
			row.types.append(pd.type)
			session.save_or_update(row)
			counter += 1
		if pd.commit:
			try:	#2008-11-12 don't wanna db failure to bog down the whole program
				session.flush()
			except:
				session.expunge(row)
				for column in row.c.keys():
					sys.stderr.write("\t%s=%s.\n"%(column, getattr(row, column)))
				traceback.print_exc()
				sys.stderr.write('%s.\n'%repr(sys.exc_info()))
	sys.stderr.write("%s entries saved. Done.\n"%counter)
def pick_candidate_genes(self, pd):
	"""
	2010-4-19
		report the total count as well.
	2009-4-10
		rank is now part of the whole unique constraint in table Stock_250kDB.ResultsGene.
	2008-11-12
		if entry already exists in ResultsGene, make sure it doesn't have the current type associated with it.
		upon failure of 'session.flush()', expunge the current entry and report error. avoid failure of the whole program.
	2008-10-28
	
	Associate candidate-gene SNP hits of one result (pd.results_id) with
	pd.type in table ResultsGene, creating rows as needed. Reports both the
	number saved and the total number of candidate associations examined.
	
	Bug fix: the duplicate-entry error message used a bare `results_id`,
	which is undefined in this scope (NameError); it now uses pd.results_id.
	"""
	sys.stderr.write("Picking candidates genes from results (id=%s) for list_type (id=%s) ..."%\
					(pd.results_id, pd.list_type_id))
	rm = Stock_250kDB.ResultsMethod.get(pd.results_id)
	session = getattr(pd, 'session', None)
	if session is None:
		sys.stderr.write("session is None. no db connection.\n")
		return None
	# list_type_id==0 means use every gene as the candidate set.
	if pd.list_type_id==0:
		total_gene_id_ls = get_total_gene_ls(Stock_250kDB.ResultsMethod.table.bind)
		candidate_gene_set = Set(total_gene_id_ls)
	else:
		candidate_gene_set = self.dealWithCandidateGeneList(pd.list_type_id, return_set=True)	#internal cache
	pd.construct_data_obj_id2index = False	#default in getResultMethodContent is True
	pd.construct_chr_pos2index = False	#no need for this as well
	pd.need_candidate_association = True
	pd.need_snp_index = False
	pd.candidate_gene_set = candidate_gene_set
	return_data = self.prepareDataForPermutationRankTest(rm, pd.snps_context_wrapper, pd)
	counter = 0
	total_count = 0
	for snps_id, disp_pos, gene_id, score, rank in return_data.candidate_association_ls:
		total_count += 1
		# (snps_id, gene_id, results_id, rank) is the unique constraint of ResultsGene.
		rows = Stock_250kDB.ResultsGene.query.filter_by(snps_id=snps_id).\
			filter_by(gene_id=gene_id).\
			filter_by(results_id=pd.results_id).filter_by(rank=rank)
		if rows.count()==1:
			row = rows.first()
			already_in_db = 0
			if pd.type.id:	#2008-11-12 only an already-persisted type can be present in row.types
				for hist_type in row.types:
					if hist_type.id==pd.type.id:
						already_in_db = 1
						break
			if not already_in_db:	#2008-11-12
				row.types.append(pd.type)
				session.save_or_update(row)
				counter += 1
		elif rows.count()>1:
			if self.report:
				#bug fix: `results_id` was undefined here; pd.results_id is the intended value.
				sys.stderr.write("Error: more than 1 db entries with snps_id=%s, gene_id=%s, results_id=%s.\n"%\
								(snps_id, gene_id, pd.results_id))
			continue
		else:
			row = Stock_250kDB.ResultsGene(snps_id=snps_id, gene_id=gene_id, disp_pos=disp_pos,\
										results_id=pd.results_id, score=score, rank=rank)
			row.types.append(pd.type)
			session.add(row)
			counter += 1
		if pd.commit:
			try:	#2008-11-12 don't wanna db failure to bog down the whole program
				session.flush()
			except:
				session.expunge(row)
				for column in row.c.keys():
					sys.stderr.write("\t%s=%s.\n"%(column, getattr(row, column)))
				traceback.print_exc()
				sys.stderr.write('%s.\n'%repr(sys.exc_info()))
	sys.stderr.write("%s out of %s entries saved. Done.\n"%(counter, total_count))
def run(self):
	"""
	2008-12-08 if the plot under configuration is already in db, abort only if the program is gonna commit the database transaction.
	2008-10-19
		save figures in database if commit
	Plot score/rank histograms of candidate vs non-candidate gene SNPs for
	each phenotype, optionally storing the PNG/SVG figures in table
	ScoreRankHistogram when self.commit is set.
	"""
	if self.debug:
		import pdb
		pdb.set_trace()
	db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
					password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
	db.setup(create_tables=False)
	session = db.session
	#session.begin()
	
	# results_type selects where the association results come from;
	# the snps-context wrapper is only needed for ResultsMethod-based results.
	if self.results_type==1:
		ResultsClass = Stock_250kDB.ResultsMethod
		snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
	elif self.results_type==2:
		ResultsClass = Stock_250kDB.ResultsByGene
	else:
		sys.stderr.write("Invalid results type : %s.\n"%self.results_type)
		return None
	hist_type = self.getHistType(self.call_method_id, self.min_distance, self.get_closest, self.min_MAF, \
								self.allow_two_sample_overlapping, self.results_type, self.null_distribution_type_id)
	
	candidate_gene_list = self.getGeneList(self.list_type_id)
	if len(candidate_gene_list)<self.min_sample_size:
		sys.stderr.write("Candidate gene list of %s too small: %s.\n"%(self.list_type_id, len(candidate_gene_list)))
		sys.exit(4)
	#candidate_gene_list = []	#2009-01-12 just to plot the histogram of pvalue
	candidate_gene_set = Set(candidate_gene_list)
	list_type = Stock_250kDB.GeneListType.get(self.list_type_id)
	if list_type is None:
		sys.exit(3)
	
	phenotype_id2results_id_ls = self.getResultsIDLs(db, ResultsClass, self.results_type, self.phenotype_id_ls, \
									self.min_distance, self.get_closest, self.min_MAF, self.call_method_id)
	
	param_data = PassingData(results_directory=self.results_directory, candidate_gene_list=candidate_gene_list, \
			min_MAF=self.min_MAF, allow_two_sample_overlapping=self.allow_two_sample_overlapping, need_the_value=1, \
			do_log10_transformation=False)
	#need_the_value means to get the pvalue/score
	#force no log10 transformation. otherwise, transformation based on analysis_method
	if self.null_distribution_type_id==2 or self.null_distribution_type_id==3:	#gw-looping or random gene list
		# Pre-compute SNP indexes for the chosen null distribution.
		snp_info = DrawSNPRegion.getSNPInfo(db)
		candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(candidate_gene_set, snp_info.chr_pos_ls, snps_context_wrapper)
		no_of_snps = len(snp_info.chr_pos_ls)
		no_of_permutations = no_of_snps/len(candidate_gene_snp_index_ls) + 1
		param_data.chr_pos2index = snp_info.chr_pos2index	#pass to getGenomeWideResultFromFile
		if self.null_distribution_type_id==2:
			non_candidate_gene_snp_index_ls = self.get_non_candidate_gene_snp_index_ls_by_permutation(candidate_gene_snp_index_ls, no_of_snps, no_of_permutations)
		elif self.null_distribution_type_id == 3:
			# Random gene lists of the same size until >= no_of_snps SNP indexes are accumulated.
			gene_id_ls = get_total_gene_ls(db.metadata.bind)
			no_of_candidate_genes = len(candidate_gene_set)
			non_candidate_gene_snp_index_ls = numpy.zeros(0, numpy.int)
			while len(non_candidate_gene_snp_index_ls)<no_of_snps:
				non_candidate_gene_set = Set(random.sample(gene_id_ls, no_of_candidate_genes))
				_non_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(non_candidate_gene_set, snp_info.chr_pos_ls, snps_context_wrapper)
				non_candidate_gene_snp_index_ls = numpy.hstack((non_candidate_gene_snp_index_ls, _non_candidate_gene_snp_index_ls))
	
	for phenotype_id, results_id_ls in phenotype_id2results_id_ls.iteritems():
		if hist_type.id:	#hist_type already in database
			rows = Stock_250kDB.ScoreRankHistogram.query.filter_by(phenotype_method_id=phenotype_id).\
				filter_by(list_type_id=self.list_type_id).filter_by(hist_type_id=hist_type.id)
			if rows.count()>0 and self.commit:	#2008-12-08 only skip if the database transaction is gonna commit.
				row = rows.first()
				sys.stderr.write("Histogram already in database. id=%s, phenotype_id=%s, list_type_id=%s, hist_type_id=%s.\n"%\
								(row.id, row.phenotype_method_id, row.list_type_id, row.hist_type_id))
				continue
		phenotype_method = Stock_250kDB.PhenotypeMethod.get(phenotype_id)
		if not phenotype_method:
			continue
		score_rank_data_ls = []
		sys.stderr.write("Checking phenotype %s (%s) on list_type %s (%s) ...\n"%\
						(phenotype_method.id, phenotype_method.short_name, list_type.id, list_type.short_name))
		for results_id in results_id_ls:
			try:
				rm = ResultsClass.get(results_id)
				score_rank_data = None
				if self.null_distribution_type_id==1:
					# Null distribution 1: compare candidate vs non-candidate SNPs directly.
					if self.results_type==1:
						permData = self.prepareDataForPermutationRankTest(rm, snps_context_wrapper, param_data)
						if not permData:
							continue
						score_rank_data = PassingData(candidate_score_ls=permData.candidate_gene_snp_value_ls, \
											candidate_rank_ls=permData.candidate_gene_snp_rank_ls,\
											non_candidate_score_ls=permData.non_candidate_gene_snp_value_ls,
											non_candidate_rank_ls=permData.non_candidate_gene_snp_rank_ls,\
											analysis_method=rm.analysis_method)
						del permData
					elif self.results_type==2:
						score_rank_data = self.getScoreRankFromRBG(rm, candidate_gene_set, self.results_directory)
				elif self.null_distribution_type_id==2 or self.null_distribution_type_id==3:
					# Null distributions 2/3: use the pre-computed permutation SNP indexes.
					genome_wide_result = self.getResultMethodContent(rm, param_data.results_directory, param_data.min_MAF, pdata=param_data)
					if not genome_wide_result:
						continue
					score_rank_data = self.getScoreRankFromPermIndexLs(genome_wide_result, candidate_gene_snp_index_ls, non_candidate_gene_snp_index_ls)
					if score_rank_data:
						score_rank_data.analysis_method = rm.analysis_method
				if score_rank_data:
					score_rank_data_ls.append(score_rank_data)
			except:
				# Best-effort: a failing result must not abort the other phenotypes/results.
				sys.stderr.write("Exception happened for results_id=%s, phenotype_id=%s.\n"%(results_id, phenotype_id))
				traceback.print_exc()
				sys.stderr.write('%s.\n'%repr(sys.exc_info()))
				continue
		if score_rank_data_ls:
			score_png_data, score_svg_data = self.plotHistForOnePhenotype(phenotype_method, list_type, score_rank_data_ls, self.output_dir,
										data_type='score', commit=self.commit)
			rank_png_data, rank_svg_data = self.plotHistForOnePhenotype(phenotype_method, list_type, score_rank_data_ls, self.output_dir,
										data_type='rank', commit=self.commit)
			if self.commit:
				score_rank_hist = Stock_250kDB.ScoreRankHistogram(phenotype_method_id=phenotype_id, list_type_id=list_type.id)
				score_rank_hist.hist_type = hist_type
				score_rank_hist.score_hist = score_png_data.getvalue()
				score_rank_hist.score_hist_svg = score_svg_data.getvalue()
				score_rank_hist.rank_hist = rank_png_data.getvalue()
				score_rank_hist.rank_hist_svg = rank_svg_data.getvalue()
				session.save(score_rank_hist)
				session.flush()
			del score_png_data, score_svg_data, rank_png_data, rank_svg_data

# NOTE(review): the stray triple-quote below appears to open a commented-out
# region that continues beyond this chunk — confirm against the full file.
"""
# NOTE(review): this looks like a reformatted duplicate of the earlier MPI run();
# it may lie inside the triple-quoted (i.e. commented-out) region opened just
# above — confirm against the full file before relying on it being live code.
def run(self):
    """
    2008-08-20
    MPI entry point: node 0 dispatches parameters, middle ranks compute,
    the last rank collects/saves output.
    """
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size - 1)  #exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1

    #2008-10-30 comment out because computing node is gonna save the stuff itself.
    if node_rank != output_node_rank:  #to reduce the number of connections/queries to the master
        self.hostname = self.alter_hostname
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    # Pick the result-source class and the table class storing test outcomes.
    if self.results_type == 1:
        ResultsClass = Stock_250kDB.ResultsMethod
        TestResultClass = Stock_250kDB.CandidateGeneTopSNPTestRM
    elif self.results_type == 2:
        ResultsClass = Stock_250kDB.ResultsByGene
        TestResultClass = Stock_250kDB.CandidateGeneTopSNPTest
    elif self.results_type == 3:
        ResultsClass = Stock_250kDB.ResultsMethod
        TestResultClass = Stock_250kDB.CandidateGeneTopSNPTestRG
    else:
        # NOTE(review): `pd` is undefined in this scope — this looks like it
        # should be `self.results_type`; confirm before fixing.
        sys.stderr.write("Invalid results type : %s.\n" % pd.results_type)
        sys.exit(3)

    if node_rank == 0:
        # Master: build the payload every computing node needs.
        pdata_for_computing = PassingData()
        pdata_for_computing.total_gene_id_ls = get_total_gene_ls(
            db.metadata.bind)
        pdata_for_computing.no_of_total_genes = len(
            pdata_for_computing.total_gene_id_ls)
        param_obj = PassingData(call_method_id=self.call_method_id, \
                                analysis_method_id=getattr(self, 'analysis_method_id', None),\
                                analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),\
                                phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),\
                                list_type_id_ls=self.list_type_id_ls, \
                                results_type=self.results_type)
        params_ls = self.generate_params(param_obj, self.min_no_of_genes)
        cutoff_ls = self.generate_cutoff_ls(self.no_of_top_snps, self.min_score,
                                            self.rank_gap, self.stop_rank,
                                            self.window_with_same_rank_gap)
        params_ls = self.addCutoffToParamsLs(cutoff_ls, params_ls)
        pdata_for_computing.snps_context_wrapper = self.dealWithSnpsContextWrapper(
            self.snps_context_picklef, self.min_distance, self.get_closest)
        if self.debug:
            params_ls = params_ls[:100]  #debug: only a small slice of the work
        pdata_for_computing_pickle = cPickle.dumps(pdata_for_computing, -1)
        for node in free_computing_nodes:  #send it to the computing_node
            sys.stderr.write(
                "passing initial data to nodes from %s to %s ... " %
                (node_rank, node))
            self.communicator.send(pdata_for_computing_pickle, node, 0)
            sys.stderr.write(".\n")
        del pdata_for_computing_pickle
        del pdata_for_computing
    elif node_rank in free_computing_node_set:
        # Computing node: receive and unpickle the shared payload from the master.
        data, source, tag = self.communicator.receiveString(0, 0)
        data = cPickle.loads(data)
        sys.stderr.write(".\n")
    else:
        pass  #output node needs no initial payload

    _type = self.getTopSNPTestType(self.get_closest, self.min_MAF, \
                                   self.allow_two_sample_overlapping, self.results_type,\
                                   self.test_type_id, self.null_distribution_type_id)
    self.synchronize()
    if node_rank == 0:
        parameter_list = [params_ls]
        self.input_node(parameter_list, free_computing_nodes,
                        input_handler=self.input_handler,
                        message_size=self.message_size)
    elif node_rank in free_computing_node_set:
        comp_param_obj = PassingData(snps_context_wrapper=data.snps_context_wrapper, \
                                     results_directory=self.results_directory, min_MAF=self.min_MAF,\
                                     no_of_total_genes=data.no_of_total_genes, \
                                     total_gene_id_ls=data.total_gene_id_ls,\
                                     type_id=_type.id,  #_type is placeholder. output_node decides on this.
                                     session=session)
        self.computing_node(comp_param_obj, self.computing_node_handler)
    else:
        # Output node: optional TSV with one column per TestResultClass field.
        if getattr(self, 'output_fname', None):
            writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
            header_row = []
            for column in TestResultClass.c.keys():
                header_row.append(column)
            writer.writerow(header_row)
        else:
            writer = None
        output_param_obj = PassingData(writer=writer, session=session,
                                       commit=self.commit,
                                       TestResultClass=TestResultClass,
                                       _type=_type)
        self.output_node(free_computing_nodes, output_param_obj,
                         self.output_node_handler)
        del writer
    self.synchronize()  #to avoid some node early exits