def run(self):
    """
    Add gene symbols and descriptions to a TAIR gene GFF file.

    Steps:
        1. open a MySQL connection and build gene_symbol2gene_id_set via
           get_gene_symbol2gene_id_set() (tax id 3702).
        2. load the gene annotation through a DrawSNPRegion instance.
        3. call self.improveTAIRGeneGFF() with self.input_fname and
           self.output_fname.

    The MySQL connection is closed before returning, whether or not one of
    the steps raises.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    import MySQLdb
    from pymodule import get_gene_symbol2gene_id_set
    from variation.src.DrawSNPRegion import DrawSNPRegion

    # NOTE(review): the host is hard-coded here while DrawSNPRegion below is
    # given self.hostname -- confirm whether this is intentional.
    mysql_conn = MySQLdb.connect(
        db=self.dbname, host="banyan.usc.edu", user=self.db_user, passwd=self.db_passwd
    )
    try:
        mysql_curs = mysql_conn.cursor()
        gene_symbol2gene_id_set = get_gene_symbol2gene_id_set(
            mysql_curs, 3702, table="genome.gene_symbol2id", upper_case_gene_symbol=1
        )  # 3702 is At's tax id

        DrawSNPRegion_ins = DrawSNPRegion(
            db_user=self.db_user,
            db_passwd=self.db_passwd,
            hostname=self.hostname,
            database=self.dbname,
            input_fname="/tmp/dumb",
            output_dir="/tmp",
            debug=0,
        )  # input_fname and output_dir are just random stuff
        gene_annotation = DrawSNPRegion_ins.dealWithGeneAnnotation(
            self.gene_annotation_picklef, cls_with_db_args=DrawSNPRegion_ins
        )
        self.improveTAIRGeneGFF(
            self.input_fname, gene_symbol2gene_id_set, gene_annotation, self.output_fname
        )
    finally:
        # the connection used to be left open until interpreter exit
        mysql_conn.close()
def run(self):
    """
    Load the data structures through DrawSNPRegion, read the SNP regions
    from self.input_fname and check them against the per-analysis-method
    thresholds; results go to self.output_fname via self.checkRegions().
    """
    if self.debug == 1:
        import pdb
        pdb.set_trace()

    from DrawSNPRegion import DrawSNPRegion

    drawer = DrawSNPRegion(
        db_user=self.db_user,
        db_passwd=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
        input_fname='/tmp/dumb',
        output_dir='/tmp',
        debug=0
    )
    data_structure = drawer.loadDataStructure(
        self.gene_annotation_picklef,
        self.LD_info_picklef,
        self.LD_fname,
        min_MAF=self.min_MAF,
        min_distance=20000,
        list_type_id=None
    )
    regions = self.get_snp_region_ls(
        self.input_fname, data_structure.snp_info, self.min_distance
    )

    # minimum threshold for different analysis methods,
    # keyed by (analysis method id, 'value' or 'rank')
    thresholds = {
        (1, 'value'): 8.,
        (7, 'value'): 6.,
        (5, 'rank'): self.min_margarita_rank,
        (6, 'rank'): self.min_rf_rank,
    }
    self.checkRegions(
        drawer, data_structure.snp_info, regions, self.output_fname, thresholds
    )
def run(self):
    """
    Connect to the Stock_250kDB database, collect the SNP context wrapper,
    gene annotation, SNP info and the annotation short-name map, then hand
    them to self._constructSNPAnnotation().

    The session is flushed and committed only when self.commit is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
        schema=self.schema
    )
    db.setup(create_tables=False)
    session = db.session
    # note: no explicit session.begin() is issued here

    context_wrapper = GeneListRankTest.dealWithSnpsContextWrapper(
        self.snps_context_picklef, self.min_distance, self.get_closest
    )
    annotation = DrawSNPRegion.dealWithGeneAnnotation(self.gene_annotation_picklef)
    snp_info = DrawSNPRegion.getSNPInfo(db)
    short_name2id = self.getSNPAnnotationShortName2id()
    self._constructSNPAnnotation(
        session, snp_info, context_wrapper, annotation, short_name2id
    )

    if not self.commit:
        return
    session.flush()
    session.commit()
def calculateOverlappingStatForOneCombo(self, db, phenotype_method_id, call_method_id, analysis_method_id_ls,
                                        no_of_top_snps=1000, association_overlapping_type=None, commit=False,
                                        results_directory=None):
    """
    2012.3.23
        pass argument db_250k to ResultsMethod2Results.rm2result()
    2009-11-2

    Count the SNPs shared by the top-ranked SNPs of several analysis
    methods for one phenotype/call method and save the count as an
    AssociationOverlappingStat row.

    For every analysis method the matching ResultsMethod is looked up and
    its Results rows with rank <= no_of_top_snps are collected. If fewer
    than no_of_top_snps rows exist, ResultsMethod2Results.rm2result() is
    asked to fill ranks (existing count + 1) .. no_of_top_snps and the
    query is repeated. Per-result SNP id sets are cached in
    self.results_id2snp_id_set.

    Arguments:
        db: database object; db.session is used and db is passed on to
            rm2result() as db_250k and to DrawSNPRegion.getSNPInfo().
        analysis_method_id_ls: analysis method ids whose top SNPs are intersected.
        commit, results_directory: passed through to rm2result().

    Returns None. The function logs an error to stderr and returns early
    (nothing is saved) when
        - analysis_method_id_ls is empty,
        - no ResultsMethod matches one of the analysis methods,
        - the number of top SNPs still differs from no_of_top_snps.
    """
    sys.stderr.write("Calculating overlapping stat for phenotype %s and combo %s ...\n" %
                     (phenotype_method_id, repr(analysis_method_id_ls),))
    if not analysis_method_id_ls:
        # used to fail with IndexError on snp_id_set_ls[0]
        sys.stderr.write("Error: no analysis method given for phenotype %s. Nothing to overlap.\n" %
                         phenotype_method_id)
        return

    session = db.session
    snp_id_set_ls = []
    for analysis_method_id in analysis_method_id_ls:
        rm = Stock_250kDB.ResultsMethod.query.filter_by(phenotype_method_id=phenotype_method_id).\
            filter_by(call_method_id=call_method_id).filter_by(analysis_method_id=analysis_method_id).first()
        if rm is None:
            # used to fail with AttributeError on rm.id
            sys.stderr.write("Error: no ResultsMethod for phenotype %s, call method %s, analysis method %s.\n" %
                             (phenotype_method_id, call_method_id, analysis_method_id))
            return

        if rm.id in self.results_id2snp_id_set:
            snp_id_set = self.results_id2snp_id_set.get(rm.id)
        else:
            association_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).\
                filter(Stock_250kDB.Results.rank <= no_of_top_snps)
            no_of_association_entries = association_entries.count()
            if no_of_association_entries < no_of_top_snps:
                # not enough rows in db yet: have the missing ranks filled, then query again
                min_rank = no_of_association_entries + 1
                max_rank = no_of_top_snps
                if self.snp_info is None:
                    self.snp_info = DrawSNPRegion.getSNPInfo(db)
                ResultsMethod2Results.rm2result(session, rm, self.snp_info, min_rank=min_rank, max_rank=max_rank,
                                                commit=commit, results_directory=results_directory, db_250k=db)
                association_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).\
                    filter(Stock_250kDB.Results.rank <= no_of_top_snps)
                no_of_association_entries = association_entries.count()
            if no_of_association_entries != no_of_top_snps:
                sys.stderr.write(
                    "Error: The number of SNPs %s from Result %s (analysis_method_id %s) doesn't match the no_of_top_snps %s.\n"
                    % (no_of_association_entries, rm.id, rm.analysis_method_id, no_of_top_snps))
                return
            snp_id_set = set()
            for entry in association_entries:
                snp_id_set.add(entry.snps_id)
            self.results_id2snp_id_set[rm.id] = snp_id_set
        snp_id_set_ls.append(snp_id_set)

    overlapping_snp_id_set = snp_id_set_ls[0]
    for snp_id_set in snp_id_set_ls[1:]:
        # not "&=": the first set may be the cached one and must not be modified
        overlapping_snp_id_set = overlapping_snp_id_set & snp_id_set
    no_of_overlapping_snps = len(overlapping_snp_id_set)

    entry = Stock_250kDB.AssociationOverlappingStat(phenotype_method_id=phenotype_method_id,
                                                    call_method_id=call_method_id,
                                                    no_of_top_snps=no_of_top_snps,
                                                    no_of_overlapping_snps=no_of_overlapping_snps)
    entry.overlapping_type = association_overlapping_type
    session.save(entry)
    session.flush()
    sys.stderr.write("%s overlapping SNPs out of %s results. Done.\n" % (no_of_overlapping_snps, len(snp_id_set_ls)))
def run(self):
    """
    Connect to the Stock_250kDB database, collect the SNP context wrapper,
    the gene annotation (for self.tax_id, with this object supplying the db
    arguments), SNP info and the annotation short-name map, then hand them
    to self._constructSNPAnnotation().

    The session is flushed and committed only when self.commit is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
        schema=self.schema
    )
    db.setup(create_tables=False)
    session = db.session
    # note: no explicit session.begin() is issued here

    context_wrapper = GeneListRankTest.dealWithSnpsContextWrapper(
        self.snps_context_picklef, self.min_distance, self.get_closest
    )
    annotation = DrawSNPRegion.dealWithGeneAnnotation(
        self.gene_annotation_picklef, tax_id=self.tax_id, cls_with_db_args=self
    )
    snp_info = DrawSNPRegion.getSNPInfo(db)
    short_name2id = self.getSNPAnnotationShortName2id()
    self._constructSNPAnnotation(
        session, snp_info, context_wrapper, annotation, short_name2id
    )

    if not self.commit:
        return
    session.flush()
    session.commit()
def calculateOverlappingStatForOneCombo(self, db, phenotype_method_id, call_method_id, analysis_method_id_ls,
                                        no_of_top_snps=1000, association_overlapping_type=None, commit=False,
                                        results_directory=None):
    """
    2009-11-2

    Count the SNPs shared by the top-ranked SNPs of several analysis
    methods for one phenotype/call method and save the count as an
    AssociationOverlappingStat row.

    For every analysis method the matching ResultsMethod is looked up and
    its Results rows with rank <= no_of_top_snps are collected. If fewer
    than no_of_top_snps rows exist, ResultsMethod2Results.rm2result() is
    asked to fill ranks (existing count + 1) .. no_of_top_snps and the
    query is repeated. Per-result SNP id sets are cached in
    self.results_id2snp_id_set.

    Arguments:
        db: database object; db.session is used and db is passed to
            DrawSNPRegion.getSNPInfo().
        analysis_method_id_ls: analysis method ids whose top SNPs are intersected.
        commit, results_directory: passed through to rm2result().

    Returns None. The function logs an error to stderr and returns early
    (nothing is saved) when
        - analysis_method_id_ls is empty,
        - no ResultsMethod matches one of the analysis methods,
        - the number of top SNPs still differs from no_of_top_snps.
    """
    sys.stderr.write("Calculating overlapping stat for phenotype %s and combo %s ...\n" %
                     (phenotype_method_id, repr(analysis_method_id_ls),))
    if not analysis_method_id_ls:
        # used to fail with IndexError on snp_id_set_ls[0]
        sys.stderr.write("Error: no analysis method given for phenotype %s. Nothing to overlap.\n" %
                         phenotype_method_id)
        return

    session = db.session
    snp_id_set_ls = []
    for analysis_method_id in analysis_method_id_ls:
        rm = Stock_250kDB.ResultsMethod.query.filter_by(phenotype_method_id=phenotype_method_id).\
            filter_by(call_method_id=call_method_id).filter_by(analysis_method_id=analysis_method_id).first()
        if rm is None:
            # used to fail with AttributeError on rm.id
            sys.stderr.write("Error: no ResultsMethod for phenotype %s, call method %s, analysis method %s.\n" %
                             (phenotype_method_id, call_method_id, analysis_method_id))
            return

        if rm.id in self.results_id2snp_id_set:
            snp_id_set = self.results_id2snp_id_set.get(rm.id)
        else:
            association_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).\
                filter(Stock_250kDB.Results.rank <= no_of_top_snps)
            no_of_association_entries = association_entries.count()
            if no_of_association_entries < no_of_top_snps:
                # not enough rows in db yet: have the missing ranks filled, then query again
                min_rank = no_of_association_entries + 1
                max_rank = no_of_top_snps
                if self.snp_info is None:
                    self.snp_info = DrawSNPRegion.getSNPInfo(db)
                ResultsMethod2Results.rm2result(session, rm, self.snp_info, min_rank=min_rank, max_rank=max_rank,
                                                commit=commit, results_directory=results_directory)
                association_entries = Stock_250kDB.Results.query.filter_by(results_id=rm.id).\
                    filter(Stock_250kDB.Results.rank <= no_of_top_snps)
                no_of_association_entries = association_entries.count()
            if no_of_association_entries != no_of_top_snps:
                sys.stderr.write("Error: The number of SNPs %s from Result %s (analysis_method_id %s) doesn't match the no_of_top_snps %s.\n" % (no_of_association_entries, rm.id, rm.analysis_method_id, no_of_top_snps))
                return
            snp_id_set = set()
            for entry in association_entries:
                snp_id_set.add(entry.snps_id)
            self.results_id2snp_id_set[rm.id] = snp_id_set
        snp_id_set_ls.append(snp_id_set)

    overlapping_snp_id_set = snp_id_set_ls[0]
    for snp_id_set in snp_id_set_ls[1:]:
        # not "&=": the first set may be the cached one and must not be modified
        overlapping_snp_id_set = overlapping_snp_id_set & snp_id_set
    no_of_overlapping_snps = len(overlapping_snp_id_set)

    entry = Stock_250kDB.AssociationOverlappingStat(phenotype_method_id=phenotype_method_id,
                                                    call_method_id=call_method_id,
                                                    no_of_top_snps=no_of_top_snps,
                                                    no_of_overlapping_snps=no_of_overlapping_snps)
    entry.overlapping_type = association_overlapping_type
    session.save(entry)
    session.flush()
    sys.stderr.write("%s overlapping SNPs out of %s results. Done.\n" % (no_of_overlapping_snps, len(snp_id_set_ls)))
def on_button_draw_annotation_clicked(self, widget, data=None):
    """
    2008-12-16
        use DrawSNPRegion.drawGeneModel() to draw gene models
    2008-02-02

    Button callback: draw gene models for the x-range currently shown in
    self.axe_gene_model and redraw the canvas.

    Does nothing (apart from a message on stderr) while self.chr_id2size is
    empty, i.e. before a genome-wide pvalue plot has been drawn. Calls
    self.db_connect() first if self.gene_annotation is not loaded yet.

    widget and data are not used.
    """
    if not self.chr_id2size:
        sys.stderr.write("No genome-wide pvalue plot has been drawn yet. Do it first!\n")
        return
    if not self.gene_annotation:
        self.db_connect()

    # translate both ends of the visible x-range into (chromosome, position)
    xlim = self.axe_gene_model.get_xlim()
    left_chr, left_pos = get_chr_pos_from_x_axis_pos(xlim[0], self.chr_gap, self.chr_id2cumu_size, self.chr_id_ls)
    right_chr, right_pos = get_chr_pos_from_x_axis_pos(xlim[1], self.chr_gap, self.chr_id2cumu_size, self.chr_id_ls)

    # fake a snps_within_this_region for drawGeneModel()
    snps_within_this_region = PassingData(chr_pos_ls=[[left_chr, left_pos], [right_chr, right_pos]])
    base_y_value = 1
    gene_width = 0.8
    gene_position_cycle = 5
    # set drawGeneOnTheBoundary to False because later adding text to these
    # genes would corrupt the running program.
    DrawSNPRegion.drawGeneModel(self.axe_gene_model, snps_within_this_region, self.gene_annotation,
                                candidate_gene_set=None,
                                gene_width=gene_width, gene_position_cycle=gene_position_cycle,
                                base_y_value=base_y_value,
                                gene_box_text_gap=20, label_gene=0, rotate_xy=False,
                                chr_id2cumu_size=self.chr_id2cumu_size, chr_id2size=self.chr_id2size,
                                chr_gap=self.chr_gap,
                                artist_obj_id2artist_gene_id_ls=self.artist_obj_id2artist_gene_id_ls,
                                gene_id2artist_object_id=self.gene_id2artist_object_id,
                                drawGeneOnTheBoundary=False)
    self.axe_gene_model.set_ylim([base_y_value - gene_width, gene_position_cycle + gene_width * 2])
    self.canvas_matplotlib.draw()
def run(self):
    """
    Add gene symbols and descriptions to a TAIR gene GFF file.

    Steps:
        1. open a MySQL connection and build gene_symbol2gene_id_set via
           get_gene_symbol2gene_id_set() (tax id 3702).
        2. load the gene annotation through a DrawSNPRegion instance.
        3. call self.improveTAIRGeneGFF() with self.input_fname and
           self.output_fname.

    The MySQL connection is closed before returning, whether or not one of
    the steps raises.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    import MySQLdb
    from pymodule import get_gene_symbol2gene_id_set
    from variation.src.DrawSNPRegion import DrawSNPRegion

    # NOTE(review): the host is hard-coded here while DrawSNPRegion below is
    # given self.hostname -- confirm whether this is intentional.
    mysql_conn = MySQLdb.connect(db=self.dbname, host='banyan.usc.edu', user=self.db_user, passwd=self.db_passwd)
    try:
        mysql_curs = mysql_conn.cursor()
        gene_symbol2gene_id_set = get_gene_symbol2gene_id_set(
            mysql_curs, 3702, table='genome.gene_symbol2id', upper_case_gene_symbol=1)  # 3702 is At's tax id

        DrawSNPRegion_ins = DrawSNPRegion(db_user=self.db_user, db_passwd=self.db_passwd,
                                          hostname=self.hostname, database=self.dbname,
                                          input_fname='/tmp/dumb', output_dir='/tmp', debug=0)
        # input_fname and output_dir are just random stuff
        gene_annotation = DrawSNPRegion_ins.dealWithGeneAnnotation(
            self.gene_annotation_picklef, cls_with_db_args=DrawSNPRegion_ins)
        self.improveTAIRGeneGFF(self.input_fname, gene_symbol2gene_id_set, gene_annotation, self.output_fname)
    finally:
        # the connection used to be left open until interpreter exit
        mysql_conn.close()
def db_connect(self):
    """
    2010-1-15
        pass "cls_with_db_args=self" to DrawSNPRegion.dealWithGeneAnnotation()
    2009-12-09
        add db_user, db_passwd to MySQLdb.connect()
    2008-12-16
        add gene_annotation_picklef
    2008-02-01
        read the data in dialog_db_connect and establish the connections to two databases

    Reads hostname, database name, user and password from the dialog
    widgets and stores them on self, then sets
        self.mysql_conn / self.mysql_curs  (raw MySQLdb connection and cursor)
        self.db / self.session             (Stock_250kDB and its session)
    A connection failure is reported on stderr (with traceback) and is not
    re-raised. Afterwards self.gene_annotation is loaded if it is not
    available yet.
    """
    sys.stderr.write("Database Connecting ...")
    self.drivername = 'mysql'
    self.hostname = self.entry_mysql_hostname.get_text()
    self.dbname = self.entry_mysql_dbname.get_text()
    self.db_user = self.xml.get_widget("entry_db_user").get_text()
    self.db_passwd = self.xml.get_widget("entry_db_passwd").get_text()

    import MySQLdb
    try:
        self.mysql_conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user,
                                          passwd=self.db_passwd)
        self.mysql_curs = self.mysql_conn.cursor()
        self.db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                            password=self.db_passwd, hostname=self.hostname,
                                            database=self.dbname)
        self.db.setup(create_tables=False)
        self.session = self.db.session
    except Exception:
        # "except Exception" rather than a bare "except": a failed connection is
        # only reported, but KeyboardInterrupt/SystemExit must still propagate.
        sys.stderr.write('DB connection error: %s\n' % repr(sys.exc_info()))
        traceback.print_exc()

    if not self.gene_annotation:
        gene_annotation_picklef = self.entry_gene_annotation_picklef.get_text()
        self.gene_annotation = DrawSNPRegion.dealWithGeneAnnotation(gene_annotation_picklef,
                                                                    cls_with_db_args=self)

    # 2010-1-13 the postgresql connection is commented out
    # (2008-12-16 don't need postgres conn anymore):
    #hostname = self.entry_postgres_hostname.get_text()
    #dbname = self.entry_postgres_dbname.get_text()
    #schema = self.entry_postgres_schema.get_text()
    #from annot.bin.codense.common import db_connect
    #self.postgres_conn, self.postgres_curs = db_connect(hostname, dbname, schema)
    sys.stderr.write("Done.\n")
def run(self):
    """
    2009-6-10
        set Results.beta = getattr(data_obj, 'beta1', None)

    Connect to the Stock_250kDB database, begin a session transaction and
    call self.rm2result() for every ResultsMethod whose call_method_id is
    self.call_method_id and whose analysis_method_id is in
    self.analysis_method_id_ls. The session is committed only when
    self.commit is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
        schema=self.schema
    )
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    snp_info = DrawSNPRegion.getSNPInfo(db)

    results_methods = Stock_250kDB.ResultsMethod.query.filter_by(call_method_id=self.call_method_id)
    results_methods = results_methods.filter(
        Stock_250kDB.ResultsMethod.analysis_method_id.in_(self.analysis_method_id_ls)
    )
    for results_method in results_methods:
        self.rm2result(
            session,
            results_method,
            snp_info,
            max_rank=self.max_rank,
            commit=self.commit,
            results_directory=self.results_directory
        )

    if self.commit:
        session.commit()
def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set, gene_annotation, output_fname):
    """
    2009-2-5
        apply the improvement to any non-chromosome lines with 'ID' entry
        escape ';' by '%3B', which is regarded as a separator for every "name=value"
        escape ',' by '%2C', which is regarded as a separator for every "value"
        escape gene_symbol/Alias whose value matches individual chromosome (like gene 'CHR5' = 'Gene CHR5')
    2009-2-4
        if the last column has 'ID' in it, find corresponding gene id using its value and add gene_symbol and description

    Copy input_fname to output_fname row by row. For a row whose last
    column carries an 'ID=...;' entry and whose 3rd column is not
    "chromosome", the ID is mapped to a gene id through
    getGeneIDSetGivenAccVer(); if exactly one gene id is found and it has a
    model in gene_annotation.gene_id2model, "Alias=<gene symbol>;" and
    "description=<...>" are appended to the last column. A trailing ';' of
    the last column is removed.

    Arguments:
        input_fname: path of the input file (delimiter is detected by figureOutDelimiter()).
        gene_symbol2gene_id_set: lookup passed to getGeneIDSetGivenAccVer().
        gene_annotation: object with a gene_id2model dictionary.
        output_fname: path of the output file, written with '\\n' line terminators.

    Both files are closed before returning. Progress and linking failures
    are reported on stderr.
    """
    sys.stderr.write("Improving TAIR Gene GFF with symbols and descriptions ...\n")
    import re

    # Tried in this order; when several patterns match, the last one wins.
    id_patterns = (
        re.compile(r"ID=(\w+)\.(\d+);"),
        re.compile(r"ID=(\w+);"),
        re.compile(r"ID=(\w+)\.(\d+)-Protein;"),
    )
    p_chr_name = re.compile(r"CHR\d+$")  # to escape gene_symbol/Alias whose value matches individual chromosome

    def escape_gff_value(value):
        # ';' separates every "name=value", ',' separates every "value"
        return value.replace(";", "%3B").replace(",", "%2C")

    delimiter = figureOutDelimiter(input_fname)
    counter = 0
    success_counter = 0
    with open(input_fname) as inf:
        with open(output_fname, "w") as outf:
            reader = csv.reader(inf, delimiter=delimiter)
            # lineterminator is important. GFF3Loader would break down if it's dos terminator('\r\n').
            writer = csv.writer(outf, delimiter=delimiter, lineterminator="\n")
            for row in reader:
                if not row:
                    # blank line: nothing to improve, pass it through
                    writer.writerow(row)
                    continue
                last_col = row[-1]
                tair_id = None
                for id_pattern in id_patterns:
                    match = id_pattern.search(last_col)
                    if match:
                        tair_id = match.group(1)
                counter += 1

                if tair_id is not None and row[2] != "chromosome":
                    gene_id_set = getGeneIDSetGivenAccVer(tair_id, gene_symbol2gene_id_set)
                    gene_id = None
                    if gene_id_set is None:
                        sys.stderr.write(
                            "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                            % (last_col, tair_id)
                        )
                    elif len(gene_id_set) == 1:
                        gene_id = list(gene_id_set)[0]
                        success_counter += 1
                    elif len(gene_id_set) > 1:
                        sys.stderr.write("Too many gene_ids: %s, %s.\n" % (tair_id, gene_id_set))
                    else:
                        sys.stderr.write(
                            "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                            % (last_col, tair_id)
                        )

                    gene_model = None
                    if gene_id is not None:
                        gene_model = gene_annotation.gene_id2model.get(gene_id)
                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names,
                            gene_model,
                            gene_commentary,
                            cutoff_length=600,
                            replaceNoneElemWithEmptyStr=1,
                        )
                        local_gene_desc_names = [name.upper() for name in self.gene_desc_names]
                        description = ", ".join(
                            [": ".join(entry) for entry in zip(local_gene_desc_names, gene_desc_ls)]
                        )
                        description = escape_gff_value(description)
                        if not last_col.endswith(";"):  # no ; delimiter at the end, append one
                            last_col += ";"
                        gene_symbol = escape_gff_value(gene_model.gene_symbol)
                        if p_chr_name.match(gene_symbol):  # match the chromosome name, change
                            gene_symbol = "Gene %s" % gene_symbol
                        last_col += "Alias=%s;" % gene_symbol
                        last_col += "description=%s" % description
                        row[-1] = last_col

                # endswith() instead of last_col[-1]: the last column may be empty
                if last_col.endswith(";"):
                    last_col = last_col[:-1]
                    row[-1] = last_col
                if counter % 5000 == 0:
                    sys.stderr.write("%s%s\t%s" % ("\x08" * 80, success_counter, counter))
                writer.writerow(row)
    sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))
def run(self): """ 2008-12-08 if the plot under configuration is already in db, abort only if the program is gonna commit the database transaction. 2008-10-19 save figures in database if commit """ if self.debug: import pdb pdb.set_trace() db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema) db.setup(create_tables=False) session = db.session #session.begin() if self.results_type==1: ResultsClass = Stock_250kDB.ResultsMethod snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest) elif self.results_type==2: ResultsClass = Stock_250kDB.ResultsByGene else: sys.stderr.write("Invalid results type : %s.\n"%self.results_type) return None hist_type = self.getHistType(self.call_method_id, self.min_distance, self.get_closest, self.min_MAF, \ self.allow_two_sample_overlapping, self.results_type, self.null_distribution_type_id) candidate_gene_list = self.getGeneList(self.list_type_id) if len(candidate_gene_list)<self.min_sample_size: sys.stderr.write("Candidate gene list of %s too small: %s.\n"%(self.list_type_id, len(candidate_gene_list))) sys.exit(4) #candidate_gene_list = [] #2009-01-12 just to plot the histogram of pvalue candidate_gene_set = Set(candidate_gene_list) list_type = Stock_250kDB.GeneListType.get(self.list_type_id) if list_type is None: sys.exit(3) phenotype_id2results_id_ls = self.getResultsIDLs(db, ResultsClass, self.results_type, self.phenotype_id_ls, \ self.min_distance, self.get_closest, self.min_MAF, self.call_method_id) param_data = PassingData(results_directory=self.results_directory, candidate_gene_list=candidate_gene_list, \ min_MAF=self.min_MAF, allow_two_sample_overlapping=self.allow_two_sample_overlapping, need_the_value=1, \ do_log10_transformation=False) #need_the_value means to get the pvalue/score #force no log10 transformation. 
otherwise, transformation based on analysis_method if self.null_distribution_type_id==2 or self.null_distribution_type_id==3: #gw-looping or random gene list snp_info = DrawSNPRegion.getSNPInfo(db) candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(candidate_gene_set, snp_info.chr_pos_ls, snps_context_wrapper) no_of_snps = len(snp_info.chr_pos_ls) no_of_permutations = no_of_snps/len(candidate_gene_snp_index_ls) + 1 param_data.chr_pos2index = snp_info.chr_pos2index #pass to getGenomeWideResultFromFile if self.null_distribution_type_id==2: non_candidate_gene_snp_index_ls = self.get_non_candidate_gene_snp_index_ls_by_permutation(candidate_gene_snp_index_ls, no_of_snps, no_of_permutations) elif self.null_distribution_type_id == 3: gene_id_ls = get_total_gene_ls(db.metadata.bind) no_of_candidate_genes = len(candidate_gene_set) non_candidate_gene_snp_index_ls = numpy.zeros(0, numpy.int) while len(non_candidate_gene_snp_index_ls)<no_of_snps: non_candidate_gene_set = Set(random.sample(gene_id_ls, no_of_candidate_genes)) _non_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(non_candidate_gene_set, snp_info.chr_pos_ls, snps_context_wrapper) non_candidate_gene_snp_index_ls = numpy.hstack((non_candidate_gene_snp_index_ls, _non_candidate_gene_snp_index_ls)) for phenotype_id, results_id_ls in phenotype_id2results_id_ls.iteritems(): if hist_type.id: #hist_type already in database rows = Stock_250kDB.ScoreRankHistogram.query.filter_by(phenotype_method_id=phenotype_id).\ filter_by(list_type_id=self.list_type_id).filter_by(hist_type_id=hist_type.id) if rows.count()>0 and self.commit: #2008-12-08 only skip if the database transaction is gonna commit. row = rows.first() sys.stderr.write("Histogram already in database. 
id=%s, phenotype_id=%s, list_type_id=%s, hist_type_id=%s.\n"%\ (row.id, row.phenotype_method_id, row.list_type_id, row.hist_type_id)) continue phenotype_method = Stock_250kDB.PhenotypeMethod.get(phenotype_id) if not phenotype_method: continue score_rank_data_ls = [] sys.stderr.write("Checking phenotype %s (%s) on list_type %s (%s) ...\n"%\ (phenotype_method.id, phenotype_method.short_name, list_type.id, list_type.short_name)) for results_id in results_id_ls: try: rm = ResultsClass.get(results_id) score_rank_data = None if self.null_distribution_type_id==1: if self.results_type==1: permData = self.prepareDataForPermutationRankTest(rm, snps_context_wrapper, param_data) if not permData: continue score_rank_data = PassingData(candidate_score_ls=permData.candidate_gene_snp_value_ls, \ candidate_rank_ls=permData.candidate_gene_snp_rank_ls,\ non_candidate_score_ls=permData.non_candidate_gene_snp_value_ls, non_candidate_rank_ls=permData.non_candidate_gene_snp_rank_ls,\ analysis_method=rm.analysis_method) del permData elif self.results_type==2: score_rank_data = self.getScoreRankFromRBG(rm, candidate_gene_set, self.results_directory) elif self.null_distribution_type_id==2 or self.null_distribution_type_id==3: genome_wide_result = self.getResultMethodContent(rm, param_data.results_directory, param_data.min_MAF, pdata=param_data) if not genome_wide_result: continue score_rank_data = self.getScoreRankFromPermIndexLs(genome_wide_result, candidate_gene_snp_index_ls, non_candidate_gene_snp_index_ls) if score_rank_data: score_rank_data.analysis_method = rm.analysis_method if score_rank_data: score_rank_data_ls.append(score_rank_data) except: sys.stderr.write("Exception happened for results_id=%s, phenotype_id=%s.\n"%(results_id, phenotype_id)) traceback.print_exc() sys.stderr.write('%s.\n'%repr(sys.exc_info())) continue if score_rank_data_ls: score_png_data, score_svg_data = self.plotHistForOnePhenotype(phenotype_method, list_type, score_rank_data_ls, self.output_dir, 
data_type='score', commit=self.commit) rank_png_data, rank_svg_data = self.plotHistForOnePhenotype(phenotype_method, list_type, score_rank_data_ls, self.output_dir, data_type='rank', commit=self.commit) if self.commit: score_rank_hist = Stock_250kDB.ScoreRankHistogram(phenotype_method_id=phenotype_id, list_type_id=list_type.id) score_rank_hist.hist_type = hist_type score_rank_hist.score_hist = score_png_data.getvalue() score_rank_hist.score_hist_svg = score_svg_data.getvalue() score_rank_hist.rank_hist = rank_png_data.getvalue() score_rank_hist.rank_hist_svg = rank_svg_data.getvalue() session.save(score_rank_hist) session.flush() del score_png_data, score_svg_data, rank_png_data, rank_svg_data """
def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set, gene_annotation, output_fname):
    """
    2009-2-5
        apply the improvement to any non-chromosome lines with 'ID' entry
        escape ';' by '%3B', which is regarded as a separator for every "name=value"
        escape ',' by '%2C', which is regarded as a separator for every "value"
        escape gene_symbol/Alias whose value matches individual chromosome (like gene 'CHR5' = 'Gene CHR5')
    2009-2-4
        if the last column has 'ID' in it, find corresponding gene id using its value and add gene_symbol and description

    Copy input_fname to output_fname row by row. For a row whose last
    column carries an 'ID=...;' entry and whose 3rd column is not
    'chromosome', the ID is mapped to a gene id through
    getGeneIDSetGivenAccVer(); if exactly one gene id is found and it has a
    model in gene_annotation.gene_id2model, 'Alias=<gene symbol>;' and
    'description=<...>' are appended to the last column. A trailing ';' of
    the last column is removed.

    Arguments:
        input_fname: path of the input file (delimiter is detected by figureOutDelimiter()).
        gene_symbol2gene_id_set: lookup passed to getGeneIDSetGivenAccVer().
        gene_annotation: object with a gene_id2model dictionary.
        output_fname: path of the output file, written with '\\n' line terminators.

    Both files are closed before returning. Progress and linking failures
    are reported on stderr.
    """
    sys.stderr.write("Improving TAIR Gene GFF with symbols and descriptions ...\n")
    import re

    # Tried in this order; when several patterns match, the last one wins.
    id_patterns = (
        re.compile(r'ID=(\w+)\.(\d+);'),
        re.compile(r'ID=(\w+);'),
        re.compile(r'ID=(\w+)\.(\d+)-Protein;'),
    )
    p_chr_name = re.compile(r'CHR\d+$')  # to escape gene_symbol/Alias whose value matches individual chromosome

    def escape_gff_value(value):
        # ';' separates every "name=value", ',' separates every "value"
        return value.replace(';', '%3B').replace(',', '%2C')

    delimiter = figureOutDelimiter(input_fname)
    counter = 0
    success_counter = 0
    with open(input_fname) as inf:
        with open(output_fname, 'w') as outf:
            reader = csv.reader(inf, delimiter=delimiter)
            # lineterminator is important. GFF3Loader would break down if it's dos terminator('\r\n').
            writer = csv.writer(outf, delimiter=delimiter, lineterminator='\n')
            for row in reader:
                if not row:
                    # blank line: nothing to improve, pass it through
                    writer.writerow(row)
                    continue
                last_col = row[-1]
                tair_id = None
                for id_pattern in id_patterns:
                    match = id_pattern.search(last_col)
                    if match:
                        tair_id = match.group(1)
                counter += 1

                if tair_id is not None and row[2] != 'chromosome':
                    gene_id_set = getGeneIDSetGivenAccVer(tair_id, gene_symbol2gene_id_set)
                    gene_id = None
                    if gene_id_set is None:
                        sys.stderr.write(
                            "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                            % (last_col, tair_id))
                    elif len(gene_id_set) == 1:
                        gene_id = list(gene_id_set)[0]
                        success_counter += 1
                    elif len(gene_id_set) > 1:
                        sys.stderr.write("Too many gene_ids: %s, %s.\n" % (tair_id, gene_id_set))
                    else:
                        sys.stderr.write(
                            "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                            % (last_col, tair_id))

                    gene_model = None
                    if gene_id is not None:
                        gene_model = gene_annotation.gene_id2model.get(gene_id)
                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(self.gene_desc_names, gene_model,
                                                                      gene_commentary, cutoff_length=600,
                                                                      replaceNoneElemWithEmptyStr=1)
                        local_gene_desc_names = [name.upper() for name in self.gene_desc_names]
                        description = ', '.join([
                            ': '.join(entry) for entry in zip(local_gene_desc_names, gene_desc_ls)
                        ])
                        description = escape_gff_value(description)
                        if not last_col.endswith(';'):  # no ; delimiter at the end, append one
                            last_col += ';'
                        gene_symbol = escape_gff_value(gene_model.gene_symbol)
                        if p_chr_name.match(gene_symbol):  # match the chromosome name, change
                            gene_symbol = 'Gene %s' % gene_symbol
                        last_col += 'Alias=%s;' % gene_symbol
                        last_col += 'description=%s' % description
                        row[-1] = last_col

                # endswith() instead of last_col[-1]: the last column may be empty
                if last_col.endswith(';'):
                    last_col = last_col[:-1]
                    row[-1] = last_col
                if counter % 5000 == 0:
                    sys.stderr.write("%s%s\t%s" % ('\x08' * 80, success_counter, counter))
                writer.writerow(row)
    sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))