def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
    """
    Read gene names from the first column of a file and store them as GeneList rows.

    Every non-blank line is split with the delimiter reported by
    figureOutDelimiter() and its stripped first column is resolved to a gene id:
        - if self.all_number_p matches the name, the matched text is converted
          with int() and used directly as the gene id (2009-10-18);
        - otherwise getGeneIDSetGivenAccVer() looks the name up in
          gene_symbol2gene_id_set (2009-2-4).
    Only names that resolve to exactly one gene id are kept; all other outcomes
    are reported on stderr. A gene id seen more than once keeps the first name
    it was seen with (gene_id2original_name, 2008-07-15).

    The GeneListType is looked up by list_type_name when that is given (and
    created if the search finds nothing); otherwise it is fetched by
    list_type_id (2008-07-15). Genes already attached to that list type are
    reported and skipped (2008-12-11).

    This method does not commit. session.flush() is only called right after a
    new GeneListType is created.

    Arguments:
        input_fname: path of the text file to read; also stored in
            GeneListType.original_filename.
        list_type_id: id passed to GeneListType.get(); used only when
            list_type_name is empty.
        list_type_name: short_name of the GeneListType; takes precedence over
            list_type_id.
        gene_symbol2gene_id_set: lookup structure handed unchanged to
            getGeneIDSetGivenAccVer().
        db: object exposing the database session as db.session.
        skip_1st_line: if true, the first line of the file is discarded
            (2008-01-08).

    History kept from the original notes:
        2008-11-20 use figureOutDelimiter() to get the delimiter automatically.
        2008-01-08 stop using csv.reader, read the raw file handle instead;
            figureOutDelimiter() returns None for a single-column file.
    """
    import sys

    session = db.session
    delimiter = figureOutDelimiter(input_fname)  # 2008-11-20
    counter = 0  # lines examined (blank lines excluded)
    success_counter = 0  # lines that resolved to exactly one gene id
    gene_id2original_name = {}  # to avoid redundancy in gene list

    # 'with' guarantees the handle is closed even if a line fails to parse.
    with open(input_fname) as inf:
        if skip_1st_line:
            # The default keeps an empty file from raising StopIteration.
            next(inf, None)
        for line in inf:
            # Skip blank lines. Whitespace-only and '\r\n' lines are included here
            # because with delimiter None they split into an empty list.
            if not line.strip():
                continue
            row = line.split(delimiter)
            original_name = row[0].strip()  # 2008-12-11 remove spaces/tabs at both ends
            all_number_match = self.all_number_p.search(original_name)
            if all_number_match:
                # 2009-10-18 the name is numeric, so it is taken as a legitimate gene id.
                gene_id_set = set([int(all_number_match.group(0))])
            else:
                gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)

            if gene_id_set is None:
                sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n" % (original_name,))
            elif len(gene_id_set) == 0:
                sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n" % (original_name,))
            elif len(gene_id_set) > 1:
                sys.stderr.write("Too many gene_ids for %s: %s.\n" % (original_name, gene_id_set))
            else:
                # Exactly one candidate: keep the first name seen for this gene id.
                gene_id = list(gene_id_set)[0]
                if gene_id not in gene_id2original_name:
                    gene_id2original_name[gene_id] = original_name
                # Repeats of an already-seen gene id still count as linked.
                success_counter += 1
            counter += 1

    if list_type_name:  # if the short name is given, forget about list_type_id
        glt = GeneListType.query.filter_by(short_name=list_type_name).first()  # try the db first
        if not glt:
            glt = GeneListType(short_name=list_type_name)
            session.save(glt)
            session.flush()
    else:  # use the list_type_id to get it
        glt = GeneListType.get(list_type_id)
    glt.original_filename = input_fname  # save the filename
    session.save_or_update(glt)

    # items() behaves the same as iteritems() here and also exists on Python 3 dicts.
    for gene_id, original_name in gene_id2original_name.items():
        if glt.id:
            # 2008-12-11 GeneListType is already in db; skip genes it already has.
            rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
            if rows.count() > 0:
                sys.stderr.write("Gene: %s (%s) already with list type %s.\n" % (gene_id, original_name, glt.short_name))
                continue
        gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
        session.save(gl)
    sys.stderr.write("%s/%s linked successfully.\n" % (success_counter, counter))
def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
    """
    Read gene names from the first column of a file and store them as GeneList rows.

    Every non-blank line is split with the delimiter reported by
    figureOutDelimiter() and its stripped first column is resolved to a gene id:
        - if self.all_number_p matches the name, the matched text is converted
          with int() and used directly as the gene id (2009-10-18);
        - otherwise getGeneIDSetGivenAccVer() looks the name up in
          gene_symbol2gene_id_set (2009-2-4).
    Only names that resolve to exactly one gene id are kept; all other outcomes
    are reported on stderr. A gene id seen more than once keeps the first name
    it was seen with (gene_id2original_name, 2008-07-15).

    The GeneListType is looked up by list_type_name when that is given (and
    created if the search finds nothing); otherwise it is fetched by
    list_type_id (2008-07-15). Genes already attached to that list type are
    reported and skipped (2008-12-11).

    This method does not commit. session.flush() is only called right after a
    new GeneListType is created.

    Arguments:
        input_fname: path of the text file to read; also stored in
            GeneListType.original_filename.
        list_type_id: id passed to GeneListType.get(); used only when
            list_type_name is empty.
        list_type_name: short_name of the GeneListType; takes precedence over
            list_type_id.
        gene_symbol2gene_id_set: lookup structure handed unchanged to
            getGeneIDSetGivenAccVer().
        db: object exposing the database session as db.session.
        skip_1st_line: if true, the first line of the file is discarded
            (2008-01-08).

    History kept from the original notes:
        2008-11-20 use figureOutDelimiter() to get the delimiter automatically.
        2008-01-08 stop using csv.reader, read the raw file handle instead;
            figureOutDelimiter() returns None for a single-column file.
    """
    import sys

    session = db.session
    delimiter = figureOutDelimiter(input_fname)  # 2008-11-20
    counter = 0  # lines examined (blank lines excluded)
    success_counter = 0  # lines that resolved to exactly one gene id
    gene_id2original_name = {}  # to avoid redundancy in gene list

    # 'with' guarantees the handle is closed even if a line fails to parse.
    with open(input_fname) as inf:
        if skip_1st_line:
            # The default keeps an empty file from raising StopIteration.
            next(inf, None)
        for line in inf:
            # Skip blank lines. Whitespace-only and '\r\n' lines are included here
            # because with delimiter None they split into an empty list.
            if not line.strip():
                continue
            row = line.split(delimiter)
            original_name = row[0].strip()  # 2008-12-11 remove spaces/tabs at both ends
            all_number_match = self.all_number_p.search(original_name)
            if all_number_match:
                # 2009-10-18 the name is numeric, so it is taken as a legitimate gene id.
                gene_id_set = set([int(all_number_match.group(0))])
            else:
                gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)

            if gene_id_set is None:
                sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n" % (original_name,))
            elif len(gene_id_set) == 0:
                sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n" % (original_name,))
            elif len(gene_id_set) > 1:
                sys.stderr.write("Too many gene_ids for %s: %s.\n" % (original_name, gene_id_set))
            else:
                # Exactly one candidate: keep the first name seen for this gene id.
                gene_id = list(gene_id_set)[0]
                if gene_id not in gene_id2original_name:
                    gene_id2original_name[gene_id] = original_name
                # Repeats of an already-seen gene id still count as linked.
                success_counter += 1
            counter += 1

    if list_type_name:  # if the short name is given, forget about list_type_id
        glt = GeneListType.query.filter_by(short_name=list_type_name).first()  # try the db first
        if not glt:
            glt = GeneListType(short_name=list_type_name)
            session.save(glt)
            session.flush()
    else:  # use the list_type_id to get it
        glt = GeneListType.get(list_type_id)
    glt.original_filename = input_fname  # save the filename
    session.save_or_update(glt)

    # items() behaves the same as iteritems() here and also exists on Python 3 dicts.
    for gene_id, original_name in gene_id2original_name.items():
        if glt.id:
            # 2008-12-11 GeneListType is already in db; skip genes it already has.
            rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
            if rows.count() > 0:
                sys.stderr.write("Gene: %s (%s) already with list type %s.\n" % (gene_id, original_name, glt.short_name))
                continue
        gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
        session.save(gl)
    sys.stderr.write("%s/%s linked successfully.\n" % (success_counter, counter))
def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set, gene_annotation, output_fname):
    """
    Copy a TAIR gene GFF file, adding Alias and description attributes to its rows.

    For every row whose last column carries an 'ID=...;' entry and whose third
    column is not 'chromosome' (2009-2-5), the ID value is resolved to a gene id
    through getGeneIDSetGivenAccVer(). When exactly one gene id is found and
    gene_annotation.gene_id2model has a model for it, 'Alias=<gene_symbol>;' and
    'description=<...>' are appended to the last column (2009-2-4). The
    description pairs the upper-cased self.gene_desc_names with the values
    returned by DrawSNPRegion.returnGeneDescLs().

    Escaping applied to both the symbol and the description (2009-2-5):
        ';' -> '%3B'  (';' separates every "name=value")
        ',' -> '%2C'  (',' separates every "value")
    A gene symbol that matches a chromosome name (CHR followed by digits) is
    written as 'Gene <symbol>', e.g. 'CHR5' becomes 'Gene CHR5'.

    A single trailing ';' is removed from the last column of every row, and
    all rows (changed or not) are written to output_fname.

    Arguments:
        input_fname: path of the GFF file to read.
        gene_symbol2gene_id_set: lookup structure handed unchanged to
            getGeneIDSetGivenAccVer().
        gene_annotation: object whose gene_id2model maps gene id to gene model.
        output_fname: path of the GFF file to write.
    """
    import re

    sys.stderr.write("Improving TAIR Gene GFF with symbols and descriptions ...\n")

    # Searched in this order; when several match, the last match supplies the id.
    id_patterns = (
        re.compile(r"ID=(\w+)\.(\d+);"),
        re.compile(r"ID=(\w+);"),
        re.compile(r"ID=(\w+)\.(\d+)-Protein;"),
    )
    # To escape a gene_symbol/Alias whose value matches an individual chromosome.
    p_chr_name = re.compile(r"CHR\d+$")

    def escape_gff_value(value):
        # ';' and ',' are GFF attribute separators, so they are percent-encoded.
        return value.replace(";", "%3B").replace(",", "%2C")

    # Loop-invariant, so computed once; str.upper() also works on Python 3.
    upper_desc_names = [name.upper() for name in self.gene_desc_names]

    delimiter = figureOutDelimiter(input_fname)
    counter = 0  # rows examined
    success_counter = 0  # rows whose ID resolved to exactly one gene id

    # Nested 'with' blocks close (and flush) both files even when a row fails.
    with open(input_fname) as inf:
        with open(output_fname, "w") as outf:
            reader = csv.reader(inf, delimiter=delimiter)
            # lineterminator is important. GFF3Loader would break down if it's
            # the dos terminator ('\r\n').
            writer = csv.writer(outf, delimiter=delimiter, lineterminator="\n")
            for row in reader:
                if not row:
                    # csv yields [] for a blank line; pass it through untouched
                    # instead of failing on row[-1].
                    writer.writerow(row)
                    continue

                last_col = row[-1]
                tair_id = None
                for pattern in id_patterns:
                    id_match = pattern.search(last_col)
                    if id_match:
                        tair_id = id_match.group(1)
                counter += 1

                if tair_id is not None and row[2] != "chromosome":
                    gene_id_set = getGeneIDSetGivenAccVer(tair_id, gene_symbol2gene_id_set)
                    gene_id = None
                    if gene_id_set is None:
                        sys.stderr.write(
                            "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                            % (last_col, tair_id)
                        )
                    elif len(gene_id_set) == 0:
                        sys.stderr.write(
                            "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                            % (last_col, tair_id)
                        )
                    elif len(gene_id_set) > 1:
                        sys.stderr.write("Too many gene_ids: %s, %s.\n" % (tair_id, gene_id_set))
                    else:
                        gene_id = list(gene_id_set)[0]
                        success_counter += 1

                    gene_model = None
                    if gene_id is not None:
                        gene_model = gene_annotation.gene_id2model.get(gene_id)
                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names,
                            gene_model,
                            gene_commentary,
                            cutoff_length=600,
                            replaceNoneElemWithEmptyStr=1,
                        )
                        description = ", ".join(
                            [": ".join(entry) for entry in zip(upper_desc_names, gene_desc_ls)]
                        )
                        description = escape_gff_value(description)

                        if not last_col.endswith(";"):
                            # no ';' delimiter at the end, append one
                            last_col += ";"
                        gene_symbol = escape_gff_value(gene_model.gene_symbol)
                        if p_chr_name.match(gene_symbol):
                            # matches a chromosome name, so change it
                            gene_symbol = "Gene %s" % gene_symbol
                        last_col += "Alias=%s;" % gene_symbol
                        last_col += "description=%s" % description
                        row[-1] = last_col

                # endswith() is safe on an empty last column, unlike last_col[-1].
                if last_col.endswith(";"):
                    last_col = last_col[:-1]
                    row[-1] = last_col
                if counter % 5000 == 0:
                    # Backspaces rewind the cursor so the progress line overwrites itself.
                    sys.stderr.write("%s%s\t%s" % ("\x08" * 80, success_counter, counter))
                writer.writerow(row)
    sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))
def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set, gene_annotation, output_fname):
    """
    Copy a TAIR gene GFF file, adding Alias and description attributes to its rows.

    For every row whose last column carries an 'ID=...;' entry and whose third
    column is not 'chromosome' (2009-2-5), the ID value is resolved to a gene id
    through getGeneIDSetGivenAccVer(). When exactly one gene id is found and
    gene_annotation.gene_id2model has a model for it, 'Alias=<gene_symbol>;' and
    'description=<...>' are appended to the last column (2009-2-4). The
    description pairs the upper-cased self.gene_desc_names with the values
    returned by DrawSNPRegion.returnGeneDescLs().

    Escaping applied to both the symbol and the description (2009-2-5):
        ';' -> '%3B'  (';' separates every "name=value")
        ',' -> '%2C'  (',' separates every "value")
    A gene symbol that matches a chromosome name (CHR followed by digits) is
    written as 'Gene <symbol>', e.g. 'CHR5' becomes 'Gene CHR5'.

    A single trailing ';' is removed from the last column of every row, and
    all rows (changed or not) are written to output_fname.

    Arguments:
        input_fname: path of the GFF file to read.
        gene_symbol2gene_id_set: lookup structure handed unchanged to
            getGeneIDSetGivenAccVer().
        gene_annotation: object whose gene_id2model maps gene id to gene model.
        output_fname: path of the GFF file to write.
    """
    import re

    sys.stderr.write("Improving TAIR Gene GFF with symbols and descriptions ...\n")

    # Searched in this order; when several match, the last match supplies the id.
    id_patterns = (
        re.compile(r'ID=(\w+)\.(\d+);'),
        re.compile(r'ID=(\w+);'),
        re.compile(r'ID=(\w+)\.(\d+)-Protein;'),
    )
    # To escape a gene_symbol/Alias whose value matches an individual chromosome.
    p_chr_name = re.compile(r'CHR\d+$')

    def escape_gff_value(value):
        # ';' and ',' are GFF attribute separators, so they are percent-encoded.
        return value.replace(';', '%3B').replace(',', '%2C')

    # Loop-invariant, so computed once; str.upper() also works on Python 3.
    upper_desc_names = [name.upper() for name in self.gene_desc_names]

    delimiter = figureOutDelimiter(input_fname)
    counter = 0  # rows examined
    success_counter = 0  # rows whose ID resolved to exactly one gene id

    # Nested 'with' blocks close (and flush) both files even when a row fails.
    with open(input_fname) as inf:
        with open(output_fname, 'w') as outf:
            reader = csv.reader(inf, delimiter=delimiter)
            # lineterminator is important. GFF3Loader would break down if it's
            # the dos terminator ('\r\n').
            writer = csv.writer(outf, delimiter=delimiter, lineterminator='\n')
            for row in reader:
                if not row:
                    # csv yields [] for a blank line; pass it through untouched
                    # instead of failing on row[-1].
                    writer.writerow(row)
                    continue

                last_col = row[-1]
                tair_id = None
                for pattern in id_patterns:
                    id_match = pattern.search(last_col)
                    if id_match:
                        tair_id = id_match.group(1)
                counter += 1

                if tair_id is not None and row[2] != 'chromosome':
                    gene_id_set = getGeneIDSetGivenAccVer(tair_id, gene_symbol2gene_id_set)
                    gene_id = None
                    if gene_id_set is None:
                        sys.stderr.write(
                            "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                            % (last_col, tair_id))
                    elif len(gene_id_set) == 0:
                        sys.stderr.write(
                            "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                            % (last_col, tair_id))
                    elif len(gene_id_set) > 1:
                        sys.stderr.write("Too many gene_ids: %s, %s.\n" % (tair_id, gene_id_set))
                    else:
                        gene_id = list(gene_id_set)[0]
                        success_counter += 1

                    gene_model = None
                    if gene_id is not None:
                        gene_model = gene_annotation.gene_id2model.get(gene_id)
                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names, gene_model, gene_commentary,
                            cutoff_length=600, replaceNoneElemWithEmptyStr=1)
                        description = ', '.join(
                            [': '.join(entry) for entry in zip(upper_desc_names, gene_desc_ls)])
                        description = escape_gff_value(description)

                        if not last_col.endswith(';'):
                            # no ';' delimiter at the end, append one
                            last_col += ';'
                        gene_symbol = escape_gff_value(gene_model.gene_symbol)
                        if p_chr_name.match(gene_symbol):
                            # matches a chromosome name, so change it
                            gene_symbol = 'Gene %s' % gene_symbol
                        last_col += 'Alias=%s;' % gene_symbol
                        last_col += 'description=%s' % description
                        row[-1] = last_col

                # endswith() is safe on an empty last column, unlike last_col[-1].
                if last_col.endswith(';'):
                    last_col = last_col[:-1]
                    row[-1] = last_col
                if counter % 5000 == 0:
                    # Backspaces rewind the cursor so the progress line overwrites itself.
                    sys.stderr.write("%s%s\t%s" % ('\x08' * 80, success_counter, counter))
                writer.writerow(row)
    sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))