Esempio n. 1
0
	def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
		"""
		Read gene names/IDs from the first column of input_fname and save them as GeneList rows.

		Every non-blank line is split with the delimiter returned by figureOutDelimiter()
		and its stripped first column is resolved to a gene ID:
			- if self.all_number_p.search() finds a match, int() of the matched text is the gene ID;
			- otherwise getGeneIDSetGivenAccVer() is asked for the set of candidate gene IDs.
		Only names that resolve to exactly one gene ID are kept (first name seen per gene ID wins).
		Unresolved and ambiguous names are reported on stderr.

		The GeneListType is found by short_name (created and flushed if missing) when
		list_type_name is given, otherwise fetched with GeneListType.get(list_type_id).
		Genes already attached to that list type are reported on stderr and skipped.
		This method does not call commit on the session.

		Arguments:
			input_fname: path of the text file to read; also stored as the type's original_filename.
			list_type_id: id of an existing GeneListType; only used when list_type_name is empty.
			list_type_name: short_name of the GeneListType; takes precedence over list_type_id.
			gene_symbol2gene_id_set: lookup structure handed to getGeneIDSetGivenAccVer().
			db: object whose .session attribute is used to save the rows.
			skip_1st_line: if true, the first line of the file is discarded.

		History:
		2009-10-18
			If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
		2009-2-4
			use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
		2008-01-08
			add option skip_1st_line
			stop using csv.reader, use raw file handler instead
			figureOutDelimiter() is modified not to use csv.Sniffer() by default. it'll return delimiter None if the file is single-column.
		2008-12-11
			more filtering:
				1. strip the original_name
				2. pick alphanumeric characters out of original_name
			if GeneListType is already in db. check if GeneList has this gene already or not.
		2008-11-20
			use figureOutDelimiter() to get delimiter automatically
		2008-07-15
			if the list_type_name is given, forget about list_type_id. program will first search db for the given list_type_name, if search failed, create a new entry.
		2008-07-15
			use gene_id2original_name to avoid redundancy in gene list
		"""
		import sys
		session = db.session
		delimiter = figureOutDelimiter(input_fname)
		counter = 0
		success_counter = 0
		gene_id2original_name = {}	#to avoid redundancy in gene list
		with open(input_fname) as inf:	#guarantees the file is closed, even on error
			if skip_1st_line:
				next(inf, None)	#skips the 1st line; the default avoids StopIteration on an empty file
			for line in inf:
				if not line.strip():	#skip empty and whitespace-only lines (also covers '\r\n')
					continue
				row = line.split(delimiter)
				original_name = row[0].strip()	#remove spaces/tabs in the beginning/end
				number_match = self.all_number_p.search(original_name)
				if number_match:	#original_name is full of numbers. a legitimate Gene ID.
					gene_id_set = set([int(number_match.group(0))])
				else:
					gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)

				if gene_id_set is None:
					sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"%(original_name))
				elif len(gene_id_set)==0:
					sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n"%(original_name))
				elif len(gene_id_set)==1:
					gene_id = list(gene_id_set)[0]
					if gene_id not in gene_id2original_name:	#keep the first name seen for this gene
						gene_id2original_name[gene_id] = original_name
					success_counter += 1
				else:
					sys.stderr.write("Too many gene_ids for %s: %s.\n"%(original_name, gene_id_set))
				counter += 1

		if list_type_name:	#if the short name is given, forget about list_type_id
			glt = GeneListType.query.filter_by(short_name=list_type_name).first()	#try search the db first.
			if not glt:
				glt = GeneListType(short_name=list_type_name)
				session.save(glt)
				session.flush()
		else:	#use the list_type_id to get it
			glt = GeneListType.get(list_type_id)
		glt.original_filename = input_fname	#save the filename
		session.save_or_update(glt)

		for gene_id, original_name in gene_id2original_name.items():
			if glt.id:	#GeneListType is already in db. check if GeneList has this gene already or not.
				rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
				if rows.count()>0:
					sys.stderr.write("Gene: %s (%s) already with list type %s.\n"%(gene_id, original_name, glt.short_name))
					continue
			gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
			session.save(gl)
		sys.stderr.write("%s/%s linked successfully.\n"%(success_counter, counter))
Esempio n. 2
0
    def putGeneListIntoDb(self,
                          input_fname,
                          list_type_id,
                          list_type_name,
                          gene_symbol2gene_id_set,
                          db,
                          skip_1st_line=False):
        """
        Read gene names/IDs from the first column of input_fname and save them as GeneList rows.

        Every non-blank line is split with the delimiter returned by
        figureOutDelimiter() and its stripped first column is resolved to a
        gene ID:
            - if self.all_number_p.search() finds a match, int() of the
              matched text is the gene ID;
            - otherwise getGeneIDSetGivenAccVer() is asked for the set of
              candidate gene IDs.
        Only names that resolve to exactly one gene ID are kept (first name
        seen per gene ID wins). Unresolved and ambiguous names are reported
        on stderr.

        The GeneListType is found by short_name (created and flushed if
        missing) when list_type_name is given, otherwise fetched with
        GeneListType.get(list_type_id). Genes already attached to that list
        type are reported on stderr and skipped. This method does not call
        commit on the session.

        Arguments:
            input_fname: path of the text file to read; also stored as the
                type's original_filename.
            list_type_id: id of an existing GeneListType; only used when
                list_type_name is empty.
            list_type_name: short_name of the GeneListType; takes precedence
                over list_type_id.
            gene_symbol2gene_id_set: lookup structure handed to
                getGeneIDSetGivenAccVer().
            db: object whose .session attribute is used to save the rows.
            skip_1st_line: if true, the first line of the file is discarded.

        History:
        2009-10-18
            If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
        2009-2-4
            use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
        2008-01-08
            add option skip_1st_line
            stop using csv.reader, use raw file handler instead
            figureOutDelimiter() is modified not to use csv.Sniffer() by default. it'll return delimiter None if the file is single-column.
        2008-12-11
            more filtering:
                1. strip the original_name
                2. pick alphanumeric characters out of original_name
            if GeneListType is already in db. check if GeneList has this gene already or not.
        2008-11-20
            use figureOutDelimiter() to get delimiter automatically
        2008-07-15
            if the list_type_name is given, forget about list_type_id. program will first search db for the given list_type_name, if search failed, create a new entry.
        2008-07-15
            use gene_id2original_name to avoid redundancy in gene list
        """
        import sys
        session = db.session
        delimiter = figureOutDelimiter(input_fname)
        counter = 0
        success_counter = 0
        gene_id2original_name = {}  # to avoid redundancy in gene list
        # The context manager guarantees the file is closed, even on error.
        with open(input_fname) as inf:
            if skip_1st_line:
                # The default avoids StopIteration on an empty file.
                next(inf, None)
            for line in inf:
                # Skip empty and whitespace-only lines (also covers '\r\n');
                # with a None delimiter such a line would split to [].
                if not line.strip():
                    continue
                row = line.split(delimiter)
                original_name = row[0].strip()
                number_match = self.all_number_p.search(original_name)
                if number_match:
                    # original_name is full of numbers: a legitimate Gene ID.
                    gene_id_set = set([int(number_match.group(0))])
                else:
                    gene_id_set = getGeneIDSetGivenAccVer(
                        original_name, gene_symbol2gene_id_set)

                if gene_id_set is None:
                    sys.stderr.write(
                        "Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"
                        % (original_name))
                elif len(gene_id_set) == 0:
                    sys.stderr.write(
                        "Linking to gene id failed for %s. gene_id_set is empty.\n"
                        % (original_name))
                elif len(gene_id_set) == 1:
                    gene_id = list(gene_id_set)[0]
                    # Keep the first name seen for this gene.
                    if gene_id not in gene_id2original_name:
                        gene_id2original_name[gene_id] = original_name
                    success_counter += 1
                else:
                    sys.stderr.write("Too many gene_ids for %s: %s.\n" %
                                     (original_name, gene_id_set))
                counter += 1

        if list_type_name:  # if the short name is given, forget about list_type_id
            glt = GeneListType.query.filter_by(
                short_name=list_type_name).first()  # try search the db first.
            if not glt:
                glt = GeneListType(short_name=list_type_name)
                session.save(glt)
                session.flush()
        else:  # use the list_type_id to get it
            glt = GeneListType.get(list_type_id)
        glt.original_filename = input_fname  # save the filename
        session.save_or_update(glt)

        for gene_id, original_name in gene_id2original_name.items():
            if glt.id:
                # GeneListType is already in db. Check if GeneList has this
                # gene already or not.
                rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(
                    list_type_id=glt.id)
                if rows.count() > 0:
                    sys.stderr.write(
                        "Gene: %s (%s) already with list type %s.\n" %
                        (gene_id, original_name, glt.short_name))
                    continue
            gl = GeneList(gene_id=gene_id,
                          list_type=glt,
                          original_name=original_name)
            session.save(gl)
        sys.stderr.write("%s/%s linked successfully.\n" %
                         (success_counter, counter))
Esempio n. 3
0
    def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set, gene_annotation, output_fname):
        """
        Copy the TAIR gene GFF file input_fname to output_fname, adding Alias and description attributes.

        For every row whose last column contains an "ID=...;" entry and whose
        third column is not "chromosome", the ID is resolved through
        getGeneIDSetGivenAccVer(). When exactly one gene ID results and
        gene_annotation.gene_id2model holds a model for it,
        "Alias=<gene_symbol>;description=<...>" is appended to the last column.
        In the added values ';' is escaped as '%3B' and ',' as '%2C'; a gene
        symbol matching CHR<digits> is prefixed with "Gene ". A single trailing
        ';' on the last column is removed. Every row is written to the output,
        annotated or not. Lookup failures and progress go to stderr.

        Blank rows are passed through unchanged and are not counted. Rows with
        fewer than three columns are never annotated.

        Arguments:
            input_fname: path of the delimited GFF file to read.
            gene_symbol2gene_id_set: lookup structure handed to
                getGeneIDSetGivenAccVer().
            gene_annotation: object whose gene_id2model maps gene IDs to gene
                models (gene_symbol and gene_commentaries are read from them).
            output_fname: path of the file to write (overwritten).

        History:
        2009-2-5
            apply the improvement to any non-chromosome lines with 'ID' entry
            escape ';' by '%3B', which is regarded as a separator for every "name=value"
            escape ',' by '%2C', which is regarded as a separator for every "value"
            escape gene_symbol/Alias whose value matches individual chromosome (like gene 'CHR5' = 'Gene CHR5')
        2009-2-4
            if the last column has 'ID' in it, find corresponding gene id using its value and add gene_symbol and description
        """
        import re

        sys.stderr.write("Improving TAIR Gene GFF with symbols and descriptions ...\n")
        # Tried in this order; when several match, the last match decides tair_id.
        id_patterns = (
            re.compile(r"ID=(\w+)\.(\d+);"),
            re.compile(r"ID=(\w+);"),
            re.compile(r"ID=(\w+)\.(\d+)-Protein;"),
        )
        p_chr_name = re.compile(r"CHR\d+$")  # to escape gene_symbol/Alias whose value matches individual chromosome
        delimiter = figureOutDelimiter(input_fname)
        # Loop-invariant, so computed once up front.
        local_gene_desc_names = [name.upper() for name in self.gene_desc_names]
        counter = 0
        success_counter = 0
        # Nested context managers close both files (and flush the output) even on error.
        with open(input_fname) as inf:
            with open(output_fname, "w") as outf:
                reader = csv.reader(inf, delimiter=delimiter)
                # lineterminator is important. GFF3Loader would break down if it's dos terminator('\r\n').
                writer = csv.writer(outf, delimiter=delimiter, lineterminator="\n")
                for row in reader:
                    if not row:  # blank input line: nothing to inspect, pass it through
                        writer.writerow(row)
                        continue
                    counter += 1
                    last_col = row[-1]
                    tair_id = None
                    for pattern in id_patterns:
                        id_match = pattern.search(last_col)
                        if id_match:
                            tair_id = id_match.group(1)

                    gene_model = None
                    if tair_id is not None and len(row) > 2 and row[2] != "chromosome":
                        gene_id_set = getGeneIDSetGivenAccVer(tair_id, gene_symbol2gene_id_set)
                        gene_id = None
                        if gene_id_set is None:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                                % (last_col, tair_id)
                            )
                        elif len(gene_id_set) == 0:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                                % (last_col, tair_id)
                            )
                        elif len(gene_id_set) == 1:
                            gene_id = list(gene_id_set)[0]
                            success_counter += 1
                        else:
                            sys.stderr.write("Too many gene_ids: %s, %s.\n" % (tair_id, gene_id_set))
                        if gene_id is not None:
                            gene_model = gene_annotation.gene_id2model.get(gene_id)

                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names,
                            gene_model,
                            gene_commentary,
                            cutoff_length=600,
                            replaceNoneElemWithEmptyStr=1,
                        )
                        description = ",  ".join(
                            [": ".join(entry) for entry in zip(local_gene_desc_names, gene_desc_ls)]
                        )
                        # escape ';' (separator for every "name=value") and ',' (separator for every "value")
                        description = description.replace(";", "%3B").replace(",", "%2C")

                        if not last_col.endswith(";"):  # no ; delimiter at the end, append one
                            last_col += ";"
                        gene_symbol = gene_model.gene_symbol.replace(";", "%3B").replace(",", "%2C")
                        if p_chr_name.match(gene_symbol):  # match the chromosome name, change
                            gene_symbol = "Gene %s" % gene_symbol
                        last_col += "Alias=%s;" % gene_symbol
                        last_col += "description=%s" % description
                        row[-1] = last_col

                    if last_col.endswith(";"):  # endswith() is safe on an empty last column
                        last_col = last_col[:-1]
                        row[-1] = last_col
                    if counter % 5000 == 0:
                        sys.stderr.write("%s%s\t%s" % ("\x08" * 80, success_counter, counter))
                    writer.writerow(row)

        sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))
Esempio n. 4
0
    def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set,
                           gene_annotation, output_fname):
        """
        Copy the TAIR gene GFF file input_fname to output_fname, adding Alias and description attributes.

        For every row whose last column contains an "ID=...;" entry and whose
        third column is not 'chromosome', the ID is resolved through
        getGeneIDSetGivenAccVer(). When exactly one gene ID results and
        gene_annotation.gene_id2model holds a model for it,
        'Alias=<gene_symbol>;description=<...>' is appended to the last
        column. In the added values ';' is escaped as '%3B' and ',' as '%2C';
        a gene symbol matching CHR<digits> is prefixed with 'Gene '. A single
        trailing ';' on the last column is removed. Every row is written to
        the output, annotated or not. Lookup failures and progress go to
        stderr.

        Blank rows are passed through unchanged and are not counted. Rows with
        fewer than three columns are never annotated.

        Arguments:
            input_fname: path of the delimited GFF file to read.
            gene_symbol2gene_id_set: lookup structure handed to
                getGeneIDSetGivenAccVer().
            gene_annotation: object whose gene_id2model maps gene IDs to gene
                models (gene_symbol and gene_commentaries are read from them).
            output_fname: path of the file to write (overwritten).

        History:
        2009-2-5
            apply the improvement to any non-chromosome lines with 'ID' entry
            escape ';' by '%3B', which is regarded as a separator for every "name=value"
            escape ',' by '%2C', which is regarded as a separator for every "value"
            escape gene_symbol/Alias whose value matches individual chromosome (like gene 'CHR5' = 'Gene CHR5')
        2009-2-4
            if the last column has 'ID' in it, find corresponding gene id using its value and add gene_symbol and description
        """
        import re
        sys.stderr.write(
            "Improving TAIR Gene GFF with symbols and descriptions ...\n")
        # Tried in this order; when several match, the last match decides tair_id.
        id_patterns = (
            re.compile(r'ID=(\w+)\.(\d+);'),
            re.compile(r'ID=(\w+);'),
            re.compile(r'ID=(\w+)\.(\d+)-Protein;'),
        )
        # to escape gene_symbol/Alias whose value matches individual chromosome
        p_chr_name = re.compile(r'CHR\d+$')
        delimiter = figureOutDelimiter(input_fname)
        # Loop-invariant, so computed once up front.
        local_gene_desc_names = [
            name.upper() for name in self.gene_desc_names
        ]
        counter = 0
        success_counter = 0
        # Nested context managers close both files (and flush the output)
        # even on error.
        with open(input_fname) as inf:
            with open(output_fname, 'w') as outf:
                reader = csv.reader(inf, delimiter=delimiter)
                # lineterminator is important. GFF3Loader would break down
                # if it's dos terminator('\r\n').
                writer = csv.writer(outf,
                                    delimiter=delimiter,
                                    lineterminator='\n')
                for row in reader:
                    if not row:
                        # blank input line: nothing to inspect, pass it through
                        writer.writerow(row)
                        continue
                    counter += 1
                    last_col = row[-1]
                    tair_id = None
                    for pattern in id_patterns:
                        id_match = pattern.search(last_col)
                        if id_match:
                            tair_id = id_match.group(1)

                    gene_model = None
                    if (tair_id is not None and len(row) > 2
                            and row[2] != 'chromosome'):
                        gene_id_set = getGeneIDSetGivenAccVer(
                            tair_id, gene_symbol2gene_id_set)
                        gene_id = None
                        if gene_id_set is None:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                                % (last_col, tair_id))
                        elif len(gene_id_set) == 0:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                                % (last_col, tair_id))
                        elif len(gene_id_set) == 1:
                            gene_id = list(gene_id_set)[0]
                            success_counter += 1
                        else:
                            sys.stderr.write("Too many gene_ids: %s, %s.\n" %
                                             (tair_id, gene_id_set))
                        if gene_id is not None:
                            gene_model = gene_annotation.gene_id2model.get(
                                gene_id)

                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names,
                            gene_model,
                            gene_commentary,
                            cutoff_length=600,
                            replaceNoneElemWithEmptyStr=1)
                        description = ',  '.join([
                            ': '.join(entry) for entry in zip(
                                local_gene_desc_names, gene_desc_ls)
                        ])
                        # escape ';' (separator for every "name=value") and
                        # ',' (separator for every "value")
                        description = description.replace(';', '%3B')
                        description = description.replace(',', '%2C')

                        if not last_col.endswith(';'):
                            # no ; delimiter at the end, append one
                            last_col += ';'
                        gene_symbol = gene_model.gene_symbol
                        gene_symbol = gene_symbol.replace(';', '%3B')
                        gene_symbol = gene_symbol.replace(',', '%2C')
                        if p_chr_name.match(gene_symbol):
                            # match the chromosome name, change
                            gene_symbol = 'Gene %s' % gene_symbol
                        last_col += 'Alias=%s;' % gene_symbol
                        last_col += 'description=%s' % description
                        row[-1] = last_col

                    # endswith() is safe on an empty last column
                    if last_col.endswith(';'):
                        last_col = last_col[:-1]
                        row[-1] = last_col
                    if counter % 5000 == 0:
                        sys.stderr.write(
                            "%s%s\t%s" %
                            ('\x08' * 80, success_counter, counter))
                    writer.writerow(row)

        sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))