Python getGeneIDSetGivenAccVer Examples

Programming Language: Python

Namespace/Package Name: pymodule.utils

Method/Function: getGeneIDSetGivenAccVer

Examples at hotexamples.com: 4

Python getGeneIDSetGivenAccVer - 4 examples found. These are the top-rated real-world Python examples of pymodule.utils.getGeneIDSetGivenAccVer, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

	def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
		"""
		Read gene names from the first column of input_fname, link each one to a gene id and
		store every unambiguously linked gene as a GeneList row under one GeneListType.

		Arguments:
			input_fname: path of the delimited text file; only its first column is used.
			list_type_id: id of an existing GeneListType; used only if list_type_name is empty.
			list_type_name: short_name of the GeneListType. The db is searched for it first and
				a new entry is created if the search fails.
			gene_symbol2gene_id_set: lookup structure handed to getGeneIDSetGivenAccVer().
			db: object whose .session attribute is used to save the rows.
			skip_1st_line: if True, the first line of the file (e.g. a header) is ignored.

		Returns nothing. Names that do not link to exactly one gene id are reported on stderr
		and skipped. No commit is issued here.

		2009-10-18
			If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
		2009-2-4
			use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
		2008-01-08
			add option skip_1st_line
			stop using csv.reader, use raw file handler instead
			figureOutDelimiter() is modified not to use csv.Sniffer() by default. it'll return delimiter None if the file is single-column.
		2008-12-11
			more filtering:
				1. strip the original_name
				2. pick alphanumeric characters out of original_name
			if GeneListType is already in db. check if GeneList has this gene already or not.
		2008-11-20
			use figureOutDelimiter() to get delimiter automatically
		2008-07-15
			if the list_type_name is given, forget about list_type_id. program will first search db for the given list_type_name, if search failed, create a new entry.
		2008-07-15
			use gene_id2original_name to avoid redundancy in gene list
		"""
		import sys
		session = db.session
		delimiter = figureOutDelimiter(input_fname)
		counter = 0
		success_counter = 0
		gene_id2original_name = {}	#to avoid redundancy in gene list
		with open(input_fname) as inf:	#the handle is closed even if a line fails to parse
			if skip_1st_line:
				next(inf, None)	#skips the 1st line; the default keeps an empty file from raising StopIteration
			for line in inf:
				if not line.strip():	#skip empty and whitespace-only lines; they carry no name and would break row[0] when delimiter is None
					continue
				row = line.split(delimiter)
				original_name = row[0].strip()	#2008-12-11 remove spaces/tabs in the beginning/end
				number_match = self.all_number_p.search(original_name)
				if number_match:	# 2009-10-18 original_name is full of numbers. a legitimate Gene ID.
					gene_id_set = set([int(number_match.group(0))])
				else:
					gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)

				if gene_id_set is None:
					sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"%(original_name))
				elif len(gene_id_set)==1:
					gene_id = list(gene_id_set)[0]
					if gene_id not in gene_id2original_name:	#only the first name seen for a gene id is kept
						gene_id2original_name[gene_id] = original_name
					success_counter += 1
				elif len(gene_id_set)>1:
					sys.stderr.write("Too many gene_ids for %s: %s.\n"%(original_name, gene_id_set))
				else:	#the set exists but is empty
					sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n"%(original_name))
				counter += 1

		if list_type_name:	#if the short name is given, forget about list_type_id
			glt = GeneListType.query.filter_by(short_name=list_type_name).first()	#try search the db first.
			if not glt:
				glt = GeneListType(short_name=list_type_name)
				session.save(glt)
				session.flush()
		else:	#use the list_type_id to get it
			# NOTE(review): if no GeneListType has this id, glt is presumably None and the next line fails -- confirm
			glt = GeneListType.get(list_type_id)
		glt.original_filename = input_fname	#save the filename
		session.save_or_update(glt)

		for gene_id, original_name in gene_id2original_name.items():
			if glt.id:	#2008-12-11 GeneListType is already in db. check if GeneList has this gene already or not.
				rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
				if rows.count()>0:
					sys.stderr.write("Gene: %s (%s) already with list type %s.\n"%(gene_id, original_name, glt.short_name))
					continue
			gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
			session.save(gl)
		sys.stderr.write("%s/%s linked successfully.\n"%(success_counter, counter))

Example #2

Show file

File: PutGeneListIntoDB.py Project: bopopescu/gwasmodules

    def putGeneListIntoDb(self,
                          input_fname,
                          list_type_id,
                          list_type_name,
                          gene_symbol2gene_id_set,
                          db,
                          skip_1st_line=False):
        """
        Load a gene list file into the database.

        The first column of every non-blank line of input_fname is taken as a
        gene name and linked to a gene id. Genes that link to exactly one id
        are saved as GeneList rows attached to a single GeneListType.

        Arguments:
            input_fname: path of the delimited text file to read.
            list_type_id: id of an existing GeneListType; consulted only when
                list_type_name is empty.
            list_type_name: short_name of the GeneListType; searched in the db
                first and created when not found.
            gene_symbol2gene_id_set: lookup structure passed on to
                getGeneIDSetGivenAccVer().
            db: object exposing the .session used for saving.
            skip_1st_line: when True, the first line of the file is skipped.

        Nothing is returned. Names with no gene id, or more than one, are
        reported on stderr and left out. This method does not commit.

        2009-10-18
            If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
        2009-2-4
            use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
        2008-01-08
            add option skip_1st_line
            stop using csv.reader, use raw file handler instead
            figureOutDelimiter() is modified not to use csv.Sniffer() by default. it'll return delimiter None if the file is single-column.
        2008-12-11
            more filtering:
                1. strip the original_name
                2. pick alphanumeric characters out of original_name
            if GeneListType is already in db. check if GeneList has this gene already or not.
        2008-11-20
            use figureOutDelimiter() to get delimiter automatically
        2008-07-15
            if the list_type_name is given, forget about list_type_id. program will first search db for the given list_type_name, if search failed, create a new entry.
        2008-07-15
            use gene_id2original_name to avoid redundancy in gene list
        """
        import sys
        session = db.session
        delimiter = figureOutDelimiter(input_fname)
        counter = 0
        success_counter = 0
        gene_id2original_name = {}  #to avoid redundancy in gene list
        # 'with' closes the file even when parsing a line raises.
        with open(input_fname) as inf:
            if skip_1st_line:
                # The default argument keeps an empty file from raising
                # StopIteration.
                next(inf, None)
            for line in inf:
                # Blank and whitespace-only lines carry no gene name; with a
                # None delimiter they would split to [] and break row[0].
                if not line.strip():
                    continue
                row = line.split(delimiter)
                original_name = row[0].strip()  #2008-12-11 remove spaces/tabs in the beginning/end
                number_match = self.all_number_p.search(original_name)
                if number_match:  # 2009-10-18 original_name is full of numbers. a legitimate Gene ID.
                    gene_id_set = set([int(number_match.group(0))])
                else:
                    gene_id_set = getGeneIDSetGivenAccVer(
                        original_name, gene_symbol2gene_id_set)

                if gene_id_set is None:
                    sys.stderr.write(
                        "Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"
                        % (original_name))
                elif len(gene_id_set) == 1:
                    gene_id = list(gene_id_set)[0]
                    # Only the first name seen for a gene id is kept.
                    if gene_id not in gene_id2original_name:
                        gene_id2original_name[gene_id] = original_name
                    success_counter += 1
                elif len(gene_id_set) > 1:
                    sys.stderr.write("Too many gene_ids for %s: %s.\n" %
                                     (original_name, gene_id_set))
                else:  # the set exists but is empty
                    sys.stderr.write(
                        "Linking to gene id failed for %s. gene_id_set is empty.\n"
                        % (original_name))
                counter += 1

        if list_type_name:  #if the short name is given, forget about list_type_id
            glt = GeneListType.query.filter_by(
                short_name=list_type_name).first()  #try search the db first.
            if not glt:
                glt = GeneListType(short_name=list_type_name)
                session.save(glt)
                session.flush()
        else:  #use the list_type_id to get it
            # NOTE(review): if no GeneListType has this id, glt is presumably
            # None and the next line fails -- confirm.
            glt = GeneListType.get(list_type_id)
        glt.original_filename = input_fname  #save the filename
        session.save_or_update(glt)

        for gene_id, original_name in gene_id2original_name.items():
            if glt.id:  #2008-12-11 GeneListType is already in db. check if GeneList has this gene already or not.
                rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(
                    list_type_id=glt.id)
                if rows.count() > 0:
                    sys.stderr.write(
                        "Gene: %s (%s) already with list type %s.\n" %
                        (gene_id, original_name, glt.short_name))
                    continue
            gl = GeneList(gene_id=gene_id,
                          list_type=glt,
                          original_name=original_name)
            session.save(gl)
        sys.stderr.write("%s/%s linked successfully.\n" %
                         (success_counter, counter))

Example #3

Show file

    def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set, gene_annotation, output_fname):
        """
        Copy a TAIR gene GFF file to output_fname, adding gene symbols and descriptions.

        For every row whose third column is not "chromosome" and whose last column holds an
        "ID=...;" entry, the ID is linked to a gene id. If exactly one gene id is found and
        gene_annotation.gene_id2model has a model for it, "Alias=<symbol>;description=<text>"
        is appended to the last column. A trailing ';' is removed from the last column of
        every row. All rows, changed or not, are written to output_fname with '\\n' line endings.

        Arguments:
            input_fname: path of the GFF file to read.
            gene_symbol2gene_id_set: lookup structure passed on to getGeneIDSetGivenAccVer().
            gene_annotation: object whose gene_id2model maps gene id to gene model.
            output_fname: path of the file to write (overwritten).

        Nothing is returned. Linking failures and progress are reported on stderr.

        2009-2-5
            apply the improvement to any non-chromosome lines with 'ID' entry
            escape ';' by '%3B', which is regarded as a separator for every "name=value"
            escape ',' by '%2C', which is regarded as a separator for every "value"
            escape gene_symbol/Alias whose value matches individual chromosome (like gene 'CHR5' = 'Gene CHR5')
        2009-2-4
            if the last column has 'ID' in it, find corresponding gene id using its value and add gene_symbol and description
        """
        sys.stderr.write("Improving TAIR Gene GFF with symbols and descriptions ...\n")
        import re

        # All three patterns are tried in this order; if several match, the last match wins.
        id_patterns = (
            re.compile(r"ID=(\w+)\.(\d+);"),
            re.compile(r"ID=(\w+);"),
            re.compile(r"ID=(\w+)\.(\d+)-Protein;"),
        )
        p_chr_name = re.compile(r"CHR\d+$")  # to escape gene_symbol/Alias whose value matches individual chromosome
        # Same for every row, so built once. str.upper() needs no 'string' module function.
        local_gene_desc_names = [name.upper() for name in self.gene_desc_names]
        delimiter = figureOutDelimiter(input_fname)
        counter = 0
        success_counter = 0
        # The 'with' blocks close both files, so every written row is flushed to disk.
        with open(input_fname) as inf:
            with open(output_fname, "w") as outf:
                reader = csv.reader(inf, delimiter=delimiter)
                # lineterminator is important. GFF3Loader would break down if it's dos terminator('\r\n').
                writer = csv.writer(outf, delimiter=delimiter, lineterminator="\n")
                for row in reader:
                    # A blank line parses to []; treat it as having an empty last column.
                    last_col = row[-1] if row else ""
                    tair_id = None
                    for id_pattern in id_patterns:
                        id_match = id_pattern.search(last_col)
                        if id_match:
                            tair_id = id_match.group(1)
                    counter += 1

                    gene_id = None
                    if tair_id is not None and row[2] != "chromosome":
                        gene_id_set = getGeneIDSetGivenAccVer(tair_id, gene_symbol2gene_id_set)
                        if gene_id_set is None:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                                % (last_col, tair_id)
                            )
                        elif len(gene_id_set) == 1:
                            gene_id = list(gene_id_set)[0]
                            success_counter += 1
                        elif len(gene_id_set) > 1:
                            sys.stderr.write("Too many gene_ids: %s, %s.\n" % (tair_id, gene_id_set))
                        else:  # the set exists but is empty
                            sys.stderr.write(
                                "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                                % (last_col, tair_id)
                            )

                    gene_model = None
                    if gene_id is not None:
                        gene_model = gene_annotation.gene_id2model.get(gene_id)
                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names,
                            gene_model,
                            gene_commentary,
                            cutoff_length=600,
                            replaceNoneElemWithEmptyStr=1,
                        )
                        description = ",  ".join(
                            [": ".join(entry) for entry in zip(local_gene_desc_names, gene_desc_ls)]
                        )
                        # escape ';', which is regarded as a separator for every "name=value"
                        description = description.replace(";", "%3B")
                        # escape ',', which is regarded as a separator for every "value"
                        description = description.replace(",", "%2C")

                        if not last_col.endswith(";"):  # no ; delimiter at the end, append one
                            last_col += ";"
                        gene_symbol = gene_model.gene_symbol
                        gene_symbol = gene_symbol.replace(";", "%3B")
                        gene_symbol = gene_symbol.replace(",", "%2C")
                        if p_chr_name.match(gene_symbol):  # match the chromosome name, change
                            gene_symbol = "Gene %s" % gene_symbol
                        last_col += "Alias=%s;" % gene_symbol
                        last_col += "description=%s" % description
                        row[-1] = last_col

                    # endswith() is safe on an empty last column, unlike last_col[-1].
                    if last_col.endswith(";"):
                        last_col = last_col[:-1]
                        row[-1] = last_col
                    if counter % 5000 == 0:
                        sys.stderr.write("%s%s\t%s" % ("\x08" * 80, success_counter, counter))
                    writer.writerow(row)

        sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))

Example #4

Show file

    def improveTAIRGeneGFF(self, input_fname, gene_symbol2gene_id_set,
                           gene_annotation, output_fname):
        """
        Write a copy of a TAIR gene GFF file with gene symbols and
        descriptions added to its last column.

        A row is annotated when its third column is not 'chromosome', its last
        column contains an 'ID=...;' entry, that ID links to exactly one gene
        id, and gene_annotation.gene_id2model holds a model for the gene id.
        Annotation appends 'Alias=<symbol>;description=<text>' to the last
        column. A trailing ';' on the last column is dropped for every row.
        Every input row is written to output_fname, terminated by '\\n'.

        Arguments:
            input_fname: path of the GFF file to read.
            gene_symbol2gene_id_set: lookup structure passed on to
                getGeneIDSetGivenAccVer().
            gene_annotation: object whose gene_id2model maps gene id to gene
                model.
            output_fname: path of the file to write (overwritten).

        Nothing is returned. Linking failures and progress go to stderr.

        2009-2-5
            apply the improvement to any non-chromosome lines with 'ID' entry
            escape ';' by '%3B', which is regarded as a separator for every "name=value"
            escape ',' by '%2C', which is regarded as a separator for every "value"
            escape gene_symbol/Alias whose value matches individual chromosome (like gene 'CHR5' = 'Gene CHR5')
        2009-2-4
            if the last column has 'ID' in it, find corresponding gene id using its value and add gene_symbol and description
        """
        sys.stderr.write(
            "Improving TAIR Gene GFF with symbols and descriptions ...\n")
        import re
        # Every pattern is tried, in this order; when more than one matches,
        # the last match decides tair_id.
        id_patterns = (
            re.compile(r'ID=(\w+)\.(\d+);'),
            re.compile(r'ID=(\w+);'),
            re.compile(r'ID=(\w+)\.(\d+)-Protein;'),
        )
        #to escape gene_symbol/Alias whose value matches individual chromosome
        p_chr_name = re.compile(r'CHR\d+$')
        # Identical for all rows, so computed once; str.upper() avoids the
        # 'string' module function.
        local_gene_desc_names = [
            name.upper() for name in self.gene_desc_names
        ]
        delimiter = figureOutDelimiter(input_fname)
        counter = 0
        success_counter = 0
        # Both files are closed on exit, which also flushes the output.
        with open(input_fname) as inf:
            with open(output_fname, 'w') as outf:
                reader = csv.reader(inf, delimiter=delimiter)
                #lineterminator is important. GFF3Loader would break down if it's dos terminator('\r\n').
                writer = csv.writer(outf,
                                    delimiter=delimiter,
                                    lineterminator='\n')
                for row in reader:
                    # Blank lines parse to []; give them an empty last column
                    # so they pass through unchanged.
                    last_col = row[-1] if row else ''
                    tair_id = None
                    for id_pattern in id_patterns:
                        id_match = id_pattern.search(last_col)
                        if id_match:
                            tair_id = id_match.group(1)
                    counter += 1

                    gene_id = None
                    if tair_id is not None and row[2] != 'chromosome':
                        gene_id_set = getGeneIDSetGivenAccVer(
                            tair_id, gene_symbol2gene_id_set)
                        if gene_id_set is None:
                            sys.stderr.write(
                                "Linking to gene id failed for %s. No such gene_symbol, %s, in gene_symbol2gene_id_set.\n"
                                % (last_col, tair_id))
                        elif len(gene_id_set) == 1:
                            gene_id = list(gene_id_set)[0]
                            success_counter += 1
                        elif len(gene_id_set) > 1:
                            sys.stderr.write("Too many gene_ids: %s, %s.\n" %
                                             (tair_id, gene_id_set))
                        else:  # the set exists but is empty
                            sys.stderr.write(
                                "Linking to gene id failed for %s. There is gene_symbol, %s, in gene_symbol2gene_id_set but it's empty.\n"
                                % (last_col, tair_id))

                    gene_model = None
                    if gene_id is not None:
                        gene_model = gene_annotation.gene_id2model.get(gene_id)
                    if gene_model is not None:
                        gene_commentary = gene_model.gene_commentaries[0]
                        gene_desc_ls = DrawSNPRegion.returnGeneDescLs(
                            self.gene_desc_names,
                            gene_model,
                            gene_commentary,
                            cutoff_length=600,
                            replaceNoneElemWithEmptyStr=1)
                        description = ',  '.join([
                            ': '.join(entry) for entry in zip(
                                local_gene_desc_names, gene_desc_ls)
                        ])
                        #escape ';', which is regarded as a separator for every "name=value"
                        description = description.replace(';', '%3B')
                        #escape ',', which is regarded as a separator for every "value"
                        description = description.replace(',', '%2C')

                        if not last_col.endswith(';'):  #no ; delimiter at the end, append one
                            last_col += ';'
                        gene_symbol = gene_model.gene_symbol
                        gene_symbol = gene_symbol.replace(';', '%3B')
                        gene_symbol = gene_symbol.replace(',', '%2C')
                        if p_chr_name.match(gene_symbol):  #match the chromosome name, change
                            gene_symbol = 'Gene %s' % gene_symbol
                        last_col += 'Alias=%s;' % gene_symbol
                        last_col += 'description=%s' % description
                        row[-1] = last_col

                    # endswith() copes with an empty last column, where
                    # last_col[-1] would raise IndexError.
                    if last_col.endswith(';'):
                        last_col = last_col[:-1]
                        row[-1] = last_col
                    if counter % 5000 == 0:
                        sys.stderr.write(
                            "%s%s\t%s" %
                            ('\x08' * 80, success_counter, counter))
                    writer.writerow(row)

        sys.stderr.write("%s/%s Done.\n" % (success_counter, counter))