Esempio n. 1
0
    def handle(self, *args, **options):
        """Attach cross references from a remote gzipped file to genes.

        Options read:
            dbname: name of the CrossRefDB the identifiers belong to.
            wburl: URL of a gzip-compressed, tab-delimited file (presumably
                a WormBase export, judging by the option name -- confirm).

        For each line, the first column prefixed with ``CELE_`` is used as
        the gene's systematic name and the second column as the cross
        reference id.  Lines whose gene is unknown are logged and skipped;
        otherwise the matching CrossRef is fetched or created and pointed
        at the gene.

        Raises:
            CrossRefDB.DoesNotExist: if no database is named ``dbname``.
        """
        database = CrossRefDB.objects.get(name=options.get('dbname'))
        wb_url = options.get('wburl')

        # Download the whole payload first and release the connection even
        # if the read fails.
        response = urllib2.urlopen(wb_url, timeout=5)
        try:
            payload = response.read()
        finally:
            response.close()

        xrefs_gzip_fh = gzip.GzipFile(fileobj=StringIO(payload))
        try:
            for line in xrefs_gzip_fh:
                toks = line.strip().split('\t')
                if len(toks) < 2:
                    # Blank or truncated line: there is no identifier pair
                    # to load, so skip it instead of failing on toks[1].
                    if line.strip():
                        logger.warning("Skipping malformed line %r.", line)
                    continue

                systematic = 'CELE_' + toks[0]
                wbid = toks[1]
                try:
                    gene = Gene.objects.get(systematic_name=systematic)
                except Gene.DoesNotExist:
                    logger.info("Unable to find gene %s.", systematic)
                    continue

                # Reuse the existing cross reference when there is one so
                # that re-running the command does not create duplicates.
                try:
                    wb = CrossRef.objects.get(xrid=wbid, crossrefdb=database)
                except CrossRef.DoesNotExist:
                    wb = CrossRef(xrid=wbid, crossrefdb=database)
                wb.gene = gene
                wb.save()
        finally:
            # Close the gzip stream even when a database call raises.
            xrefs_gzip_fh.close()
    def handle(self, *args, **options):
        """Synchronise Gene and CrossRef rows with a tab-delimited gene file.

        Options read:
            taxonomy_id: taxonomy id used to look up the Organism.
            geneinfo: path of the tab-delimited input file.
            symbol_col, systematic_col, alias_col: column indexes of the
                standard name, systematic name and aliases.
            systematic_xrdb: optional cross reference database name under
                which the systematic name is also recorded.
            gi_tax_id: optional taxonomy id to match in the file when it
                differs from ``taxonomy_id``.

        For every line whose first column equals the taxonomy id, the Gene
        with that Entrez id is updated (only saved when a field changed) or
        created, and cross references not yet in the database are added.
        Genes of the organism that are in the database but absent from the
        file are then flagged obsolete.

        If fewer than ten lines matched the organism the process exits with
        status 1 *before* anything is flagged obsolete, so a wrong organism
        id cannot mark the whole organism obsolete.

        When ``taxonomy_id`` or ``geneinfo`` is missing an error is logged
        and nothing is loaded.
        """
        # Load the organism.
        tax_id = options.get('taxonomy_id')
        org = Organism.objects.get(taxonomy_id=tax_id)

        # geneinfo file information.
        geneinfo_filename = options.get('geneinfo')
        symb_col = int(options.get('symbol_col'))
        syst_col = int(options.get('systematic_col'))
        alias_col = int(options.get('alias_col'))
        systematic_xrdb = options.get('systematic_xrdb')

        if not (tax_id and geneinfo_filename):
            logger.error('Couldn\'t load geneinfo %s for org %s.',
                         options.get('geneinfo'),
                         tax_id,
                         exc_info=sys.exc_info(),
                         extra={'options': options})
            return

        # yeast has a taxonomy_id that changed, in this case when we look at
        # the id from NCBI we have to use the new one.  The value is compared
        # with a column read from the file, so normalise it to a string.
        gi_tax_id = str(options.get('gi_tax_id') or tax_id)

        # Store all the genes seen thus far so we can mark the rest obsolete.
        entrez_seen = set()
        # Cache of cross reference databases, which saves hits to DB.
        xrdb_cache = {}
        org_matches = 0  # Lines that belonged to this organism.
        entrez_found = 0  # Found from before.
        entrez_updated = 0  # Found from before and updated.
        entrez_created = 0  # Didn't exist, added.

        # The file is opened before querying so a bad path fails right away,
        # and the context manager closes it on every exit path.
        with open(geneinfo_filename) as geneinfo_fh:
            # Get all genes for this organism from the database.
            entrez_in_db = set(
                Gene.objects.filter(organism=org).values_list('entrezid',
                                                              flat=True))

            # Get all cross reference pairs that refer to a gene from this
            # organism.
            xr_in_db = set()
            for x in CrossRef.objects.filter(
                    gene__entrezid__in=entrez_in_db).prefetch_related(
                        'crossrefdb', 'gene'):
                xr_in_db.add((x.crossrefdb.name, x.xrid, x.gene.entrezid))

            for line in geneinfo_fh:
                toks = line.strip().split('\t')
                if toks[symb_col] == "NEWENTRY":
                    logger.info("NEWENTRY line skipped")
                    continue

                if toks[0] != gi_tax_id:  # From wrong organism, skip.
                    continue

                org_matches += 1  # Count lines that came from this organism.
                # Grab requested fields from tab delimited file.
                entrezid = int(toks[1])
                standard_name = toks[symb_col]
                systematic_name = toks[syst_col]
                aliases = toks[alias_col]
                crossrefs = toks[5]
                chromosome = toks[6]
                description = toks[8]
                status = toks[9]

                # This column only gets filled in for certain organisms.
                if (not systematic_name) or (systematic_name == '-'):
                    systematic_name = standard_name
                # Gene is actually mitochondrial, change symbol to avoid
                # duplicates (analogous to what GeneCards does).
                if chromosome == "MT" and \
                        not systematic_name.startswith('MT'):
                    logger.debug(
                        "Renaming %s to %s, mitochondrial version",
                        systematic_name, "MT-" + systematic_name)
                    systematic_name = "MT-" + systematic_name

                alias_str = ""
                alias_num = 0
                if aliases and (aliases != '-'):
                    alias_list = [unicode(x) for x in aliases.split('|')]
                    alias_num = len(alias_list)
                    alias_str = ' '.join(alias_list)

                # Handle cross references: a set of (database name, id).
                xref_tuples = set()
                if crossrefs and (crossrefs != '-'):
                    if systematic_xrdb:
                        xref_tuples.add((unicode(systematic_xrdb),
                                         unicode(systematic_name)))

                    for raw_xref in crossrefs.split('|'):
                        # Split on the first colon only: the id itself may
                        # contain colons (e.g. "MGI:MGI:87853").
                        xrdb_name, colon, xrid = \
                            unicode(raw_xref).partition(':')
                        if not colon:
                            logger.warning(
                                "Skipping cross reference %s: no database"
                                " prefix.", raw_xref)
                            continue
                        xref_tuples.add((xrdb_name, xrid))

                xref_num = len(xref_tuples)
                # Arbitrary weight for search results.
                # The principle of weighting is that we think people are more
                # likely to want a gene that occurs in more databases or has
                # more aliases b/c it is better-known.  This helps break
                # ordering ties where gene names are identical.
                weight = 2 * xref_num + alias_num

                # We also assume that people are much more likely to want
                # protein coding genes.  In the long term we could measure
                # actual selections and estimate weight per gene.
                if status == 'protein-coding':
                    weight = weight * 2

                entrez_seen.add(entrezid)
                if entrezid in entrez_in_db:  # This existed already.
                    logger.debug("Entrez %s existed already.", entrezid)
                    entrez_found += 1
                    gene_object = Gene.objects.get(entrezid=entrezid,
                                                   organism=org)
                    changed = False
                    # The following lines update characteristics that may
                    # have changed.
                    if gene_object.systematic_name != systematic_name:
                        gene_object.systematic_name = systematic_name
                        changed = True
                    if gene_object.standard_name != standard_name:
                        gene_object.standard_name = standard_name
                        changed = True
                    if gene_object.description != description:
                        gene_object.description = description
                        changed = True
                    if gene_object.aliases != alias_str:
                        gene_object.aliases = alias_str
                        changed = True
                    if gene_object.weight != weight:
                        gene_object.weight = weight
                        changed = True
                    # If the gene was marked obsolete but occurs in the
                    # gene_info file, then it's not obsolete.
                    if gene_object.obsolete:
                        gene_object.obsolete = False
                        changed = True
                    if changed:
                        entrez_updated += 1
                        # To save time, only call save() if something has
                        # been changed.
                        gene_object.save()

                else:  # New entrezid observed.
                    logger.debug(
                        "Entrez %s did not exist and will be created.",
                        entrezid)
                    # Aliases are stored at creation too, so a new gene
                    # carries the same fields an updated one would.
                    gene_object = Gene(entrezid=entrezid,
                                       organism=org,
                                       systematic_name=systematic_name,
                                       standard_name=standard_name,
                                       description=description,
                                       aliases=alias_str,
                                       obsolete=False,
                                       weight=weight)
                    gene_object.save()
                    entrez_created += 1

                # Add crossreferences.
                for xref_tuple in xref_tuples:
                    xrdb_name, xrid = xref_tuple
                    if xrdb_name not in xrdb_cache:
                        try:
                            xrdb_cache[xrdb_name] = CrossRefDB.objects.get(
                                name=xrdb_name)
                        except CrossRefDB.DoesNotExist:
                            # Remember the miss so it is not queried again.
                            xrdb_cache[xrdb_name] = None
                    xrdb = xrdb_cache[xrdb_name]
                    if xrdb is None:  # Don't understand crossrefdb, skip.
                        logger.warning(
                            "We encountered an xrdb (%s) not in our"
                            " database for pair %s.", xrdb_name,
                            xref_tuple)
                        continue
                    logger.debug('Found crossreference pair %s.', xref_tuple)
                    # If the record doesn't exist in database, create it and
                    # remember it so a repeated line cannot duplicate it.
                    xr_key = (xrdb_name, xrid, entrezid)
                    if xr_key not in xr_in_db:
                        xr_obj = CrossRef(crossrefdb=xrdb,
                                          xrid=xrid,
                                          gene=gene_object)
                        xr_obj.save()
                        xr_in_db.add(xr_key)

        logger.info(
            "%s entrez identifiers existed in the database and "
            "were found in the new gene_info file", entrez_found)
        logger.info(
            "%s entrez identifiers existed in the database and "
            "were changed in the new gene_info file", entrez_updated)
        logger.info(
            "%s entrez identifiers did not exist and were created "
            "in the new gene_info file", entrez_created)

        # Check that the organism matched before flagging anything obsolete,
        # so that a wrong organism id cannot mass-obsolete its genes.
        if org_matches < 10:
            logger.error('Less than ten matches were encountered for '
                         'this organism.  Check the organism ID.')
            sys.exit(1)

        # Update "obsolete" attribute for entrez records that are in the
        # database but not in input file.
        for stale_entrezid in entrez_in_db - entrez_seen:
            gene_object = Gene.objects.get(entrezid=stale_entrezid,
                                           organism=org)
            if not gene_object.obsolete:
                gene_object.obsolete = True
                gene_object.save()
Esempio n. 3
0
    def setUp(self):
        """Create the organisms, genes and cross references used by tests.

        Builds one factory-made organism holding genes 1 and 2 (with four
        cross references spread over two databases), plus two explicit
        organisms holding genes 4, 5, 101 and 102.
        """
        base_org = factory.create(Organism)

        db_asdf = CrossRefDB(name="ASDF", url="http://www.example.com")
        db_asdf.save()
        db_second = CrossRefDB(name="XRDB2", url="http://www.example.com/2")
        db_second.save()

        # Genes 1 and 2 carry both a standard and a systematic name.
        fully_named = []
        for entrez, systematic, standard, alias_text in (
                (1, "g1", "G1", "gee1 GEE1"),
                (2, "g2", "G2", "gee2 GEE2")):
            record = Gene(entrezid=entrez,
                          systematic_name=systematic,
                          standard_name=standard,
                          description="asdf",
                          organism=base_org,
                          aliases=alias_text)
            record.save()
            fully_named.append(record)
        gene_one, gene_two = fully_named

        # Cross references, saved in this exact order.
        for xrdb, target, xrid in (
                (db_asdf, gene_one, "XRID1"),
                (db_second, gene_two, "XRID1"),
                (db_asdf, gene_one, "XRRID1"),
                (db_asdf, gene_two, "XRID2")):
            CrossRef(crossrefdb=xrdb, gene=target, xrid=xrid).save()

        mouse_org = Organism(taxonomy_id=1234,
                             common_name="Computer mouse",
                             scientific_name="Mus computurus",
                             slug="mus-computurus")
        mouse_org.save()
        screen_org = Organism(taxonomy_id=4321,
                              common_name="Computer screen",
                              scientific_name="Monitorus computurus",
                              slug="monitorus-computurus")
        screen_org.save()

        # Genes 4 and 5 share their systematic and standard names but live
        # in different organisms.  Entrez id 3 is left free because other
        # tests use it.
        for entrez, owner, alias_text in (
                (4, mouse_org, "gee4 GEE4"),
                (5, screen_org, "gee5 GEE5")):
            Gene(entrezid=entrez,
                 systematic_name="acdc",
                 standard_name="ACDC",
                 description="asdf",
                 organism=owner,
                 aliases=alias_text).save()

        # Gene 101 has a standard name only; gene 102 a systematic name only.
        Gene(entrezid=101, standard_name="std_101",
             organism=mouse_org).save()
        Gene(entrezid=102, systematic_name="sys_102",
             organism=mouse_org).save()
Esempio n. 4
0
    def handle(self, *args, **options):
        """Attach UniProtKB cross references to genes from a mapping file.

        Options read:
            uniprot: path of a whitespace-delimited file whose lines hold a
                UniProt id, an id type and an identifier.

        ``GeneID`` lines map a UniProt id to an Entrez id and ``Ensembl``
        lines map it to an Ensembl id; only identifiers already present in
        the database are kept.  UniProt ids mapped through Entrez are
        written first (updating an existing CrossRef or creating one).
        UniProt ids that are still missing afterwards are attached to the
        gene of the matching Ensembl cross reference.

        When the ``uniprot`` option is missing an error is logged and
        nothing is loaded.
        """
        uniprot_path = options.get('uniprot')
        if not uniprot_path:
            logger.error("Couldn\'t load uniprot %s",
                         options.get('uniprot'),
                         exc_info=sys.exc_info(),
                         extra={'options': options})
            return

        uniprot = CrossRefDB.objects.get(name='UniProtKB')
        ensembl = CrossRefDB.objects.get(name='Ensembl')

        entrez_set = set(Gene.objects.all().values_list('entrezid',
                                                        flat=True))
        ensembl_set = set(
            CrossRef.objects.filter(crossrefdb=ensembl).values_list(
                'xrid', flat=True))
        uniprot_entrez = {}
        uniprot_ensembl = {}

        # The context manager closes the file even if parsing raises.
        with open(uniprot_path) as uniprot_file:
            for line in uniprot_file:
                # Split into at most three fields so that an identifier
                # containing whitespace stays in one piece.
                fields = line.strip().split(None, 2)
                if len(fields) != 3:
                    if fields:
                        logger.warning("Skipping malformed line %r.", line)
                    continue
                uniprot_id, id_type, identifier = fields

                if id_type == "GeneID":
                    # 'GeneID' is a mapping for entrez
                    entrez_id = int(identifier)
                    if entrez_id in entrez_set:
                        uniprot_entrez[uniprot_id] = entrez_id

                elif id_type == "Ensembl":
                    if identifier in ensembl_set:
                        uniprot_ensembl[uniprot_id] = identifier

        for uniprot_id, entrez_id in uniprot_entrez.items():
            gene = Gene.objects.get(entrezid=entrez_id)
            # Only the lookup can raise DoesNotExist; the assignment and
            # save are the same for an existing or a new CrossRef.
            try:
                uniprot_xr = CrossRef.objects.get(crossrefdb=uniprot,
                                                  xrid=uniprot_id)
            except CrossRef.DoesNotExist:
                uniprot_xr = CrossRef(crossrefdb=uniprot, xrid=uniprot_id)
            uniprot_xr.gene = gene
            uniprot_xr.save()

        uniprot_set = set(
            CrossRef.objects.filter(crossrefdb=uniprot).values_list(
                'xrid', flat=True))

        for uniprot_id, ensembl_id in uniprot_ensembl.items():
            if uniprot_id in uniprot_set:
                # If there is already a UniProt xref with this id in the
                # database it means that it was already added using Entrez.
                # We are only interested in uniprot_ids that haven't been
                # added.
                continue
            ensembl_xr = CrossRef.objects.filter(crossrefdb=ensembl,
                                                 xrid=ensembl_id)[0]
            uniprot_xr = CrossRef(crossrefdb=uniprot,
                                  xrid=uniprot_id,
                                  gene=ensembl_xr.gene)
            uniprot_xr.save()