def handle(self, *args, **options):
    """Attach cross references from a remote gzipped, tab-delimited file.

    Options read:
        dbname: name of the CrossRefDB the cross references belong to.
        wburl: URL of the gzipped file (presumably a WormBase export,
            judging by the option name -- confirm against the caller).

    Each line is expected to hold at least two tab-separated columns.
    The first is prefixed with 'CELE_' to form the gene's systematic
    name and the second is the cross reference id.  Lines whose gene is
    not in the database are logged and skipped.

    Raises:
        CrossRefDB.DoesNotExist: if no CrossRefDB is named `dbname`.
    """
    database = CrossRefDB.objects.get(name=options.get('dbname'))
    wb_url = options.get('wburl')

    # Read the whole payload up front and release the connection even if
    # the read fails.
    response = urllib2.urlopen(wb_url, timeout=5)
    try:
        payload = response.read()
    finally:
        response.close()

    xrefs_gzip_fh = gzip.GzipFile(fileobj=StringIO(payload))
    try:
        for line in xrefs_gzip_fh:
            toks = line.strip().split('\t')
            if len(toks) < 2:
                # Blank or truncated line; there is nothing to link.
                logger.warning("Skipping malformed line %r.", line)
                continue

            systematic = 'CELE_' + toks[0]
            wbid = toks[1]

            try:
                gene = Gene.objects.get(systematic_name=systematic)
            except Gene.DoesNotExist:
                logger.info("Unable to find gene %s.", systematic)
                continue

            try:
                wb = CrossRef.objects.get(xrid=wbid, crossrefdb=database)
            except CrossRef.DoesNotExist:
                wb = CrossRef(xrid=wbid, crossrefdb=database)

            # Existing cross references are re-pointed at the gene named
            # in the file; new ones are linked for the first time.
            wb.gene = gene
            wb.save()
    finally:
        # Close the gzip handle even when a database call raises.
        xrefs_gzip_fh.close()
def handle(self, *args, **options):
    """Create, update and obsolete genes for one organism from a
    tab-delimited gene_info file.

    Options read:
        taxonomy_id: taxonomy id used to look up the Organism.
        geneinfo: path of the gene_info file.
        symbol_col, systematic_col, alias_col: column indexes (converted
            with int()) of the standard name, systematic name and
            '|'-separated aliases.
        systematic_xrdb: optional cross reference database name under
            which the systematic name is also recorded.
        gi_tax_id: optional taxonomy id to match in the file's first
            column when it differs from `taxonomy_id`.

    For every line whose first column equals the effective taxonomy id,
    the gene with that Entrez id is created or updated and its cross
    references are added.  Genes of the organism that are in the database
    but absent from the file are marked obsolete.

    Exits the process with status 1 when fewer than ten lines matched the
    organism.  When `taxonomy_id` or `geneinfo` is missing an error is
    logged and nothing is loaded.
    """
    # Load the organism.
    tax_id = options.get('taxonomy_id')
    org = Organism.objects.get(taxonomy_id=tax_id)

    # geneinfo file information.
    geneinfo_filename = options.get('geneinfo')
    symb_col = int(options.get('symbol_col'))
    syst_col = int(options.get('systematic_col'))
    alias_col = int(options.get('alias_col'))
    systematic_xrdb = options.get('systematic_xrdb')

    # Without both an organism id and an input file there is nothing to
    # load.  Checking the filename here (rather than a handle that may
    # never have been opened) keeps this path from raising NameError.
    if not (tax_id and geneinfo_filename):
        logger.error('Couldn\'t load geneinfo %s for org %s.',
                     options.get('geneinfo'), tax_id,
                     exc_info=sys.exc_info(), extra={'options': options})
        return

    # yeast has a taxonomy_id that changed, in this case when we look at
    # the id from NCBI we have to use the new one.
    gi_tax_id = tax_id
    if options.get('gi_tax_id'):
        gi_tax_id = options.get('gi_tax_id')

    # Get all genes for this organism from the database.
    entrez_in_db = set(
        Gene.objects.filter(organism=org).values_list('entrezid', flat=True))

    # Get all cross reference triples that refer to a gene from this
    # organism.  Triples created below are added too, so a repeated input
    # row cannot insert the same cross reference twice.
    xr_in_db = set()
    for x in CrossRef.objects.filter(
            gene__entrezid__in=entrez_in_db).prefetch_related(
            'crossrefdb', 'gene'):
        xr_in_db.add((x.crossrefdb.name, x.xrid, x.gene.entrezid))

    # Store all the genes seen thus far so we can remove obsolete entries.
    entrez_seen = set()

    # Cache of cross reference databases, which saves hits to DB.  A value
    # of None records that the name is unknown.
    xrdb_cache = {}

    # Count of lines that came from this organism, checked at the end.
    org_matches = 0
    entrez_found = 0  # Found from before.
    entrez_updated = 0  # Found from before and updated.
    entrez_created = 0  # Didn't exist, added.

    with open(geneinfo_filename) as geneinfo_fh:
        for line in geneinfo_fh:
            toks = line.strip().split('\t')
            if toks[symb_col] == "NEWENTRY":
                logger.info("NEWENTRY line skipped")
                continue
            if not (toks[0] == gi_tax_id):
                # From wrong organism, skip.
                continue

            org_matches += 1  # Count lines that came from this organism.

            # Grab requested fields from tab delimited file.
            entrezid = int(toks[1])
            standard_name = toks[symb_col]
            systematic_name = toks[syst_col]
            aliases = toks[alias_col]
            crossrefs = toks[5]
            description = toks[8]
            status = toks[9]
            chromosome = toks[6]

            # This column only gets filled in for certain organisms.
            if (not systematic_name) or (systematic_name == '-'):
                systematic_name = standard_name

            # Gene is actually mitochondrial, change symbol to avoid
            # duplicates (analogous to what GeneCards does).
            if chromosome == "MT":
                if not systematic_name.startswith('MT'):
                    logger.debug(
                        "Renaming %s to %s, mitochondrial version",
                        systematic_name, "MT-" + systematic_name)
                    systematic_name = "MT-" + systematic_name

            alias_str = ""
            alias_num = 0
            if aliases and (aliases != '-'):
                alias_list = [unicode(x) for x in aliases.split('|')]
                alias_num = len(alias_list)
                alias_str = ' '.join(alias_list)

            # Handle cross references.  The systematic name is only
            # recorded as a cross reference when the line has a
            # cross reference column of its own.
            xref_tuples = []
            if crossrefs and (crossrefs != '-'):
                xref_tuples = set()
                if systematic_xrdb:
                    xref_tuples.add((unicode(systematic_xrdb),
                                     unicode(systematic_name)))
                for x in crossrefs.split('|'):
                    xref_tuples.add(tuple(unicode(x).split(':')))
            xref_num = len(xref_tuples)

            # Arbitrary weight for search results.
            # The principle of weighting is that we think people are more
            # likely to want a gene that occurs in more databases or has
            # more aliases b/c it is better-known.  This helps break
            # ordering ties where gene names are identical.
            weight = 2 * xref_num + alias_num

            # We also assume that people are much more likely to want
            # protein coding genes.  In the long term we could measure
            # actual selections and estimate weight per gene.
            if status == 'protein-coding':
                weight = weight * 2

            entrez_seen.add(entrezid)
            if entrezid in entrez_in_db:  # This existed already.
                logger.debug("Entrez %s existed already.", entrezid)
                entrez_found += 1
                gene_object = Gene.objects.get(entrezid=entrezid,
                                               organism=org)
                changed = False

                # Update characteristics that may have changed.
                for attr, value in (('systematic_name', systematic_name),
                                    ('standard_name', standard_name),
                                    ('description', description),
                                    ('aliases', alias_str),
                                    ('weight', weight)):
                    if getattr(gene_object, attr) != value:
                        setattr(gene_object, attr, value)
                        changed = True

                # If the gene was marked obsolete but occurs in the
                # gene_info file, then it's not obsolete.
                if gene_object.obsolete:
                    gene_object.obsolete = False
                    changed = True

                if changed:
                    entrez_updated += 1
                    # To save time, only call save() if something has
                    # been changed.
                    gene_object.save()
            else:  # New entrezid observed.
                logger.debug(
                    "Entrez %s did not exist and will be created.",
                    entrezid)
                # aliases is set here as well, matching the update path;
                # otherwise a new gene is stored without its aliases
                # until the next run "updates" it.
                gene_object = Gene(entrezid=entrezid, organism=org,
                                   systematic_name=systematic_name,
                                   standard_name=standard_name,
                                   description=description,
                                   aliases=alias_str,
                                   obsolete=False, weight=weight)
                gene_object.save()
                entrez_created += 1

            # Add crossreferences.
            for xref_tuple in xref_tuples:
                try:
                    xrdb = xrdb_cache[xref_tuple[0]]
                except KeyError:
                    try:
                        xrdb = CrossRefDB.objects.get(name=xref_tuple[0])
                    except CrossRefDB.DoesNotExist:
                        xrdb = None
                    xrdb_cache[xref_tuple[0]] = xrdb
                if xrdb is None:  # Don't understand crossrefdb, skip.
                    logger.warning(
                        "We encountered an xrdb (%s) not in our"
                        " database for pair %s.", xref_tuple[0], xref_tuple)
                    continue
                logger.debug('Found crossreference pair %s.', xref_tuple)

                # If the record doesn't exist in database, create it.
                xr_key = (xref_tuple[0], xref_tuple[1], entrezid)
                if xr_key not in xr_in_db:
                    xr_obj = CrossRef(crossrefdb=xrdb, xrid=xref_tuple[1],
                                      gene=gene_object)
                    xr_obj.save()
                    xr_in_db.add(xr_key)

    # Update "obsolete" attribute for entrez records that are in the
    # database but not in input file.
    # NOTE(review): this runs before the org_matches check below, so a
    # wrong organism id still marks genes obsolete before exiting --
    # confirm whether the check was meant to come first.
    for stale_entrezid in entrez_in_db - entrez_seen:
        gene_object = Gene.objects.get(entrezid=stale_entrezid,
                                       organism=org)
        if not gene_object.obsolete:
            gene_object.obsolete = True
            gene_object.save()

    logger.info(
        "%s entrez identifiers existed in the database and "
        "were found in the new gene_info file", entrez_found)
    logger.info(
        "%s entrez identifiers existed in the database and "
        "were changed in the new gene_info file", entrez_updated)
    logger.info(
        "%s entrez identifiers did not exist and were created "
        "in the new gene_info file", entrez_created)

    if org_matches < 10:
        logger.error('Less than ten matches were encountered for '
                     'this organism. Check the organism ID.')
        sys.exit(1)
def setUp(self):
    """Build the organisms, genes and cross references used as fixtures.

    Creates two cross reference databases, two fully named genes with
    four cross references between them, two extra organisms that share a
    gene name, and two genes that each lack one of the two name fields.
    """
    base_org = factory.create(Organism)

    db_asdf = CrossRefDB(name="ASDF", url="http://www.example.com")
    db_asdf.save()
    db_second = CrossRefDB(name="XRDB2", url="http://www.example.com/2")
    db_second.save()

    # The first two genes carry both a standard and a systematic name.
    gene_one = Gene(entrezid=1, systematic_name="g1", standard_name="G1",
                    description="asdf", organism=base_org,
                    aliases="gee1 GEE1")
    gene_one.save()
    gene_two = Gene(entrezid=2, systematic_name="g2", standard_name="G2",
                    description="asdf", organism=base_org,
                    aliases="gee2 GEE2")
    gene_two.save()

    # (database, gene, xrid) for each cross reference, in creation order.
    xref_specs = (
        (db_asdf, gene_one, "XRID1"),
        (db_second, gene_two, "XRID1"),
        (db_asdf, gene_one, "XRRID1"),
        (db_asdf, gene_two, "XRID2"),
    )
    for xrdb, gene, xrid in xref_specs:
        CrossRef(crossrefdb=xrdb, gene=gene, xrid=xrid).save()

    mouse_org = Organism(taxonomy_id=1234, common_name="Computer mouse",
                         scientific_name="Mus computurus",
                         slug="mus-computurus")
    mouse_org.save()
    screen_org = Organism(taxonomy_id=4321, common_name="Computer screen",
                          scientific_name="Monitorus computurus",
                          slug="monitorus-computurus")
    screen_org.save()

    extra_genes = (
        # Same systematic and standard name, different organisms.
        # Entrezid 3 is skipped since that is used by other tests.
        dict(entrezid=4, systematic_name="acdc", standard_name="ACDC",
             description="asdf", organism=mouse_org, aliases="gee4 GEE4"),
        dict(entrezid=5, systematic_name="acdc", standard_name="ACDC",
             description="asdf", organism=screen_org, aliases="gee5 GEE5"),
        # Standard name, but no systematic name.
        dict(entrezid=101, standard_name="std_101", organism=mouse_org),
        # Systematic name, but no standard name.
        dict(entrezid=102, systematic_name="sys_102", organism=mouse_org),
    )
    for gene_fields in extra_genes:
        Gene(**gene_fields).save()
def handle(self, *args, **options):
    """Load UniProtKB cross references from a three-column mapping file.

    Options read:
        uniprot: path of a whitespace-delimited file whose lines are
            `<uniprot id> <id type> <identifier>`.

    Lines of type "GeneID" link a UniProt id to the gene with that Entrez
    id; lines of type "Ensembl" link it through an existing Ensembl cross
    reference.  Entrez links are applied first, and an Ensembl link is
    only used for UniProt ids that still have no cross reference.  Lines
    with fewer than three fields are skipped.

    When the `uniprot` option is missing an error is logged and nothing
    is loaded.
    """
    uniprot_path = options.get('uniprot')
    if not uniprot_path:
        logger.error("Couldn\'t load uniprot %s", options.get('uniprot'),
                     exc_info=sys.exc_info(), extra={'options': options})
        return

    uniprot = CrossRefDB.objects.get(name='UniProtKB')
    ensembl = CrossRefDB.objects.get(name='Ensembl')

    # Identifiers already in the database; mappings to anything else are
    # ignored.
    entrez_set = set(Gene.objects.all().values_list('entrezid', flat=True))
    ensembl_set = set(
        CrossRef.objects.filter(crossrefdb=ensembl).values_list(
            'xrid', flat=True))

    uniprot_entrez = {}
    uniprot_ensembl = {}
    # The file is only needed while parsing; `with` closes it even if a
    # line fails to parse.
    with open(uniprot_path) as uniprot_file:
        for line in uniprot_file:
            # Split at most twice so an identifier containing whitespace
            # stays in one piece instead of breaking the unpacking.
            fields = line.strip().split(None, 2)
            if len(fields) != 3:
                logger.debug("Skipping malformed uniprot line %r.", line)
                continue
            uniprot_id, id_type, identifier = fields

            if id_type == "GeneID":  # 'GeneID' is a mapping for entrez
                entrez_id = int(identifier)
                if entrez_id in entrez_set:
                    uniprot_entrez[uniprot_id] = entrez_id
            elif id_type == "Ensembl":
                if identifier in ensembl_set:
                    uniprot_ensembl[uniprot_id] = identifier

    for uniprot_id, entrez_id in uniprot_entrez.items():
        gene = Gene.objects.get(entrezid=entrez_id)
        try:
            uniprot_xr = CrossRef.objects.get(crossrefdb=uniprot,
                                              xrid=uniprot_id)
            uniprot_xr.gene = gene
        except CrossRef.DoesNotExist:
            uniprot_xr = CrossRef(crossrefdb=uniprot, xrid=uniprot_id,
                                  gene=gene)
        uniprot_xr.save()

    # Re-read after the Entrez pass so ids added above are included.
    uniprot_set = set(
        CrossRef.objects.filter(crossrefdb=uniprot).values_list(
            'xrid', flat=True))

    for uniprot_id, ensembl_id in uniprot_ensembl.items():
        if uniprot_id in uniprot_set:
            # If there is already a UniProt xref with this id in the
            # database it means that it was already added using Entrez.
            # We are only interested in uniprot_ids that haven't been
            # added.
            continue
        ensembl_xr = CrossRef.objects.filter(crossrefdb=ensembl,
                                             xrid=ensembl_id)[0]
        uniprot_xr = CrossRef(crossrefdb=uniprot, xrid=uniprot_id,
                              gene=ensembl_xr.gene)
        uniprot_xr.save()