def test_force_update_pmid_exists(self):
    """
    force_update should cause existing data to be overwritten.
    """
    seed = {'pmid': 1, 'title': 'ASDF'}
    existing = factory.create(Publication, seed)
    existing.save()
    self.assertEqual(existing.title, seed['title'])

    load_pmids(['1'], force_update=True)

    # First Pub: every field should now hold the real PubMed record.
    refreshed = Publication.objects.get(pmid=1)
    expected_fields = (
        ('title',
         'Formate assay in body fluids: application in methanol poisoning.'),
        ('date', date(1975, 6, 1)),
        ('authors', 'Makar AB, McMartin KE, Palese M, Tephly TR'),
        ('journal', 'Biochem Med'),
        ('volume', '13'),
        ('issue', '2'),
        ('pages', '117-26'),
    )
    for attr, value in expected_fields:
        self.assertEqual(getattr(refreshed, attr), value)
    # make sure primary key doesn't change
    self.assertEqual(refreshed.id, existing.id)
def test_get_multiple_pmid(self):
    """
    load_pmids should be able to get multiple publications' information
    successfully from pubmed's efetch
    """
    load_pmids(['1', '2'])

    # Expected PubMed records keyed by pmid; checked field-by-field below.
    expected = (
        (1, (
            ('title',
             'Formate assay in body fluids: application in methanol poisoning.'),
            ('date', date(1975, 6, 1)),
            ('authors', 'Makar AB, McMartin KE, Palese M, Tephly TR'),
            ('journal', 'Biochem Med'),
            ('volume', '13'),
            ('issue', '2'),
            ('pages', '117-26'),
        )),
        (2, (
            ('title',
             'Delineation of the intimate details of the backbone conformation of pyridine nucleotide coenzymes in aqueous solution.'),
            ('date', date(1975, 10, 27)),
            ('authors', 'Bose KS, Sarma RH'),
            ('journal', 'Biochem Biophys Res Commun'),
            ('volume', '66'),
            ('issue', '4'),
            ('pages', '1173-9'),
        )),
    )
    for pmid, fields in expected:
        pub = Publication.objects.get(pmid=pmid)
        for attr, value in fields:
            self.assertEqual(getattr(pub, attr), value)
def test_can_load_when_issue_volume_pages_are_null(self):
    """
    Some PMIDs lack an issue (e.g. 9371713), volume, or pages. This test
    makes sure that these can still be loaded.
    """
    pmid_without_issue = 9371713
    load_pmids([str(pmid_without_issue)])
    loaded = Publication.objects.get(pmid=pmid_without_issue)
    self.assertIsNone(loaded.issue)
def test_error_pub_id(self):
    """
    A publication that does not exist should emit a warning and nothing
    should be created.
    """
    bogus_pmid = 2764472319
    load_pmids([str(bogus_pmid)])
    # Nothing should have been stored for the nonexistent pmid.
    with self.assertRaises(Publication.DoesNotExist):
        Publication.objects.get(pmid=bogus_pmid)
def test_load_pub_already_exists(self):
    """
    Loading a publication that already exists should do nothing when
    force_update is not passed.
    """
    initial = {
        'pmid': 1,
        'title': 'ASDF',
    }
    pub = factory.create(Publication, initial)
    pub.save()
    self.assertEqual(pub.title, initial['title'])
    load_pmids([
        '1',
    ])
    # Re-fetch the row: the original assertion checked the stale in-memory
    # `pub` object, which could never observe a database overwrite, so the
    # test passed even if load_pmids incorrectly clobbered the record.
    pub = Publication.objects.get(pmid=1)
    self.assertEqual(pub.title, initial['title'])
def test_get_single_pmid(self):
    """
    load_pmids should be able to get a single publication's information
    successfully from pubmed's efetch
    """
    load_pmids(['1'])
    pub = Publication.objects.get(pmid=1)
    # Field-by-field comparison against the known PubMed record for pmid 1.
    checks = (
        ('title',
         'Formate assay in body fluids: application in methanol poisoning.'),
        ('date', date(1975, 6, 1)),
        ('authors', 'Makar AB, McMartin KE, Palese M, Tephly TR'),
        ('journal', 'Biochem Med'),
        ('volume', '13'),
        ('issue', '2'),
        ('pages', '117-26'),
    )
    for attr, value in checks:
        self.assertEqual(getattr(pub, attr), value)
def test_force_update_pmid_doesnt_exist(self):
    """
    Passing force_update for a pmid that is not yet in the database should
    still load the publication normally.
    """
    load_pmids(['1'], force_update=True)

    # First Pub
    pub1 = Publication.objects.get(pmid=1)
    checks = (
        ('title',
         'Formate assay in body fluids: application in methanol poisoning.'),
        ('date', date(1975, 6, 1)),
        ('authors', 'Makar AB, McMartin KE, Palese M, Tephly TR'),
        ('journal', 'Biochem Med'),
        ('volume', '13'),
        ('issue', '2'),
        ('pages', '117-26'),
    )
    for attr, value in checks:
        self.assertEqual(getattr(pub1, attr), value)
def format_annotations(self, annots, xrdb, full_pubs, organism=None):
    """
    Resolve a mapping of gene identifiers to publications into database
    (gene_pk, publication_pk) pairs.

    xrdb is the type of gene identifier that the annotations are sent as.
    When full_pubs is true, annots values are publication dicts carrying an
    'id'; otherwise they are raw PubMed IDs that may need to be loaded.

    Returns a 4-tuple: (frozenset of (gene_pk, pub_pk-or-None) pairs,
    identifiers with no matching gene, PubMed IDs that could not be loaded,
    identifiers matching multiple genes).
    """
    formatted_for_db_annotations = set()
    genes_not_found = set()
    multiple_genes_found = set()
    pubs_not_loaded = set()
    per_gene_pubs = {}

    # Optionally restrict gene lookups to a single organism.
    if organism is not None:
        gene_objects_manager = Gene.objects.filter(
            organism__scientific_name=organism)
    else:
        gene_objects_manager = Gene.objects

    for identifier in annots:
        # Validate each annotation and fetch the actual gene/publication
        # objects.
        try:
            if xrdb is None:
                gene_obj = gene_objects_manager.get(id=identifier)
            elif xrdb == 'Entrez':
                gene_obj = gene_objects_manager.get(entrezid=identifier)
            elif xrdb == 'Symbol':
                # Fall back to the systematic name when the standard name
                # does not match.
                try:
                    gene_obj = gene_objects_manager.get(
                        standard_name=identifier)
                except Gene.DoesNotExist:
                    gene_obj = gene_objects_manager.get(
                        systematic_name=identifier)
            else:
                xref_obj = CrossRef.objects.filter(
                    crossrefdb__name=xrdb).get(xrid=identifier)
                gene_obj = xref_obj.gene

            resolved_pubs = set()
            for publication in annots[identifier]:
                if full_pubs:
                    # The full publication database objects were sent
                    resolved_pubs.add(publication['id'])
                    continue
                # Only the pubmed IDs were sent
                pubmed_id = publication
                try:
                    # Check to see if publication is in the database
                    pub_obj = Publication.objects.get(pmid=pubmed_id)
                except Publication.DoesNotExist:
                    # If it doesn't exist in the database, load it
                    logger.info(
                        "Pubmed ID %s did not exist in the "
                        "database. Loading it now.", pubmed_id)
                    load_pmids([pubmed_id])
                    try:
                        # Try again to see if publication is now in
                        # the database
                        pub_obj = Publication.objects.get(pmid=pubmed_id)
                    except Publication.DoesNotExist:
                        # Pubmed id that was passed probably does not
                        # exist
                        logger.warning(
                            "Pubmed ID %s could not be "
                            "loaded from Pubmed server. "
                            "Saving it in version as None.", pubmed_id)
                        pubs_not_loaded.add(pubmed_id)
                        pub_obj = None
                if pub_obj:
                    resolved_pubs.add(pub_obj.id)
            per_gene_pubs[gene_obj.pk] = resolved_pubs
        except (Gene.DoesNotExist, CrossRef.DoesNotExist):
            genes_not_found.add(identifier)
        except Gene.MultipleObjectsReturned:
            multiple_genes_found.add(identifier)

    # Flatten the per-gene publication sets into (gene, pub) tuples; a gene
    # with no publications is recorded once with None.
    for gene_pk, pub_ids in per_gene_pubs.items():
        if pub_ids:
            for pub_id in pub_ids:
                formatted_for_db_annotations.add((gene_pk, pub_id))
        else:
            formatted_for_db_annotations.add((gene_pk, None))

    formatted_for_db_annotations = frozenset(formatted_for_db_annotations)

    return (formatted_for_db_annotations, genes_not_found, pubs_not_loaded,
            multiple_genes_found)
def handle(self, *args, **options):
    """
    Load GO annotations for one organism and materialize them as Genesets
    with Versions.

    Steps: resolve the --user and --organism options, load the ontology
    OBO (remote or local), parse the gzipped GO association file into
    (xrdb, xrid, goid, ref, date) annotation tuples, map identifiers to
    Gene objects and PMIDs to Publication ids, propagate annotations
    through the ontology, and finally create/update one Geneset (plus
    Version history) per annotated GO term.

    Exits the process via sys.exit() on unrecoverable option errors.
    """
    user_name = options.get('user')
    user = None
    try:
        user = User.objects.get(username=user_name)
    except User.DoesNotExist:
        logger.error('The user %s did not exist.', user_name,
                     extra={'options': options})
        sys.exit()

    org = None
    try:
        org = Organism.objects.get(scientific_name=options.get('organism'))
    except Organism.DoesNotExist:
        logger.error('The organism %s did not exist.',
                     options.get('organism'), extra={'options': options})
        sys.exit()

    # Evidence-code filter is optional; None means "accept everything".
    accepted_evcodes = None
    if options.get('evcodes'):
        accepted_evcodes = set(options.get('evcodes').split(','))

    gene_ontology = go()
    # BUGFIX: was `!= None`; identity comparison is the correct idiom.
    remote = options.get('remote') is not None
    obo_location = GO_OBO_URL if remote else options.get('obo')
    loaded_obo = gene_ontology.load_obo(obo_location,
                                        remote_location=remote,
                                        timeout=5)
    if not loaded_obo:
        logger.error("Couldn't load OBO file %s with remote equal to %s.",
                     obo_location, remote)
        sys.exit()

    # The association file is gzipped whether fetched over FTP or local.
    annot_zip_fh = None
    annot_fh = None
    if remote:
        annot_zip_fh = urllib2.urlopen(GO_ASSOC_FTP + '.'.join(
            (GO_ASSOC_PREFIX, GO_NAMES[org.scientific_name],
             GO_ASSOC_SUFFIX)), timeout=5)
    else:
        annot_zip_fh = open(options.get('annot'))
    annot_fh = gzip.GzipFile(fileobj=io.BytesIO(annot_zip_fh.read()))
    annot_zip_fh.close()

    annots = []
    load_pairs = {}  # xrdb name -> list of xrids to resolve in bulk
    pubs = set()     # PMIDs referenced anywhere in the file
    for line in annot_fh:
        if line.startswith('!'):  # GAF comment lines
            continue
        toks = line.strip().split('\t')
        (xrdb, xrid, details, goid, ref, ev,
         date) = (toks[0], toks[1], toks[3], toks[4], toks[5], toks[6],
                  toks[13])
        if options.get('tair'):
            # TAIR files carry the AGI locus code in one of several
            # columns; prefer object name, then gene name, then alias.
            import re
            tair_regex = re.compile('AT[0-9MC]G[0-9][0-9][0-9][0-9][0-9]')
            first_alias = toks[10].split('|')[0]
            if tair_regex.match(toks[2]):
                xrid = toks[2]
            elif tair_regex.match(toks[9]):
                xrid = toks[9]
            elif tair_regex.match(first_alias):
                xrid = first_alias
        if options.get('only_wb') and (toks[0] != 'WB'):
            continue
        if details == 'NOT':  # negated annotations are skipped
            continue
        if accepted_evcodes is not None and not (ev in accepted_evcodes):
            continue
        if options.get('leading') is not None:
            xrid = xrid.split(':')[1]
        try:
            load_pairs[xrdb].append(xrid)
        except KeyError:
            load_pairs[xrdb] = [
                xrid,
            ]
        refs = ref.split('|')
        for ref_item in refs:
            if ref_item.startswith('PMID:'):
                pubs.add(ref_item.split(':')[1])
            else:
                logger.info("Unknown publication key %s", ref_item)
        annots.append((xrdb, xrid, goid, ref, date))

    # Map (xrdb, xrid) -> Gene in bulk to avoid per-annotation queries.
    xref_cache = {}
    if options.get('pseudomonas'):
        logger.info('Pseudomonas entered')
        for (xrdb, xrids) in load_pairs.iteritems():
            gene_objs = Gene.objects.filter(systematic_name__in=xrids)
            logger.info(
                "Mapped %s Pseudomonas genes from the database using gene "
                "systematic name.", gene_objs.count())
            for gene_obj in gene_objs:
                xref_cache[(xrdb, gene_obj.systematic_name)] = gene_obj
    else:
        for (xrdb, xrids) in load_pairs.iteritems():
            if xrdb in DB_REMAP:
                xrdb = DB_REMAP[xrdb]
            try:
                xrdb_obj = CrossRefDB.objects.get(name=xrdb)
            except CrossRefDB.DoesNotExist:
                logger.warning("Couldn't find the cross reference DB %s.",
                               xrdb)
                continue
            xrid_objs = CrossRef.objects.filter(
                crossrefdb=xrdb_obj).filter(xrid__in=xrids)
            logger.info("Mapped %s cross references from %s",
                        xrid_objs.count(), xrdb)
            for xrid_obj in xrid_objs:
                xref_cache[(xrdb, xrid_obj.xrid)] = xrid_obj.gene

    load_pmids(pubs)
    pub_cache = {}
    pub_values = Publication.objects.filter(pmid__in=pubs).only(
        'id', 'pmid').values()
    for pub in pub_values:
        pub_cache[pub['pmid']] = pub['id']

    for annot in annots:
        (xrdb, xrid, goid, ref, date) = annot
        if xrdb in DB_REMAP:
            xrdb = DB_REMAP[xrdb]
        try:
            gene = xref_cache[(xrdb, xrid)]
        except KeyError:
            logger.debug("Couldn't find xrid %s in xrdb %s.", xrid, xrdb)
            logger.info("Couldn't find xrid %s in xrdb %s.", xrid, xrdb)
            continue
        refs = ref.split('|')
        pub = None
        for ref_item in refs:
            if ref_item.startswith('PMID:'):
                try:
                    pub = pub_cache[int(ref_item.split(':')[1])]
                except KeyError:
                    pub = None
        gene_ontology.add_annotation(go_id=goid,
                                     gid=gene.pk,
                                     ref=pub,
                                     date=date,
                                     direct=True)

    gene_ontology.populated = True  # mark annotated
    gene_ontology.propagate()  # prop annotations

    # BUGFIX: list(None) raised TypeError when --evcodes was not passed.
    evlist = list(accepted_evcodes) if accepted_evcodes is not None else []
    for (term_id, term) in gene_ontology.go_terms.iteritems():
        if term.annotations:
            # make first 50 chars into a slug
            slug = slugify(' '.join(
                (term.go_id, org.scientific_name, term.full_name)))[:50]
            namespace = GO_NAMESPACE_MAP[term.get_namespace()]
            go_id = term.go_id.split(':')[1]
            # construct title
            title = 'GO' + '-' + namespace + '-' + go_id + ':' + \
                term.full_name

            # construct abstract; write evidence as string
            evclause = ''
            if len(evlist):
                evclause = ' Only annotations with evidence coded as '
                if len(evlist) == 1:
                    evclause = evclause + evlist[0]
                else:
                    evclause = evclause + ', '.join(
                        evlist[:-1]) + ' or ' + evlist[-1]
                evclause = evclause + ' are included.'

            # BUGFIX: `description` was only assigned when the term had a
            # description, so terms without one silently reused the value
            # from a previous loop iteration (or raised NameError on the
            # first such term). Reset it every iteration.
            description = ''
            if term.description:
                description = term.description + \
                    ' Annotations are propagated through transitive ' \
                    'closure as recommended by the GO Consortium.' + evclause
            else:
                logger.info("No description on term %s", term)

            # get geneset
            changed = False
            try:
                gs_obj = Geneset.objects.get(slug=slug, creator=user)
                changed = False  # flag to know if we need to call save
                # all these genesets should be public
                if not gs_obj.public:
                    gs_obj.public = True
                    changed = True
                if gs_obj.title != title:
                    gs_obj.title = title
                    changed = True
                if gs_obj.abstract != description:
                    gs_obj.abstract = description
                    changed = True
            except Geneset.DoesNotExist:
                gs_obj = Geneset(title=title,
                                 slug=slug,
                                 creator=user,
                                 organism=org,
                                 public=True,
                                 abstract=description)
                changed = True
            # if anything changed
            if changed:
                gs_obj.save()

            if options.get('initial'):
                # disable commit field's auto_now_add, allows us to set a
                # prior annotation date
                commit_date = Version._meta.get_field_by_name(
                    'commit_date')[0]
                commit_date.auto_now_add = False
                logger.info(
                    'Initial load. Need to construct versions of %s from '
                    'annotation date.', term.go_id)
                date_annots = {}
                for annotation in term.annotations:
                    date = timezone.make_aware(
                        datetime.strptime(annotation.date, '%Y%m%d'),
                        timezone.get_default_timezone())
                    try:
                        date_annots[date].append(annotation)
                    except KeyError:
                        date_annots[date] = [
                            annotation,
                        ]
                # Replay annotations in date order, committing a Version
                # each time the cumulative annotation set changes.
                annots_as_of_date = set()
                prior_annots = set()
                prior_version = None
                for (date, annots) in sorted(date_annots.iteritems()):
                    annots_as_of_date.update([(annotation.gid,
                                               annotation.ref)
                                              for annotation in annots])
                    if (annots_as_of_date == prior_annots):
                        # if nothing changed, continue
                        continue
                    v_obj = Version(geneset=gs_obj,
                                    creator=user,
                                    parent=prior_version,
                                    commit_date=date)
                    v_obj.description = "Added " + str(
                        len(annots)
                    ) + " annotations from GO based on the dates provided" \
                        " in the GO annotation file."
                    v_obj.annotations = annots_as_of_date
                    v_obj.save()
                    prior_version = v_obj
                    prior_annots = annots_as_of_date.copy()
                # re-enable auto_now_add
                commit_date.auto_now_add = True
            else:
                # load annotations against the latest existing Version
                most_recent_versions = Version.objects.filter(
                    geneset=gs_obj).order_by('-commit_date')[:1]
                annots = set([(annotation.gid, annotation.ref)
                              for annotation in term.annotations])
                description = ''
                most_recent_version = None
                if most_recent_versions:
                    most_recent_version = most_recent_versions[0]
                    if (most_recent_version.commit_date > timezone.now()):
                        logger.error('Version from the future: %s.',
                                     most_recent_version)
                    new = annots - most_recent_version.annotations
                    removed = most_recent_version.annotations - annots
                    if (new or removed):
                        description = description + 'Added ' + str(
                            len(new)) + ' and removed ' + str(
                                len(removed)) + ' annotations from GO.'
                else:
                    description = 'Created with ' + str(
                        len(annots)) + ' annotations from GO.'
                # Only commit a new Version when something changed.
                if description:
                    v_obj = Version(geneset=gs_obj,
                                    creator=user,
                                    parent=most_recent_version,
                                    commit_date=timezone.now())
                    v_obj.description = description
                    v_obj.annotations = annots
                    v_obj.save()