import datetime
import gzip
import json

import pubmed_parser as pp
from six import string_types

# `num` and `unifylistattribute` are project helper functions assumed to be
# in scope: `num` coerces a dict field to a number (optionally with a target
# type) and returns it; `unifylistattribute` normalizes XML attributes that
# may have been parsed as either a single element or a list.


def pubmed_parser(path_xml):
    ar = pp.parse_pubmed_xml(path_xml)
    if not isinstance(path_xml, str):
        # rewind file-like objects before the second parsing pass
        path_xml.seek(0)
    paragraph_dicts = pp.parse_pubmed_paragraph(path_xml)
    paragraphs = []
    for p in paragraph_dicts:
        del p['pmc']
        del p['pmid']
        paragraphs.append(p)
    ar['paragraphs'] = paragraphs
    num(ar, 'publication_year')
    try:
        ar['publication_date'] = datetime.datetime.strptime(
            ar['publication_date'], "%d-%m-%Y")
    except ValueError:
        try:
            print(ar['publication_date'])
            # assume the error is in the 'day' field and retry with
            # the first day of the month
            ar['publication_date'] = datetime.datetime.strptime(
                "01" + ar['publication_date'][2:], "%d-%m-%Y")
        except ValueError:
            # a workaround, until we have a more robust date parser
            ar['publication_date'] = datetime.datetime(2000, 1, 1)
    return ar

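# Example usage, a minimal sketch (the file name is hypothetical):
# `pp.parse_pubmed_xml` returns a dict with keys such as 'full_title',
# 'publication_date', and 'pmid'; this wrapper adds the 'paragraphs' list.
#
#   article = pubmed_parser("PMC1234567.nxml")
#   print(article['full_title'])
#   print(article['publication_date'], len(article['paragraphs']))
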
def update_entry(self, entry):
    # ClinVar Variation Archive entry
    if 'InterpretedRecord' in entry:
        ir = entry['InterpretedRecord']
        if 'SimpleAllele' in ir:
            self.update_simpleallele(ir['SimpleAllele'])
        if 'RCVList' in ir:
            unifylistattribute(ir, "RCVList", "RCVAccession",
                               renamelistto='rcv')
            for i, rcvacc in enumerate(ir['rcv']):
                if isinstance(rcvacc, string_types):
                    # wrap bare strings so list entries are uniformly dicts
                    ir['rcv'][i] = {'#text': rcvacc}
                else:
                    unifylistattribute(rcvacc, "InterpretedConditionList",
                                       "InterpretedCondition",
                                       renamelistto='interpretedCondition')
                    for j, ic in enumerate(rcvacc['interpretedCondition']):
                        if isinstance(ic, string_types):
                            rcvacc['interpretedCondition'][j] = {'#text': ic}
        unifylistattribute(ir, "ClinicalAssertionList", "ClinicalAssertion",
                           renamelistto='clinicalAssertion')
        for ca in ir['clinicalAssertion']:
            num(ca, 'ID')
            if "Interpretation" in ca:
                self.update_date(ca)
                self.update_comment(ca['Interpretation'])
            unifylistattribute(ca, "ObservedInList", "ObservedIn",
                               renamelistto='observedIn')
            if 'SimpleAllele' in ca:
                self.update_simpleallele(ca['SimpleAllele'])
            if 'Genotype' in ca:
                self.update_genotype_haplotype(ca['Genotype'])
            if 'Haplotype' in ca:
                self.update_genotype_haplotype(ca['Haplotype'])
            if 'TraitSet' in ca:
                self.update_comment(ca['TraitSet'])
            self.update_comment(ca)
            for o in ca['observedIn']:
                self.update_comment(o)
                if not isinstance(o['Sample']['Species'], string_types):
                    o['Sample']['Species'] = o['Sample']['Species']['#text']
                if 'TraitSet' in o:
                    self.update_comment(o['TraitSet'])
                if 'ObservedData' in o:
                    self.update_comment(o['ObservedData'])
                if 'Method' in o and 'ObsMethodAttribute' in o['Method']:
                    self.update_comment(o['Method']['ObsMethodAttribute'])

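# Illustration of the list normalization above (the behavior sketched here is
# inferred from how `unifylistattribute` is called, and the accession value is
# illustrative): an XML-to-dict conversion yields either a single element or a
# list; after the call the attribute is always a list under the new name.
#
#   ir = {'RCVList': {'RCVAccession': {'@Accession': 'RCV000000012'}}}
#   unifylistattribute(ir, "RCVList", "RCVAccession", renamelistto='rcv')
#   # ir == {'rcv': [{'@Accession': 'RCV000000012'}]}
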
def update_entry(self, entry):
    del entry['id']
    num(entry, "protein_count", int)
    if 'abstract' in entry:
        # store the nested abstract structure as a JSON string
        entry['abstract'] = json.dumps(entry['abstract'])
    if 'taxonomy_distribution' in entry:
        unifylistattribute(entry, 'taxonomy_distribution', 'taxon_data')
        for i in entry['taxonomy_distribution']:
            num(i, "proteins_count", int)

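# A before/after sketch of the transformation above (field values are
# illustrative, the field names are the ones handled by the method):
#
#   entry = {'id': 'X0001', 'protein_count': '42',
#            'abstract': {'p': 'Example abstract text'}}
#   # after update_entry:  no 'id' field, entry['protein_count'] == 42,
#   # entry['abstract'] == '{"p": "Example abstract text"}'
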
def read_and_index_articles_file(self, infile_):
    infile = str(infile_)
    print("Reading %s " % infile)
    if infile.endswith(".xml.gz"):
        f = gzip.open(infile, 'rb')
    elif infile.endswith(".xml"):
        f = open(infile, 'rb')
    else:
        print("Ignoring '%s': filename does not end with"
              " '.xml' or '.xml.gz'" % infile)
        return
    articles = pp.parse_medline_xml(f)
    listattrs = ['authors', 'mesh_terms', 'publication_types',
                 'chemical_list', 'keywords', 'references', 'affiliations']
    ids = set()
    deletedrecords, deletedpmids = list(), list()
    for i, ar in enumerate(articles):
        if ar['delete']:
            # DeleteCitation entries at the end of the xml archive files
            # are parsed to an object with field values set to float NaN
            deletedrecords.append(i)
            deletedpmids.append(ar['pmid'])
            continue
        try:
            num(ar, 'pmc')
        except ValueError:
            # fall back to a placeholder value when the PMC id is missing
            ar['pmc'] = 2000
        ar['_id'] = num(ar, 'pmid')
        ids.add(ar['_id'])
        try:
            ar['pubdate'] = datetime.datetime(int(ar['pubdate']), 1, 1)
        except ValueError:
            print(ar['pubdate'])
            ar['pubdate'] = datetime.datetime(2000, 1, 1)
        for listattr in listattrs:
            if len(ar[listattr]) == 0:
                del ar[listattr]
            else:
                spr = ';' if listattr in ['authors', 'references'] else '; '
                ar[listattr] = ar[listattr].split(spr)
    # delete from the end so earlier indices stay valid
    for i in reversed(deletedrecords):
        del articles[i]
    self.qry.deletepubmedids(deletedpmids)
    if self.db == "Elasticsearch":
        if not self.qry.checkpubmedidsindexed(list(ids)):
            self.es_index(articles)
        else:
            print("Records in %s seem to have been indexed, skipping"
                  % infile)
    else:  # assume MongoDB
        self.mdb_index(articles)

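# Example usage, a minimal sketch: index one MEDLINE/PubMed archive file.
# The file name is hypothetical, and `indexer` stands for an instance of the
# class this method belongs to, already connected to Elasticsearch or MongoDB.
#
#   indexer.read_and_index_articles_file("pubmed24n0001.xml.gz")
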
def index_article(dbc, ar):
    num(ar, 'pmid')
    pmcid = num(ar, 'pmc')
    try:
        if dbc.db == "Elasticsearch":
            dbc.es.index(index=dbc.index, id=pmcid, body=ar)
        else:  # MongoDB
            # Collection.update() was removed in PyMongo 4; replace_one()
            # preserves the original upsert-and-replace behavior
            spec = {"_id": pmcid}
            dbc.mdbi[dbc.mdbcollection].replace_one(spec, ar, upsert=True)
    except Exception as e:
        print("error: %s" % e)
    # drop the local reference to the (possibly large) article dict
    del ar

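# Example call, a minimal sketch: `dbc` is assumed to be the project's
# database-connection object carrying the attributes used above
# (`db`, `es`, `index`, `mdbi`, `mdbcollection`); the article dict and its
# id values are illustrative.
#
#   ar = {'pmid': '31211111', 'pmc': '6351111', 'title': 'Example title'}
#   index_article(dbc, ar)
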
def update_simpleallele(self, sa):
    self.update_synonyms(sa)
    self.update_comment(sa)
    unifylistattribute(sa, "MolecularConsequenceList",
                       "MolecularConsequence",
                       renamelistto='molecularConsequence')
    if 'molecularConsequence' in sa:
        self.update_comment(sa['molecularConsequence'])
    if 'FunctionalConsequence' in sa:
        self.update_comment(sa['FunctionalConsequence'])
    num(sa, 'AlleleID')
    num(sa, 'VariationID')