def pubmed_parser(path_xml):
    """Parse a PubMed article XML into a metadata dict.

    :param path_xml: path to the XML file, or an open file-like object
    :return: article dict with a 'paragraphs' list added and
             'publication_date' converted to a datetime
    """
    article = pp.parse_pubmed_xml(path_xml)
    # A file-like object was consumed by the first parse; rewind it
    # before the paragraph pass.
    if not isinstance(path_xml, str):
        path_xml.seek(0)
    # Drop the redundant identifiers from every paragraph record.
    paragraphs = []
    for para in pp.parse_pubmed_paragraph(path_xml):
        del para['pmc']
        del para['pmid']
        paragraphs.append(para)
    article['paragraphs'] = paragraphs
    num(article, 'publication_year')
    try:
        article['publication_date'] = datetime.datetime.strptime(
            article['publication_date'], "%d-%m-%Y")
    except ValueError:
        try:
            print(article['publication_date'])
            # assume error in 'day' and retry with the first day of the month
            article['publication_date'] = datetime.datetime.strptime(
                "01" + article['publication_date'][2:], "%d-%m-%Y")
        except ValueError:
            # a workaround, until we have a robust parser
            article['publication_date'] = datetime.datetime(2000, 1, 1)
    return article
Exemple #2
0
    def update_entry(self, entry):  # ClinVar Variation Archive entry
        """Normalize a ClinVar VariationArchive record in place.

        Flattens the ``*List`` wrapper elements into plain lists, wraps
        bare string items as ``{'#text': ...}`` dicts so downstream code
        sees a uniform shape, and applies the comment/date/allele
        fix-up helpers throughout the record.
        """
        if 'InterpretedRecord' in entry:
            ir = entry['InterpretedRecord']
            if 'SimpleAllele' in ir:
                self.update_simpleallele(ir['SimpleAllele'])
            if 'RCVList' in ir:
                # Replace the RCVList/RCVAccession wrapper with a flat 'rcv' list.
                unifylistattribute(ir,
                                   "RCVList",
                                   "RCVAccession",
                                   renamelistto='rcv')
                for i, rcvacc in enumerate(ir['rcv']):
                    if isinstance(rcvacc, string_types):
                        # Bare string accession: wrap so every item is a dict.
                        ir['rcv'][i] = {'#text': rcvacc}
                    else:
                        unifylistattribute(rcvacc,
                                           "InterpretedConditionList",
                                           "InterpretedCondition",
                                           renamelistto='interpretedCondition')
                        for j, ic in enumerate(rcvacc['interpretedCondition']):
                            if isinstance(ic, string_types):
                                # Same wrapping for bare condition strings.
                                rcvacc['interpretedCondition'][j] = {
                                    '#text': ic
                                }
            unifylistattribute(ir,
                               "ClinicalAssertionList",
                               "ClinicalAssertion",
                               renamelistto='clinicalAssertion')
            for ca in ir['clinicalAssertion']:
                # num() presumably coerces the field to a numeric type
                # (see its other call sites with an explicit int argument).
                num(ca, 'ID')
                if "Interpretation" in ca:
                    self.update_date(ca)
                    self.update_comment(ca['Interpretation'])

                unifylistattribute(ca,
                                   "ObservedInList",
                                   "ObservedIn",
                                   renamelistto='observedIn')
                if 'SimpleAllele' in ca:
                    sa = ca['SimpleAllele']
                    self.update_simpleallele(sa)

                if 'Genotype' in ca:
                    self.update_genotype_haplotype(ca['Genotype'])
                if 'Haplotype' in ca:
                    self.update_genotype_haplotype(ca['Haplotype'])
                if 'TraitSet' in ca:
                    self.update_comment(ca['TraitSet'])

                self.update_comment(ca)
                for o in ca['observedIn']:
                    self.update_comment(o)
                    # Species may arrive as a plain string or as an
                    # {'#text': ...} dict; keep only the text value.
                    if not isinstance(o['Sample']['Species'], string_types):
                        o['Sample']['Species'] = o['Sample']['Species'][
                            '#text']
                    if 'TraitSet' in o:
                        self.update_comment(o['TraitSet'])
                    if 'ObservedData' in o:
                        self.update_comment(o['ObservedData'])
                    if 'Method' in o and 'ObsMethodAttribute' in o['Method']:
                        self.update_comment(o['Method']['ObsMethodAttribute'])
Exemple #3
0
 def update_entry(self, entry):
     """Clean one entry in place before indexing.

     Drops the redundant 'id' field, coerces the count fields to int,
     serializes the abstract to a JSON string, and flattens the
     taxonomy-distribution wrapper list.
     """
     del entry['id']
     num(entry, "protein_count", int)
     if 'abstract' in entry:
         import json
         entry['abstract'] = json.dumps(entry['abstract'])
     if 'taxonomy_distribution' in entry:
         unifylistattribute(entry, 'taxonomy_distribution', 'taxon_data')
         for taxon in entry['taxonomy_distribution']:
             num(taxon, "proteins_count", int)
 def read_and_index_articles_file(self, infile_):
     """Parse one MEDLINE XML archive file (optionally gzipped) and
     index its articles into Elasticsearch or MongoDB.

     :param infile_: path to a '.xml' or '.xml.gz' MEDLINE archive file;
                     anything else is reported and skipped
     """
     infile = str(infile_)
     print("Reading %s " % infile)
     if infile.endswith(".xml.gz"):
         opener = gzip.open
     elif infile.endswith(".xml"):
         opener = open
     else:
         print(
             "Ignoring '%s': filename does not end with '.xml' or '.xml.gz'"
             % infile)
         return
     # Context manager closes the handle even if the parser raises;
     # the original leaked it on every call. list() materializes the
     # result before the file is closed, in case the parser is lazy.
     with opener(infile, 'rb') as f:
         articles = list(pp.parse_medline_xml(f))
     listattrs = [
         'authors', 'mesh_terms', 'publication_types', 'chemical_list',
         'keywords', 'references', 'affiliations'
     ]
     ids = set()
     deletedrecords, deletedpmids = list(), list()
     for i, ar in enumerate(articles):
         if ar['delete']:
             # DeleteCitation entries at the end of the xml archive files
             # are parsed to an object with field values set to float NaN
             deletedrecords.append(i)
             deletedpmids.append(ar['pmid'])
             continue
         try:
             num(ar, 'pmc')
         except ValueError:
             # Fallback id when the pmc field is not numeric.
             ar['pmc'] = 2000
         ar['_id'] = num(ar, 'pmid')
         ids.add(ar['_id'])
         try:
             # Only the year is kept; normalize to January 1st.
             ar['pubdate'] = datetime.datetime(int(ar['pubdate']), 1, 1)
         except ValueError:
             print(ar['pubdate'])
             ar['pubdate'] = datetime.datetime(2000, 1, 1)
         for listattr in listattrs:
             if len(ar[listattr]) == 0:
                 del ar[listattr]
             else:
                 # Authors/references are ';'-separated; the rest use '; '.
                 spr = ';' if listattr in ['authors', 'references'
                                           ] else '; '
                 ar[listattr] = ar[listattr].split(spr)
     # Delete back-to-front so earlier indices stay valid.
     for i in reversed(deletedrecords):
         del articles[i]
     self.qry.deletepubmedids(deletedpmids)
     if self.db == "Elasticsearch":
         if not self.qry.checkpubmedidsindexed(list(ids)):
             self.es_index(articles)
         else:
             print("Records in %s looks have been indexed, skipping" %
                   infile)
     else:  # assume MongoDB
         self.mdb_index(articles)
def index_article(dbc, ar):
    """Index a single article document, keyed by its PMC id.

    :param dbc: database connection wrapper; ``dbc.db`` selects
                "Elasticsearch", anything else is assumed MongoDB
    :param ar: article dict; its 'pmid' and 'pmc' fields are coerced
               in place by num()
    """
    num(ar, 'pmid')
    pmcid = num(ar, 'pmc')
    try:
        if dbc.db == "Elasticsearch":
            dbc.es.index(index=dbc.index, id=pmcid, body=ar)
        else:  # MongoDB
            spec = {"_id": pmcid}
            # NOTE(review): Collection.update() is deprecated in pymongo 3+;
            # consider replace_one(spec, ar, upsert=True).
            dbc.mdbi[dbc.mdbcollection].update(spec, ar, upsert=True)
    except Exception as e:
        # Best-effort indexing: report and continue rather than abort a batch.
        print("error: %s" % e)
    # (Removed a trailing `del ar` — it only unbound the local name and
    # had no effect on the caller's object.)
Exemple #6
0
 def update_simpleallele(self, sa):
     """Normalize a ClinVar SimpleAllele element in place.

     Applies the synonym/comment fix-up helpers, flattens the
     molecular-consequence wrapper list, and coerces the allele and
     variation IDs via num() (presumably to numeric types — see its
     other call sites with an explicit int argument).
     """
     self.update_synonyms(sa)
     self.update_comment(sa)
     # Replace the MolecularConsequenceList wrapper with a flat
     # 'molecularConsequence' list.
     unifylistattribute(sa,
                        "MolecularConsequenceList",
                        "MolecularConsequence",
                        renamelistto='molecularConsequence')
     if 'molecularConsequence' in sa:
         self.update_comment(sa['molecularConsequence'])
     if 'FunctionalConsequence' in sa:
         self.update_comment(sa['FunctionalConsequence'])
     num(sa, 'AlleleID')
     num(sa, 'VariationID')