def index():
    """Pass every row of the module-level ``_data`` to ``Journal.add``.

    For each row the ISSN, e-ISSN, title, alternative title and publisher
    columns are read; empty (falsy) values are dropped and the remainder is
    handed over as lists. The alternative title is only included when it
    differs from the main title.
    """
    for entry in _data:
        print_issn = entry[_ISSN]
        online_issn = entry[_EISSN]
        main_title = entry[_TITLE]
        alt_title = entry[_TITLE_ALT]
        publisher_name = entry[_PUBLISHER]

        # The combined ISSN list keeps the print ISSN first, then the e-ISSN.
        all_issns = [value for value in (print_issn, online_issn) if value]

        titles = [main_title] if main_title else []
        if alt_title and alt_title not in titles:
            titles.append(alt_title)

        Journal.add(
            issn=all_issns,
            electronic_issn=[online_issn] if online_issn else [],
            journal_title=titles,
            publisher_name=[publisher_name] if publisher_name else [],
        )
def extract_data(filepath, pmidfile):
    """Load journal metadata from a zipped Medline XML file into ``Journal``.

    The first member of the zip archive at *filepath* is parsed as XML. For
    every ``MedlineCitation`` element directly under the root, the ISSNs,
    journal titles and abbreviations found under ``Article/Journal`` and
    ``MedlineJournalInfo`` are collected and, if anything was found, passed
    to ``Journal.add``. Every distinct PMID text encountered is handed to
    ``save_pmids`` together with *pmidfile* once the file has been processed.

    Args:
        filepath: Location of the zip archive whose first member is the XML.
        pmidfile: Passed through unchanged to ``save_pmids``.
    """
    # Parse while both handles are open, then release them: the tree is
    # fully built in memory once parse() returns.
    with zipfile.ZipFile(filepath) as archive:
        with archive.open(archive.infolist()[0]) as xml_file:
            root = cetree.parse(xml_file).getroot()

    pmids = []  # PMIDs in first-seen order, as handed to save_pmids
    seen_pmids = set()  # constant-time duplicate check for the list above

    for mlc in root.findall("MedlineCitation"):
        record = {
            "electronic_issn": [],
            "print_issn": [],
            "issn": [],
            "journal_title": [],
            "journal_abbr": [],
        }
        has_data = False

        # record any pubmed ids that we encounter
        for e in mlc.findall("PMID"):
            if e.text not in seen_pmids:
                seen_pmids.add(e.text)
                pmids.append(e.text)

        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truth test would wrongly skip it.
        article = mlc.find("Article")
        journal = article.find("Journal") if article is not None else None

        if journal is not None:
            # record any ISSNs in their appropriate type field
            typed_fields = {
                "Electronic": record["electronic_issn"],
                "Print": record["print_issn"],
            }
            for e in journal.findall("ISSN"):
                has_data = True
                typed = typed_fields.get(e.get("IssnType"))
                if typed is None:
                    # unknown or missing type: keep it in the generic list
                    if e.text not in record["issn"]:
                        record["issn"].append(e.text)
                else:
                    if e.text not in typed:
                        typed.append(e.text)
                    # a typed ISSN supersedes an earlier untyped entry
                    if e.text in record["issn"]:
                        record["issn"].remove(e.text)

            # add journal titles
            for e in journal.findall("Title"):
                has_data = True
                if e.text not in record["journal_title"]:
                    record["journal_title"].append(e.text)

            # add journal abbreviations
            for e in journal.findall("ISOAbbreviation"):
                has_data = True
                if e.text not in record["journal_abbr"]:
                    record["journal_abbr"].append(e.text)

        info = mlc.find("MedlineJournalInfo")
        if info is not None:
            # record medline's version of the name (which may well be the
            # iso abbreviation)
            for e in info.findall("MedlineTA"):
                has_data = True
                if (
                    e.text not in record["journal_title"]
                    and e.text not in record["journal_abbr"]
                ):
                    record["journal_abbr"].append(e.text)

            # record the medline knowledge of this other issn - we don't
            # necessarily know what type it is
            for e in info.findall("ISSNLinking"):
                has_data = True
                if (
                    e.text not in record["electronic_issn"]
                    and e.text not in record["print_issn"]
                    and e.text not in record["issn"]
                ):
                    record["issn"].append(e.text)

        # now write it to the ACAT via catflap
        if has_data:
            # The title list is extended in place with any abbreviation that
            # is not already present, so abbreviations also count as titles.
            jnames = record["journal_title"]
            for abbr in record["journal_abbr"]:
                if abbr not in jnames:
                    jnames.append(abbr)

            # NOTE(review): no provenance (archive name / PMID) is passed to
            # Journal.add - confirm whether a source argument is expected.
            Journal.add(
                issn=(
                    record["issn"]
                    + record["electronic_issn"]
                    + record["print_issn"]
                ),
                journal_title=jnames,
                electronic_issn=record["electronic_issn"],
                print_issn=record["print_issn"],
                journal_abbreviation=record["journal_abbr"],
            )

    # save the pmids found in this file
    save_pmids(pmidfile, pmids)