def get_references(self, xml):
    """Build the list of bibliographic references found under ``xml``.

    One tuple is produced per ``ce:bib-reference`` element, laid out as
    ``(label, authors, doi, issue, page, title, volume, year, textref,
    ext_link)``.

    ``authors`` holds one string per ``sb:author`` element: "surname, given"
    when a given name is present, otherwise the surname alone.  ``year`` is
    the first four characters of the ``sb:date`` value looked up under the
    first ``sb:issue`` element, or None when the reference has no
    ``sb:issue`` element.  ``ext_link`` is whatever ``format_arxiv_id``
    returns for the reference's 'arxiv' link.
    """
    def format_author(node):
        # Same lookup order as before: given name first, then surname.
        given = get_value_in_tag(node, "ce:given-name")
        family = get_value_in_tag(node, "ce:surname")
        if not given:
            return family
        return "%s, %s" % (family, given)

    collected = []
    for entry in xml.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(entry, "ce:label")
        authors = [format_author(node)
                   for node in entry.getElementsByTagName("sb:author")]
        doi = get_value_in_tag(entry, "ce:doi")
        issue = get_value_in_tag(entry, "sb:issue")
        page = get_value_in_tag(entry, "sb:first-page")
        title = get_value_in_tag(entry, "sb:maintitle")
        volume = get_value_in_tag(entry, "sb:volume-nr")

        issue_nodes = entry.getElementsByTagName('sb:issue')
        if issue_nodes:
            year = get_value_in_tag(issue_nodes[0], "sb:date")[:4]
        else:
            year = None

        textref = get_value_in_tag(entry, "ce:textref")
        ext_link = format_arxiv_id(self.get_ref_link(entry, 'arxiv'))
        collected.append((label, authors, doi, issue, page, title,
                          volume, year, textref, ext_link))
    return collected
def get_authors(self, xml):
    """Return one dictionary per ``Author`` element found under ``xml``.

    Each dictionary may carry "surname", "given_name" (newlines replaced by
    spaces) and "email" (text of the first ``Email`` child whose ``type``
    attribute is 'email' or empty), and always carries "affiliations_ids"
    (the whitespace-separated tokens of the ``AffiliationIDS`` attribute).

    "affiliation" is then filled in from the ``Affiliation`` elements:
    authors whose ids match get the matching texts; if no author matches
    anything, every author receives all affiliation texts (with a warning
    on stderr when there is more than one).
    """
    authors = []
    for node in xml.getElementsByTagName("Author"):
        entry = {}
        family = get_value_in_tag(node, "FamilyName")
        if family:
            entry["surname"] = family
        given = get_value_in_tag(node, "GivenName")
        if given:
            entry["given_name"] = given.replace('\n', ' ')
        # Initials, ORCID and cross-references are not extracted here; the
        # previous revision kept that code commented out ("It's not there").
        for address in node.getElementsByTagName("Email"):
            if address.getAttribute("type").encode('utf-8') in ('email', ''):
                entry["email"] = xml_to_text(address)
                break
        entry["affiliations_ids"] = [
            token.encode('utf-8')
            for token in node.getAttribute("AffiliationIDS").split()]
        authors.append(entry)

    affiliations = {}
    for aff_node in xml.getElementsByTagName("Affiliation"):
        key = aff_node.getAttribute("ID").encode('utf-8')
        label = xml_to_text(aff_node, delimiter=', ')
        affiliations[key] = label

    explicit_match_found = False
    for entry in authors:
        matched = [aid for aid in entry.get("affiliations_ids")
                   if aid in affiliations]
        if matched:
            explicit_match_found = True
            entry["affiliation"] = [affiliations[aid] for aid in matched]

    if not explicit_match_found and affiliations:
        # Nobody referenced an affiliation explicitly: attach all of them to
        # every author, warning when that is ambiguous.
        if len(affiliations) > 1:
            print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % affiliations
        for entry in authors:
            entry["affiliation"] = list(affiliations.values())
    return authors
def get_references(self, xml):
    """Build the list of references from the ``Citation`` elements of ``xml``.

    Every citation yields an 8-tuple
    ``(label, authors, doi, issue, page, title, volume, year)``.

    A citation without any ``BibArticle`` element is reduced to its
    ``BibUnstructured`` value in the first slot followed by seven empty
    strings.  Otherwise ``label`` is the ``ArticleTitle`` value, ``authors``
    a list of "FamilyName, Initials" strings (family name only when there
    are no initials), ``doi`` the text of the last ``Occurrence`` element
    whose ``Type`` attribute is "DOI" ("" when there is none), and ``issue``
    is always "".
    """
    collected = []
    for citation in xml.getElementsByTagName("Citation"):
        if not citation.getElementsByTagName("BibArticle"):
            unstructured = get_value_in_tag(citation, "BibUnstructured")
            collected.append((unstructured, '', '', '', '', '', '', ''))
            continue

        label = get_value_in_tag(citation, "ArticleTitle")

        authors = []
        for person in citation.getElementsByTagName("BibAuthorName"):
            initials = get_value_in_tag(person, "Initials")
            family = get_value_in_tag(person, "FamilyName")
            if not initials:
                authors.append(family)
            else:
                authors.append("%s, %s" % (family, initials))

        doi = ""
        for occurrence in citation.getElementsByTagName("Occurrence"):
            if occurrence.getAttribute("Type") == "DOI":
                doi = xml_to_text(occurrence)

        # The issue is deliberately left blank; the earlier "sb:issue"
        # lookup was disabled with the remark "What is it exactly?".
        issue = ""
        page = get_value_in_tag(citation, "FirstPage")
        title = get_value_in_tag(citation, "JournalTitle")
        volume = get_value_in_tag(citation, "VolumeID")
        year = get_value_in_tag(citation, "Year")
        collected.append((label, authors, doi, issue, page, title,
                          volume, year))
    return collected
def get_publication_date(self, xml):
    """Return the online publication date of the article as "YYYY-MM-DD".

    The Year, Month and Day values are read from the first ``OnlineDate``
    element of the first ``ArticleHistory`` inside the first
    ``ArticleInfo`` element of ``xml``.

    An empty string is returned when there is no ``OnlineDate`` element, or
    when the three parts cannot be converted to integers (this case is also
    reported on stderr).

    Raises IndexError when ``ArticleInfo`` or ``ArticleHistory`` is missing.
    """
    article_info = xml.getElementsByTagName("ArticleInfo")[0]
    article_history = article_info.getElementsByTagName("ArticleHistory")[0]
    online_dates = article_history.getElementsByTagName("OnlineDate")
    if not online_dates:
        # No online date recorded: return "" like the other failure path.
        return ""

    online_date = online_dates[0]
    year = get_value_in_tag(online_date, "Year")
    month = get_value_in_tag(online_date, "Month")
    day = get_value_in_tag(online_date, "Day")
    try:
        return "%04d-%02d-%02d" % (int(year), int(month), int(day))
    except (TypeError, ValueError) as err:
        # int() only fails with these two for missing or non-numeric parts.
        print >> sys.stderr, "Can't reliably extract the publication date: %s" % err
        return ""
def get_authors(self, xml):
    """Return one dictionary per ``ce:author`` element found under ``xml``.

    Possible keys: "surname", "given_name", "initials" (each set only when
    the corresponding tag value is non-empty), "orcid" (non-empty ``orcid``
    attribute), "email" (text of the first ``ce:e-address`` child whose
    ``type`` attribute is 'email' or empty), "cross_ref" (``refid``
    attributes of the ``ce:cross-ref`` children) and "affiliation".

    Affiliation texts come from the ``ce:textfn`` value of each
    ``ce:affiliation`` element with a leading number stripped.  Authors
    whose cross references match an affiliation id get those texts; when
    no author matches anything, every author receives all affiliation
    texts (with a warning on stderr if there is more than one).
    """
    simple_fields = (
        ("surname", "ce:surname"),
        ("given_name", "ce:given-name"),
        ("initials", "ce:initials"),
    )

    authors = []
    for node in xml.getElementsByTagName("ce:author"):
        entry = {}
        for key, tag in simple_fields:
            value = get_value_in_tag(node, tag)
            if value:
                entry[key] = value

        orcid = node.getAttribute('orcid').encode('utf-8')
        if orcid:
            entry["orcid"] = orcid

        for address in node.getElementsByTagName("ce:e-address"):
            if address.getAttribute("type").encode('utf-8') in ('email', ''):
                entry["email"] = xml_to_text(address)
                break

        cross_refs = node.getElementsByTagName("ce:cross-ref")
        if cross_refs:
            entry["cross_ref"] = [
                cross_ref.getAttribute("refid").encode('utf-8')
                for cross_ref in cross_refs]
        authors.append(entry)

    affiliations = {}
    for aff_node in xml.getElementsByTagName("ce:affiliation"):
        key = aff_node.getAttribute("id").encode('utf-8')
        # Drop a leading footnote-style number (and one optional space).
        label = re.sub(r'^(\d+\ ?)', "",
                       get_value_in_tag(aff_node, "ce:textfn"))
        affiliations[key] = label

    explicit_match_found = False
    for entry in authors:
        matched = [ref for ref in entry.get("cross_ref", [])
                   if ref in affiliations]
        if matched:
            explicit_match_found = True
            entry["affiliation"] = [affiliations[ref] for ref in matched]

    if not explicit_match_found and affiliations:
        if len(affiliations) > 1:
            print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % affiliations
        for entry in authors:
            entry["affiliation"] = list(affiliations.values())
    return authors
def get_publication_information(self, xml):
    """Look up the "ArticleDOI" value of ``xml`` and fail loudly without it.

    NOTE(review): only the leading DOI check of this method is visible in
    this snippet; ``doi`` is assigned here but not yet used or returned.

    Raises ValueError when the DOI value is empty; any exception raised
    during the lookup is reported on stderr and re-raised unchanged.
    """
    try:
        doi = get_value_in_tag(xml, "ArticleDOI")
        if not doi:
            # Route an empty DOI through the same reporting path below.
            raise ValueError("DOI not found")
    except Exception, err:
        print >> sys.stderr, "Can't find doi: %s" % err
        # Bare raise: propagate the original exception to the caller.
        raise
def get_arxiv_id(self, xml):
    """Return "arXiv:<id>" taken from the first ``ArticleNote`` of ``xml``.

    The candidate id is the ``RefSource`` value of the first
    ``ArticleNote`` element.  An empty string is returned when there is no
    ``ArticleNote`` element or when the candidate does not match
    ``RE_ARXIV_ID``.
    """
    notes = xml.getElementsByTagName('ArticleNote')
    if not notes:
        return ""
    candidate = get_value_in_tag(notes[0], "RefSource")
    if not RE_ARXIV_ID.match(candidate):
        return ""
    return "arXiv:%s" % candidate
def get_publication_information(self, xml):
    """Start collecting publication metadata from ``xml``.

    NOTE(review): only the opening statements of this method are visible in
    this snippet; nothing is returned by the part shown here.
    """
    jid = get_value_in_tag(xml, "journal-id")
    # The journal name is left empty; the jid-based lookup below is disabled.
    journal = ""
    #journal = CFG_ELSEVIER_JID_MAP.get(jid, jid)
    try:
        # First 'article-meta' element; IndexError means there is none.
        art = xml.getElementsByTagName('article-meta')[0]
    except IndexError, err:
        register_exception()
        print >> sys.stderr, "ERROR: XML corupted: %s" % err
        # NOTE(review): the error is swallowed here, so `art` stays unbound
        # on this path -- confirm that later code copes with that.
        pass
def get_date(self, xml):
    """Return the publication year found in the ``pub-date`` elements.

    The ``year`` value of the last ``pub-date`` element whose ``pub-type``
    attribute is 'epub' is preferred.  When no such element yields a
    value, the ``year`` of the first ``pub-date`` element is used instead.
    Returns None when ``xml`` contains no ``pub-date`` element at all.
    """
    dates = xml.getElementsByTagName('pub-date')
    year = None
    for date in dates:
        if date.getAttribute('pub-type').encode('utf-8') == 'epub':
            year = get_value_in_tag(date, 'year')

    if not year and dates:
        # Fall back to the first date's year.  The previous code returned
        # the DOM element itself here instead of its year value.
        year = get_value_in_tag(dates[0], 'year')
    return year
def convert_record(record, response_date, request):
    """Convert one harvested ``record`` element into record XML.

    Returns a ``(xml, new)`` pair.  ``new`` is False when
    ``find_records_from_extoaiid`` already knows the record's identifier
    for 'Hindawi', True otherwise.

    * A record whose header status is 'deleted' and which is not known yet
      gives ``(None, True)``.
    * A known record with status 'deleted' gives a record holding only the
      035 provenance field and a 980 field marked DELETED, with ``new``
      False.
    * Any other record gets the 035 field plus a copy of every
      ``datafield`` element (tag, indicators, subfields).
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')

    rec = {}
    provenance = [
        ('a', oai_identifier),
        ('u', request),
        ('9', 'Hindawi'),
        ('d', datestamp),
        ('h', response_date),
        ('m', 'marc21'),
        ('t', 'false'),
    ]
    record_add_field(rec, tag="035", subfields=provenance)

    is_new = not find_records_from_extoaiid(oai_identifier, 'Hindawi')

    if status == 'deleted':
        if is_new:
            # Deleting a record we never had: nothing to do.
            return None, True
        record_add_field(rec, tag="980", subfields=[('a', 'SCOAP3'), ('b', 'Hindawi'), ('c', 'DELETED')])
        return record_xml_output(rec), False

    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        # Missing indicators default to a single blank.
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = [
            (subfield.getAttribute("code").encode('utf-8'),
             xml_to_text(subfield))
            for subfield in datafield.getElementsByTagName("subfield")]
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2,
                         subfields=subfields)
    return record_xml_output(rec), is_new
def _build_doi_mapping(self):
    """Populate ``self._dois`` from the issue files in ``self._found_issues``.

    For every path in ``self._found_issues`` the file
    ``resolved_issue.xml`` is parsed, and each ``ce:include-item`` it
    contains is stored under its DOI as the tuple
    ``(journal, issn, volume, issue, first_page, last_page, year,
    start_date)``.

    ``journal`` is the ``jid`` value mapped through
    ``CFG_ELSEVIER_JID_MAP`` (the raw jid when unmapped).  ``year`` is the
    first four characters of the ``start-date`` value.  ``start_date`` is
    reformatted to "YYYY-MM-DD" (8-character input) or "YYYY-MM"
    (6-character input) and left untouched otherwise.
    """
    self._dois = {}
    for path in self._found_issues:
        # Close the file as soon as parsing is done; parse() is expected to
        # have read it completely by the time it returns (true for
        # xml.dom.minidom.parse).
        with open(join(path, "resolved_issue.xml")) as issue_file:
            xml = parse(issue_file)

        jid = get_value_in_tag(xml, "jid")
        journal = CFG_ELSEVIER_JID_MAP.get(jid, jid)
        issn = get_value_in_tag(xml, "ce:issn")
        volume = get_value_in_tag(xml, "vol-first")
        issue = get_value_in_tag(xml, "iss-first")

        start_date = get_value_in_tag(xml, "start-date")
        year = start_date[:4]
        # Compare lengths by value: `is` on ints only works by accident of
        # CPython's small-integer caching.
        if len(start_date) == 8:
            start_date = time.strftime(
                '%Y-%m-%d', time.strptime(start_date, '%Y%m%d'))
        elif len(start_date) == 6:
            start_date = time.strftime(
                '%Y-%m', time.strptime(start_date, '%Y%m'))

        for included_item in xml.getElementsByTagName("ce:include-item"):
            doi = get_value_in_tag(included_item, "ce:doi")
            first_page = get_value_in_tag(included_item, "ce:first-page")
            last_page = get_value_in_tag(included_item, "ce:last-page")
            self._dois[doi] = (journal, issn, volume, issue, first_page,
                               last_page, year, start_date)
def bibfilter(filename):
    """Split the harvested records of ``filename`` into two record files.

    Each ``record`` element is passed to ``convert_record``.  Results
    flagged as new are written to ``<filename>.insert.xml``, the others to
    ``<filename>.correct.xml``; records converted to None are dropped.
    Progress is reported on stderr.
    """
    print >> sys.stderr, "Parsing %s" % filename
    xml = get_xml(open(filename))
    request = xml.getElementsByTagName("request")[0].toxml()
    response_date = get_value_in_tag(xml, "responseDate")

    inserts = []
    corrections = []
    records = xml.getElementsByTagName("record")
    print >> sys.stderr, "Found %s records" % len(records)
    for record in records:
        marcxml, is_new = convert_record(record, response_date, request)
        if marcxml is not None:
            target = inserts if is_new else corrections
            target.append(marcxml)

    create_record_file(filename + '.insert.xml', inserts)
    create_record_file(filename + '.correct.xml', corrections)
def get_abstract(self, xml):
    """Return the ``ce:simple-para`` value of the first ``ce:abstract-sec``.

    Any failure (for instance a missing ``ce:abstract-sec`` element) is
    reported on stderr and None is returned.
    """
    try:
        section = xml.getElementsByTagName("ce:abstract-sec")[0]
        return get_value_in_tag(section, "ce:simple-para")
    except Exception:
        print >> sys.stderr, "Can't find abstract"
    return None
def get_title(self, xml):
    """Return the ``article-title`` value of ``xml``.

    Any failure during the lookup is reported on stderr and None is
    returned.
    """
    try:
        title = get_value_in_tag(xml, "article-title")
    except Exception:
        print >> sys.stderr, "Can't find title"
        return None
    return title
def get_title(self, xml):
    """Return the ``ArticleTitle`` value of ``xml``.

    Any failure during the lookup is reported on stderr and None is
    returned.
    """
    try:
        title = get_value_in_tag(xml, "ArticleTitle")
    except Exception:
        print >> sys.stderr, "Can't find title"
    else:
        return title


def get_publication_information(self, xml):
    """Return the publication metadata of the article described by ``xml``.

    The result is the tuple ``(journal, issn, volume, issue, first_page,
    pages, year, doi)``:

    * ``journal``: the ``JournalAbbreviatedTitle`` value, with
      'J. High Energ. Phys.' shortened to 'JHEP';
    * ``volume``: the ``VolumeIDStart`` value without its first two
      characters, followed by the two-digit ``IssueIDStart``;
    * ``issue``: always "";
    * ``first_page``: the ``ArticleSequenceNumber`` zero-padded to three
      digits;
    * ``pages``: the ``ArticleLastPage`` value;
    * ``year``: the ``VolumeIDStart`` value.

    Raises ValueError when the ``ArticleDOI`` value is empty (any error
    during the DOI lookup is reported on stderr and re-raised), and lets
    the ValueError from ``int()`` propagate when the issue or sequence
    number is not numeric.
    """
    try:
        doi = get_value_in_tag(xml, "ArticleDOI")
        if not doi:
            raise ValueError("DOI not found")
    except Exception as err:
        print >> sys.stderr, "Can't find doi: %s" % err
        raise

    abbreviated_title = get_value_in_tag(xml, "JournalAbbreviatedTitle")
    if abbreviated_title == 'J. High Energ. Phys.':
        journal = 'JHEP'
    else:
        journal = abbreviated_title
    # NOTE(review): the ISSN is read from the same tag as the journal
    # title -- confirm whether a dedicated ISSN tag was intended.
    issn = get_value_in_tag(xml, "JournalAbbreviatedTitle")

    volume_start = get_value_in_tag(xml, "VolumeIDStart")
    issue_number = int(get_value_in_tag(xml, "IssueIDStart"))
    volume = volume_start[2:] + "%02d" % issue_number
    issue = ""

    sequence_number = int(get_value_in_tag(xml, "ArticleSequenceNumber"))
    first_page = "%03d" % sequence_number
    pages = get_value_in_tag(xml, "ArticleLastPage")
    year = get_value_in_tag(xml, "VolumeIDStart")
    return journal, issn, volume, issue, first_page, pages, year, doi


def get_authors(self, xml):
    # NOTE(review): the rest of this method lies outside the visible
    # snippet; the statements below are reproduced unchanged.
    authors = []
    for author in xml.getElementsByTagName("Author"):
        tmp = {}
        surname = get_value_in_tag(author, "FamilyName")
def get_abstract(self, xml):
    """Return the ``abstract`` value of ``xml`` without its heading.

    The first occurrence of the word "Abstract" is removed from the text.
    Any failure is reported on stderr and None is returned.
    """
    try:
        raw_text = get_value_in_tag(xml, "abstract")
        return raw_text.replace("Abstract", "", 1)
    except Exception:
        print >> sys.stderr, "Can't find abstract"
    return None
def get_copyright(self, xml):
    """Return the ``copyright-holder`` value of ``xml``.

    Any failure during the lookup is reported on stderr and None is
    returned.
    """
    try:
        holder = get_value_in_tag(xml, "copyright-holder")
    except Exception:
        print >> sys.stderr, "Can't find copyright"
        return None
    return holder
jid = get_value_in_tag(xml, "journal-id") journal = "" #journal = CFG_ELSEVIER_JID_MAP.get(jid, jid) try: art = xml.getElementsByTagName('article-meta')[0] except IndexError, err: register_exception() print >> sys.stderr, "ERROR: XML corupted: %s" % err pass except Exception, err: register_exception() print >> sys.stderr, "ERROR: Exception captured: %s" % err pass issn = self.get_issn(art) volume = get_value_in_tag(art, "volume") issue = get_value_in_tag(art, "issue") year = self.get_date(art) first_page = get_value_in_tag(art, "fpage") last_page = get_value_in_tag(art, "lpage") doi = self.get_doi(art) return (journal, issn, volume, issue, first_page, last_page, year, doi) def get_doi(self, xml): ids = xml.getElementsByTagName('article-id') ret = "" for i in ids: if i.getAttribute('pub-id-type').encode('utf-8') == 'doi': ret = xml_to_text(i)
def get_article_journal(self, xml):
    """Return the journal that ``CFG_ELSEVIER_JID_MAP`` holds for the ``jid`` of ``xml``.

    Raises KeyError when the jid value has no entry in the map.
    """
    jid = get_value_in_tag(xml, "jid")
    return CFG_ELSEVIER_JID_MAP[jid]
def get_copyright(self, xml):
    """Return the ``CopyrightHolderName`` of the first ``ArticleCopyright``.

    Any failure (for instance a missing ``ArticleCopyright`` element) is
    reported on stderr together with the error, and None is returned.
    """
    try:
        copyright_node = xml.getElementsByTagName("ArticleCopyright")[0]
        return get_value_in_tag(copyright_node, "CopyrightHolderName")
    except Exception as err:
        print >> sys.stderr, "Can't find copyright. %s" % (err, )
    return None
def _get_doi(self, xml):
    """Return the ``ce:doi`` value of ``xml``.

    Any failure during the lookup is reported on stderr and None is
    returned.
    """
    try:
        doi = get_value_in_tag(xml, "ce:doi")
    except Exception:
        print >> sys.stderr, "Can't find doi"
    else:
        return doi
def get_authors(self, xml):
    """Return one dictionary per ``contrib`` element found under ``xml``.

    Every dictionary carries "affiliations_ids" and "contact_ids" (the
    ``rid`` tokens of the author's ``xref`` children with ``ref-type``
    'aff' and 'corresp' respectively) and may carry "surname",
    "given_name" (newlines replaced by spaces), "affiliation" and "email".

    * Affiliation texts come from the ``aff`` elements with a leading
      number stripped.  Authors whose ids match get the matching texts;
      when no author matches anything, every author receives all
      affiliation texts (with a warning on stderr if there is more than
      one).
    * "email" is the address of the first of the author's contact ids for
      which a ``corresp`` element with an ``email`` child exists.
    """
    authors = []
    for contrib in xml.getElementsByTagName("contrib"):
        entry = {}
        surname = get_value_in_tag(contrib, "surname")
        if surname:
            entry["surname"] = surname
        given_name = get_value_in_tag(contrib, "given-names")
        if given_name:
            entry["given_name"] = given_name.replace('\n', ' ')
        # ORCID and ce:cross-ref data are not extracted here; the previous
        # revision kept that code commented out ("It's not there").
        entry["affiliations_ids"] = []
        entry["contact_ids"] = []
        for xref in contrib.getElementsByTagName("xref"):
            ref_type = xref.getAttribute('ref-type').encode('utf-8')
            if ref_type == 'aff':
                target = entry["affiliations_ids"]
            elif ref_type == 'corresp':
                target = entry["contact_ids"]
            else:
                continue
            target.extend([rid.encode('utf-8')
                           for rid in xref.getAttribute('rid').split()])
        authors.append(entry)

    affiliations = {}
    for aff_node in xml.getElementsByTagName("aff"):
        aff_id = aff_node.getAttribute("id").encode('utf-8')
        # Drop a leading label number (and one optional space).
        text = re.sub(r'^(\d+\ ?)', "", xml_to_text(aff_node))
        affiliations[aff_id] = text

    emails = {}
    for contact in xml.getElementsByTagName("corresp"):
        email_nodes = contact.getElementsByTagName('email')
        if not email_nodes:
            # A corresp note without an <email> child has no address to
            # offer; skip it instead of failing with IndexError.
            continue
        contact_id = contact.getAttribute("id").encode('utf-8')
        emails[contact_id] = xml_to_text(email_nodes[0])

    explicit_match_found = False
    for entry in authors:
        matched = [ref for ref in entry.get("affiliations_ids")
                   if ref in affiliations]
        if matched:
            explicit_match_found = True
            entry["affiliation"] = [affiliations[ref] for ref in matched]
        contacts = [cont for cont in entry.get('contact_ids')
                    if cont in emails]
        if contacts:
            entry["email"] = emails[contacts[0]]

    if not explicit_match_found and affiliations:
        if len(affiliations) > 1:
            print >> sys.stderr, "Implicit affiliations are used, but there's more than one affiliation: %s" % affiliations
        for entry in authors:
            entry["affiliation"] = list(affiliations.values())
    return authors
def get_references(self, xml):
    """Collect the ``ref`` elements of ``xml`` into ``self.references``.

    Each reference becomes the tuple ``(label, authors, doi, issue, page,
    page_last, title, volume, year, ext_link, plain_text)``.  Nothing is
    returned; the list is stored on ``self.references``.

    * ``label`` has leading and trailing dots stripped.
    * ``authors`` holds "surname, given-names" per ``name`` element
      (surname only without given names); a blank result is replaced by
      the element's ``string-name`` value.
    * ``doi`` is the text of the last ``pub-id`` element whose
      ``pub-id-type`` is "doi", "" when there is none.
    * ``plain_text`` is the ``mixed-citation`` value when the
      ``publication-type`` of the first ``citation`` element is not
      'journal', otherwise None.

    Raises IndexError when a reference has no ``citation`` element.
    """
    collected = []
    for ref_node in xml.getElementsByTagName("ref"):
        plain_text = None
        citation = ref_node.getElementsByTagName('citation')[0]
        ref_type = citation.getAttribute('publication-type').encode('utf-8')
        label = get_value_in_tag(ref_node, "label").strip('.')

        authors = []
        for person in ref_node.getElementsByTagName("name"):
            given = get_value_in_tag(person, "given-names")
            family = get_value_in_tag(person, "surname")
            if not given:
                full_name = family
            else:
                full_name = "%s, %s" % (family, given)
            if not full_name.strip().split():
                # Nothing usable in the structured parts: fall back.
                full_name = get_value_in_tag(person, "string-name")
            authors.append(full_name)

        doi = ""
        for pub_id in ref_node.getElementsByTagName("pub-id"):
            if pub_id.getAttribute("pub-id-type") == "doi":
                doi = xml_to_text(pub_id)

        issue = get_value_in_tag(ref_node, "issue")
        page = get_value_in_tag(ref_node, "fpage")
        page_last = get_value_in_tag(ref_node, "lpage")
        title = get_value_in_tag(ref_node, "source")
        volume = get_value_in_tag(ref_node, "volume")
        year = get_value_in_tag(ref_node, "year")
        ext_link = format_arxiv_id(
            super(NLMParser, self).get_ref_link(ref_node, "arxiv"))
        if ref_type != 'journal':
            plain_text = get_value_in_tag(ref_node, "mixed-citation")
        collected.append((label, authors, doi, issue, page, page_last,
                          title, volume, year, ext_link, plain_text))
    self.references = collected
def get_keywords(self, xml):
    """Return the ``ce:text`` value of every ``ce:keyword`` element of ``xml``.

    Any failure while collecting them is reported on stderr and None is
    returned.
    """
    try:
        keywords = []
        for keyword_node in xml.getElementsByTagName("ce:keyword"):
            keywords.append(get_value_in_tag(keyword_node, "ce:text"))
        return keywords
    except Exception:
        print >> sys.stderr, "Can't find keywords"
    return None