def get_arxiv_id(self, xml):
    custom_metas = xml.getElementsByTagName("custom-meta")
    ext_link = None
    for meta in custom_metas:
        if get_value_in_tag(meta, "meta-name") == "arxiv-id":
            ext_link = format_arxiv_id(
                get_value_in_tag(meta, "meta-value").encode('utf-8'))
    return ext_link
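# A hypothetical usage sketch for the method above: it scans the JATS/NLM
# <custom-meta> block for an "arxiv-id" entry and normalises it through
# format_arxiv_id. The XML fragment and the NLMParser() instantiation below
# are illustrative assumptions, not taken from a real package.
from xml.dom.minidom import parseString

sample = parseString(
    '<article><custom-meta>'
    '<meta-name>arxiv-id</meta-name>'
    '<meta-value>1312.1300</meta-value>'
    '</custom-meta></article>'
)
# parser = NLMParser()                # assuming the parser class shown below
# print(parser.get_arxiv_id(sample))  # expected: "arXiv:1312.1300"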
def get_references(self, xml_doc):
    for ref in xml_doc.getElementsByTagName("ce:bib-reference"):
        label = get_value_in_tag(ref, "ce:label")
        if self.CONSYN:
            innerrefs = ref.getElementsByTagName("sb:reference")
            if not innerrefs:
                yield self._get_ref(ref, label)
            for inner in innerrefs:
                yield self._get_ref(inner, label)
        else:
            authors = []
            for author in ref.getElementsByTagName("sb:author"):
                given_name = get_value_in_tag(author, "ce:given-name")
                surname = get_value_in_tag(author, "ce:surname")
                if given_name:
                    name = "%s, %s" % (surname, given_name)
                else:
                    name = surname
                authors.append(name)
            doi = get_value_in_tag(ref, "ce:doi")
            issue = get_value_in_tag(ref, "sb:issue")
            page = get_value_in_tag(ref, "sb:first-page")
            title = get_value_in_tag(ref, "sb:maintitle")
            volume = get_value_in_tag(ref, "sb:volume-nr")
            tmp_issues = ref.getElementsByTagName('sb:issue')
            if tmp_issues:
                year = get_value_in_tag(tmp_issues[0], "sb:date")[:4]
            else:
                year = ''
            textref = ref.getElementsByTagName("ce:textref")
            if textref:
                textref = xml_to_text(textref[0])
            ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
            yield (label, authors, doi, issue, page, title, volume, year,
                   textref, ext_link)
def get_references(self, xml):
    references = []
    for reference in xml.getElementsByTagName("ref"):
        plain_text = None
        ref_type = (reference.getElementsByTagName('citation')[0]
                    .getAttribute('publication-type').encode('utf-8'))
        label = get_value_in_tag(reference, "label").strip('.')
        authors = []
        for author in reference.getElementsByTagName("name"):
            given_name = get_value_in_tag(author, "given-names")
            surname = get_value_in_tag(author, "surname")
            if given_name:
                name = "%s, %s" % (surname, given_name)
            else:
                name = surname
            if name.strip().split() == []:
                name = get_value_in_tag(author, "string-name")
            authors.append(name)
        doi_tag = reference.getElementsByTagName("pub-id")
        doi = ""
        for tag in doi_tag:
            if tag.getAttribute("pub-id-type") == "doi":
                doi = xml_to_text(tag)
        issue = get_value_in_tag(reference, "issue")
        page = get_value_in_tag(reference, "fpage")
        page_last = get_value_in_tag(reference, "lpage")
        title = get_value_in_tag(reference, "source")
        volume = get_value_in_tag(reference, "volume")
        year = get_value_in_tag(reference, "year")
        ext_link = format_arxiv_id(
            super(NLMParser, self).get_ref_link(reference, "arxiv"))
        if ref_type != 'journal':
            plain_text = get_value_in_tag(reference, "mixed-citation")
        references.append((label, authors, doi, issue, page, page_last,
                           title, volume, year, ext_link, plain_text))
    self.references = references
def test_format_arxiv_id(self):
    """Test arXiv formatting."""
    self.assertEqual(format_arxiv_id("arXiv:1312.1300"), "arXiv:1312.1300")
    self.assertEqual(format_arxiv_id("1312.1300"), "arXiv:1312.1300")
    self.assertEqual(format_arxiv_id("1312.13005"), "arXiv:1312.13005")
    self.assertEqual(format_arxiv_id("arxiv:hep/1312002"), "hep/1312002")
    self.assertEqual(format_arxiv_id("hep/1312002"), "hep/1312002")
    self.assertEqual(format_arxiv_id("arXiv:1234.12345"), "arXiv:1234.12345")
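# For reference, a minimal sketch of a format_arxiv_id helper that would satisfy
# the expectations in the test above. This is an illustrative assumption, not the
# library's actual implementation (a later variant of this test also suggests an
# optional flag argument for old-style identifiers): new-style identifiers
# (YYMM.NNNNN) are given the canonical "arXiv:" prefix, while old-style
# identifiers (containing "/") have any prefix stripped.
import re


def format_arxiv_id(arxiv_id):
    """Normalise an arXiv identifier (illustrative sketch only)."""
    if not arxiv_id:
        return arxiv_id
    if re.search(r'\d{4}\.\d{4,5}', arxiv_id):
        # New-style identifier: make sure it carries the canonical prefix.
        if arxiv_id.lower().startswith('arxiv:'):
            return 'arXiv:' + arxiv_id[6:]
        return 'arXiv:' + arxiv_id
    if arxiv_id.lower().startswith('arxiv:'):
        # Old-style identifier: drop the prefix.
        return arxiv_id[6:]
    return arxiv_id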
def get_references(self, xml):
    references = []
    for reference in xml.getElementsByTagName("ref"):
        plain_text = None
        try:
            ref_type = reference.getElementsByTagName('mixed-citation')[0]
            ref_type = ref_type.getAttribute('publication-type').encode('utf-8')
        except:
            ref_type = reference.getElementsByTagName('citation')[0]
            ref_type = ref_type.getAttribute('publication-type').encode('utf-8')
        label = get_value_in_tag(reference, "label").strip('.')
        authors = []
        for author in reference.getElementsByTagName("name"):
            given_name = get_value_in_tag(author, "given-names")
            surname = get_value_in_tag(author, "surname")
            if given_name:
                name = "%s, %s" % (surname, given_name)
            else:
                name = surname
            if name.strip().split() == []:
                name = get_value_in_tag(author, "string-name")
            authors.append(name)
        doi_tag = reference.getElementsByTagName("pub-id")
        doi = ""
        for tag in doi_tag:
            if tag.getAttribute("pub-id-type") == "doi":
                doi = xml_to_text(tag)
        issue = get_value_in_tag(reference, "issue")
        page = get_value_in_tag(reference, "fpage")
        page_last = get_value_in_tag(reference, "lpage")
        title = get_value_in_tag(reference, "source")
        volume = get_value_in_tag(reference, "volume")
        year = get_value_in_tag(reference, "year")
        ext_link = format_arxiv_id(self.get_ref_link(reference, "arxiv"))
        if ref_type != 'journal':
            try:
                plain_text = get_value_in_tag(reference, "mixed-citation",
                                              tag_to_remove=self.tag_to_remove)
            except:
                plain_text = get_value_in_tag(reference, "citation",
                                              tag_to_remove=self.tag_to_remove)
        references.append((label, authors, doi, issue, page, page_last,
                           title, volume, year, ext_link, plain_text))
    self.references = references
def test_format_arxiv_id(self):
    """Test arXiv formatting."""
    self.assertEqual(format_arxiv_id("arXiv:1312.1300"), "arXiv:1312.1300")
    self.assertEqual(format_arxiv_id("1312.1300"), "arXiv:1312.1300")
    self.assertEqual(format_arxiv_id("arxiv:hep/1312/1300", True), "hep/1312/1300")
    self.assertEqual(format_arxiv_id("arxiv:hep/1312/1300"), "arxiv:hep/1312/1300")
def _get_reference(self, ref):
    """Retrieve the data for a reference."""
    label = get_value_in_tag(ref, 'label')
    label = re.sub(r'\D', '', label)
    for innerref in ref.getElementsByTagName('mixed-citation'):
        ref_type = innerref.getAttribute('publication-type')
        institution = get_value_in_tag(innerref, 'institution')
        report_no = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'other':
                if tag.hasChildNodes():
                    report_no = get_all_text(tag)
        doi = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'doi':
                doi = xml_to_text(tag)
        collaboration = get_value_in_tag(innerref, 'collab')
        authors = []
        person_groups = innerref.getElementsByTagName('person-group')
        for author_group in person_groups:
            if author_group.getAttribute('person-group-type') == 'author':
                for author in author_group.getElementsByTagName('string-name'):
                    if author.hasChildNodes():
                        authors.append(get_all_text(author))
        editors = []
        for editor_group in person_groups:
            if editor_group.getAttribute('person-group-type') == 'editor':
                for editor in editor_group.getElementsByTagName('string-name'):
                    if editor.hasChildNodes():
                        editors.append(get_all_text(editor))
        journal = get_value_in_tag(innerref, 'source')
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        volume += get_value_in_tag(innerref, 'volume')
        if journal == 'J.High Energy Phys.' or journal == 'JHEP':
            issue = get_value_in_tag(innerref, 'issue')
            volume = volume[2:] + issue
            journal = 'JHEP'
        page = get_value_in_tag(innerref, 'page-range')
        year = get_value_in_tag(innerref, 'year')
        external_link = get_value_in_tag(innerref, 'ext-link')
        arxiv = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'arxiv':
                if tag.hasChildNodes():
                    arxiv = get_all_text(tag)
        arxiv = format_arxiv_id(arxiv)
        publisher = get_value_in_tag(innerref, 'publisher-name')
        publisher_location = get_value_in_tag(innerref, 'publisher-loc')
        if publisher_location:
            publisher = publisher_location + ': ' + publisher
        unstructured_text = []
        for child in innerref.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text = child.nodeValue.strip()
                text = re.sub(r'[\[\]\(\.;\)]', '', text).strip()
                if text.startswith(','):
                    text = text[1:].strip()
                if text.endswith('Report No'):
                    text = institution + " " + text
                    institution = ''
                    text = text.strip()
                elif text.endswith(' ed'):
                    text += '.'
                elif text.endswith('PhD thesis,'):
                    if institution:
                        text += ' ' + institution
                        institution = ''
                    else:
                        text = text[:-1]
                elif text.startswith('Seminar,'):
                    article_title = get_value_in_tag(innerref, 'article-title')
                    text = institution + " Seminar, \"" + article_title + "\""
                    institution = ''
                elif text == u'\u201d':
                    text = ''
                ignore_text = ['in', 'pp', 'edited by']
                if text.startswith('Vol'):
                    temp = re.sub(r'\D', '', text)
                    if temp:
                        volume += temp
                elif len(text) > 1 and text not in ignore_text\
                        and not (text.isdigit() or text[:-1].isdigit()):
                    unstructured_text.append(text)
        if unstructured_text:
            unstructured_text = " ".join(unstructured_text)
        if ref_type == 'book':
            if volume and not volume.lower().startswith('vol'):
                volume = 'Vol ' + volume
            if volume and page:
                volume = volume + ', pp ' + page
        yield ref_type, doi, authors, collaboration, journal, volume, page,\
            year, label, arxiv, publisher, institution, unstructured_text,\
            external_link, report_no, editors
def _get_ref(self, ref, label):
    doi = get_value_in_tag(ref, "ce:doi")
    page = get_value_in_tag(ref, "sb:first-page")
    issue = get_value_in_tag(ref, "sb:issue")
    title = get_value_in_tag(ref, "sb:maintitle")
    volume = get_value_in_tag(ref, "sb:volume-nr")
    tmp_issues = ref.getElementsByTagName('sb:issue')
    if tmp_issues:
        year = get_value_in_tag(tmp_issues[0], "sb:date")
    else:
        year = ''
    textref = ref.getElementsByTagName("ce:textref")
    if textref:
        textref = xml_to_text(textref[0])
    ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
    authors = []
    for author in ref.getElementsByTagName("sb:author"):
        given_name = get_value_in_tag(author, "ce:given-name")
        surname = get_value_in_tag(author, "ce:surname")
        if given_name:
            name = "%s, %s" % (surname, given_name)
        else:
            name = surname
        authors.append(name)
    if ext_link and ext_link.lower().startswith('arxiv'):
        # check if the identifier contains
        # digits separated by dot
        regex = r'\d*\.\d*'
        if not re.search(regex, ext_link):
            ext_link = ext_link[6:]
    comment = get_value_in_tag(ref, "sb:comment")
    links = []
    for link in ref.getElementsByTagName("ce:inter-ref"):
        links.append(xml_to_text(link))
    title = ""
    try:
        container = ref.getElementsByTagName("sb:contribution")[0]
        title = container.getElementsByTagName("sb:maintitle")[0]
        title = xml_to_text(title)
    except IndexError:
        title = ''
    except TypeError:
        title = ''
    isjournal = ref.getElementsByTagName("sb:issue")
    journal = ""
    if isjournal:
        isjournal = True
        if not page:
            page = comment
        container = ref.getElementsByTagName("sb:issue")[0]
        journal = get_value_in_tag(container, "sb:maintitle")
    edited_book = ref.getElementsByTagName("sb:edited-book")
    editors = []
    book_title = ""
    publisher = ""
    if edited_book:
        # treat as a journal
        if ref.getElementsByTagName("sb:book-series"):
            container = ref.getElementsByTagName("sb:book-series")[0]
            journal = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(ref, "sb:date")
            isjournal = True
        # conference
        elif ref.getElementsByTagName("sb:conference"):
            container = ref.getElementsByTagName("sb:edited-book")[0]
            maintitle = get_value_in_tag(container, "sb:maintitle")
            conference = get_value_in_tag(container, "sb:conference")
            date = get_value_in_tag(container, "sb:date")
            # use this variable in order to get in the 'm' field
            publisher = maintitle + ", " + conference + ", " + date
        else:
            container = ref.getElementsByTagName("sb:edited-book")[0]
            if ref.getElementsByTagName("sb:editors"):
                for editor in ref.getElementsByTagName("sb:editor"):
                    surname = get_value_in_tag(editor, "ce:surname")
                    firstname = get_value_in_tag(editor, "ce:given-name")
                    editors.append("%s,%s" % (surname, firstname))
            if title:
                book_title = get_value_in_tag(container, "sb:maintitle")
            else:
                title = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(container, "sb:date")
            if ref.getElementsByTagName("sb:publisher"):
                container = ref.getElementsByTagName("sb:publisher")[0]
                location = get_value_in_tag(container, "sb:location")
                publisher = get_value_in_tag(container, "sb:name")
                if location:
                    publisher = location + ": " + publisher
    if ref.getElementsByTagName("sb:book"):
        if ref.getElementsByTagName("sb:book-series"):
            book_series = ref.getElementsByTagName("sb:book-series")[0]
            title += ", " + \
                get_value_in_tag(book_series, "sb:maintitle")
            title += ", " + \
                get_value_in_tag(book_series, "sb:volume-nr")
        publisher = get_value_in_tag(ref, "sb:publisher")
    if not year:
        year = get_value_in_tag(ref, "sb:date")
    year = re.sub(r'\D', '', year)
    return (label, authors, doi, issue, page, title, volume, year, textref,
            ext_link, isjournal, comment, journal, publisher, editors,
            book_title)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    xml = super(NLMParser, self).get_article(f_path)
    rec = create_record()
    title = super(NLMParser, self).get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c', super(NLMParser, self).get_publication_date(xml, logger))])
    journal, issn, volume, issue, first_page, last_page, year, doi = \
        super(NLMParser, self).get_publication_information(xml)
    journal = "PTEP"  # Let's override the journal information
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    page_count = super(NLMParser, self).get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    arxiv = self.get_arxiv_id(xml)
    if arxiv:
        record_add_field(rec, '037',
                         subfields=[('9', 'arXiv'),
                                    ('a', format_arxiv_id(arxiv))])
    authors = super(NLMParser, self).get_authors(xml)
    first_author = True
    for author in authors:
        if author.get('surname'):
            subfields = [('a', '%s, %s' % (author.get('surname'),
                                           author.get('given_name') or
                                           author.get('initials', '')))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = super(NLMParser, self).get_abstract(xml)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])
    record_add_field(rec, '540', subfields=[
        ('a', 'CC-BY-3.0'),
        ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright = super(NLMParser, self).get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = super(NLMParser, self).get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    ## Oxford is giving us bad keywords. Better ignore them.
    # if keywords['other']:
    #     for keyword in keywords['other']:
    #         record_add_field(rec, '653', ind1='1',
    #                          subfields=[('a', keyword), ('9', 'author')])
    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal), ('v', volume), ('n', issue),
                        ('c', pages), ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    self.get_references(xml)
    for label, authors, doi, issue, page, page_last, title, volume, year, \
            ext_link, plain_text in self.references:
        subfields = []
        if doi:
            subfields.append(('a', doi))
        for author in authors:
            subfields.append(('h', author))
        if issue:
            subfields.append(('n', issue))
        if label:
            subfields.append(('o', label))
        if year:
            subfields.append(('y', year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if title and volume and year and page:
            subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s' % (title, volume, year, page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
    f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')
    if exists(f_path_pdf):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdf), ('n', 'main')])
    else:
        try:
            raise MissingFFTError
        except:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF." % (doi,))
            logger.warning("Record %s doesn't contain PDF file." % (doi,))
    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdfa), ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    else:
        try:
            raise MissingFFTError
        except:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF/A." % (doi,))
            logger.warning("Record %s doesn't contain PDF/A file." % (doi,))
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def _get_ref(self, ref, label):
    doi = get_value_in_tag(ref, "ce:doi")
    page = get_value_in_tag(ref, "sb:first-page")
    if not page:
        page = get_value_in_tag(ref, "sb:article-number")
    issue = get_value_in_tag(ref, "sb:issue")
    title = get_value_in_tag(ref, "sb:maintitle")
    volume = get_value_in_tag(ref, "sb:volume-nr")
    tmp_issues = ref.getElementsByTagName('sb:issue')
    if tmp_issues:
        year = get_value_in_tag(tmp_issues[0], "sb:date")
    else:
        year = ''
    textref = ref.getElementsByTagName("ce:textref")
    if textref:
        textref = xml_to_text(textref[0])
    ext_link = format_arxiv_id(self.get_ref_link(ref, 'arxiv'))
    authors = []
    for author in ref.getElementsByTagName("sb:author"):
        given_name = get_value_in_tag(author, "ce:given-name")
        surname = get_value_in_tag(author, "ce:surname")
        if given_name:
            name = "%s, %s" % (surname, given_name)
        else:
            name = surname
        authors.append(name)
    if ext_link and ext_link.lower().startswith('arxiv'):
        # check if the identifier contains
        # digits separated by dot
        regex = r'\d*\.\d*'
        if not re.search(regex, ext_link):
            ext_link = ext_link[6:]
    comment = get_value_in_tag(ref, "sb:comment")
    links = []
    for link in ref.getElementsByTagName("ce:inter-ref"):
        links.append(xml_to_text(link))
    title = ""
    try:
        container = ref.getElementsByTagName("sb:contribution")[0]
        title = container.getElementsByTagName("sb:maintitle")[0]
        title = xml_to_text(title)
    except IndexError:
        title = ''
    except TypeError:
        title = ''
    isjournal = ref.getElementsByTagName("sb:issue")
    journal = ""
    if isjournal:
        isjournal = True
        if not page:
            page = comment
        container = ref.getElementsByTagName("sb:issue")[0]
        journal = get_value_in_tag(container, "sb:maintitle")
    edited_book = ref.getElementsByTagName("sb:edited-book")
    editors = []
    book_title = ""
    publisher = ""
    if edited_book:
        # treat as a journal
        if ref.getElementsByTagName("sb:book-series"):
            container = ref.getElementsByTagName("sb:book-series")[0]
            journal = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(ref, "sb:date")
            isjournal = True
        # conference
        elif ref.getElementsByTagName("sb:conference"):
            container = ref.getElementsByTagName("sb:edited-book")[0]
            maintitle = get_value_in_tag(container, "sb:maintitle")
            conference = get_value_in_tag(container, "sb:conference")
            date = get_value_in_tag(container, "sb:date")
            # use this variable in order to get in the 'm' field
            publisher = maintitle + ", " + conference + ", " + date
        else:
            container = ref.getElementsByTagName("sb:edited-book")[0]
            if ref.getElementsByTagName("sb:editors"):
                for editor in ref.getElementsByTagName("sb:editor"):
                    surname = get_value_in_tag(editor, "ce:surname")
                    firstname = get_value_in_tag(editor, "ce:given-name")
                    editors.append("%s,%s" % (surname, firstname))
            if title:
                book_title = get_value_in_tag(container, "sb:maintitle")
            else:
                title = get_value_in_tag(container, "sb:maintitle")
            year = get_value_in_tag(container, "sb:date")
            if ref.getElementsByTagName("sb:publisher"):
                container = ref.getElementsByTagName("sb:publisher")[0]
                location = get_value_in_tag(container, "sb:location")
                publisher = get_value_in_tag(container, "sb:name")
                if location:
                    publisher = location + ": " + publisher
    if ref.getElementsByTagName("sb:book"):
        if ref.getElementsByTagName("sb:book-series"):
            book_series = ref.getElementsByTagName("sb:book-series")[0]
            title += ", " + \
                get_value_in_tag(book_series, "sb:maintitle")
            title += ", " + \
                get_value_in_tag(book_series, "sb:volume-nr")
        publisher = get_value_in_tag(ref, "sb:publisher")
    if not year:
        year = get_value_in_tag(ref, "sb:date")
    year = re.sub(r'\D', '', year)
    return (label, authors, doi, issue, page, title, volume, year, textref,
            ext_link, isjournal, comment, journal, publisher, editors,
            book_title)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    xml = super(NLMParser, self).get_article(f_path)
    rec = create_record()
    title = super(NLMParser, self).get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c', super(NLMParser, self).get_publication_date(xml, logger))])
    journal, issn, volume, issue, first_page, last_page, year, doi = \
        super(NLMParser, self).get_publication_information(xml)
    journal = "PTEP"  # Let's override the journal information
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    page_count = super(NLMParser, self).get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    arxiv = self.get_arxiv_id(xml)
    if arxiv:
        record_add_field(rec, '037',
                         subfields=[('9', 'arXiv'),
                                    ('a', format_arxiv_id(arxiv))])
    authors = super(NLMParser, self).get_authors(xml)
    first_author = True
    for author in authors:
        if author.get('surname'):
            subfields = [('a', '%s, %s' % (author.get('surname'),
                                           author.get('given_name') or
                                           author.get('initials', '')))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = super(NLMParser, self).get_abstract(xml)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])
    record_add_field(rec, '540', subfields=[
        ('a', 'CC-BY-3.0'),
        ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright = super(NLMParser, self).get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = super(NLMParser, self).get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    ## Oxford is giving us bad keywords. Better ignore them.
    # if keywords['other']:
    #     for keyword in keywords['other']:
    #         record_add_field(rec, '653', ind1='1',
    #                          subfields=[('a', keyword), ('9', 'author')])
    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal), ('v', volume), ('n', issue),
                        ('c', pages), ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    self.get_references(xml)
    for label, authors, doi, issue, page, page_last, title, volume, year, \
            ext_link, plain_text in self.references:
        subfields = []
        if doi:
            subfields.append(('a', doi))
        for author in authors:
            subfields.append(('h', author))
        if issue:
            subfields.append(('n', issue))
        if label:
            subfields.append(('o', label))
        if year:
            subfields.append(('y', year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if title and volume and year and page:
            subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s' % (title, volume, year, page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
    f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')
    if exists(f_path_pdf):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdf), ('n', 'main')])
    else:
        try:
            raise MissingFFTError
        except:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF." % (doi,))
            logger.warning("Record %s doesn't contain PDF file." % (doi,))
    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdfa), ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    else:
        try:
            raise MissingFFTError
        except:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF/A." % (doi,))
            logger.warning("Record %s doesn't contain PDF/A file." % (doi,))
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
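# A minimal driver sketch showing how get_record might be invoked over a batch
# of package files. The directory layout, the bare NLMParser() constructor call
# and the publisher/collection values are illustrative assumptions, not part of
# the code above; it also assumes the NLMParser class is importable in scope.
import logging
from glob import glob

logger = logging.getLogger("oup_harvest")
parser = NLMParser()  # assuming the parser class shown above can be built this way

records = []
for f_path in sorted(glob("/tmp/oup_package/*.xml")):  # hypothetical package location
    records.append(parser.get_record(f_path,
                                     publisher="Oxford University Press",
                                     collection="SCOAP3",
                                     logger=logger))
# Each entry in `records` is the MARCXML string returned by record_xml_output().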
def _get_reference(self, ref):
    """Retrieve the data for a reference."""
    label = get_value_in_tag(ref, 'label')
    label = re.sub(r'\D', '', label)
    for innerref in ref.getElementsByTagName('mixed-citation'):
        ref_type = innerref.getAttribute('publication-type')
        institution = get_value_in_tag(innerref, 'institution')
        report_no = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'other':
                if tag.hasChildNodes():
                    report_no = get_all_text(tag)
        doi = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'doi':
                doi = xml_to_text(tag)
        collaboration = get_value_in_tag(innerref, 'collab')
        authors = []
        person_groups = innerref.getElementsByTagName('person-group')
        for author_group in person_groups:
            if author_group.getAttribute('person-group-type') == 'author':
                for author in author_group.getElementsByTagName('string-name'):
                    if author.hasChildNodes():
                        authors.append(get_all_text(author))
        editors = []
        for editor_group in person_groups:
            if editor_group.getAttribute('person-group-type') == 'editor':
                for editor in editor_group.getElementsByTagName('string-name'):
                    if editor.hasChildNodes():
                        editors.append(get_all_text(editor))
        journal = get_value_in_tag(innerref, 'source')
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        volume += get_value_in_tag(innerref, 'volume')
        if journal == 'J.High Energy Phys.' or journal == 'JHEP':
            issue = get_value_in_tag(innerref, 'issue')
            volume = volume[2:] + issue
            journal = 'JHEP'
        page = get_value_in_tag(innerref, 'page-range')
        year = get_value_in_tag(innerref, 'year')
        external_link = get_value_in_tag(innerref, 'ext-link')
        arxiv = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'arxiv':
                if tag.hasChildNodes():
                    arxiv = get_all_text(tag)
        arxiv = format_arxiv_id(arxiv)
        publisher = get_value_in_tag(innerref, 'publisher-name')
        publisher_location = get_value_in_tag(innerref, 'publisher-loc')
        if publisher_location:
            publisher = publisher_location + ': ' + publisher
        unstructured_text = []
        for child in innerref.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text = child.nodeValue.strip()
                text = re.sub(r'[\[\]\(\.;\)]', '', text).strip()
                if text.startswith(','):
                    text = text[1:].strip()
                if text.endswith('Report No'):
                    text = institution + " " + text
                    institution = ''
                    text = text.strip()
                elif text.endswith(' ed'):
                    text += '.'
                elif text.endswith('PhD thesis,'):
                    if institution:
                        text += ' ' + institution
                        institution = ''
                    else:
                        text = text[:-1]
                elif text.startswith('Seminar,'):
                    article_title = get_value_in_tag(innerref, 'article-title')
                    text = institution + " Seminar, \"" + article_title + "\""
                    institution = ''
                elif text == u'\u201d':
                    text = ''
                ignore_text = ['in', 'pp', 'edited by']
                if text.startswith('Vol'):
                    temp = re.sub(r'\D', '', text)
                    if temp:
                        volume += temp
                elif len(text) > 1 and text not in ignore_text\
                        and not (text.isdigit() or text[:-1].isdigit()):
                    unstructured_text.append(text)
        if unstructured_text:
            unstructured_text = " ".join(unstructured_text)
        if ref_type == 'book':
            if volume and not volume.lower().startswith('vol'):
                volume = 'Vol ' + volume
            if volume and page:
                volume = volume + ', pp ' + page
        yield ref_type, doi, authors, collaboration, journal, volume, page,\
            year, label, arxiv, publisher, institution, unstructured_text,\
            external_link, report_no, editors