def test_fix_journal_name(self):
    """Check the pair fix_journal_name returns for several raw names."""
    # Each entry pairs a raw journal string with the tuple expected back.
    expectations = (
        ("A&A", ('Astron.Astrophys.', "")),
        ("A&A B", ('Astron.Astrophys.', "B")),
        ("A&A.B", ('A&A.', "B")),
        ("A&AB.", ("A&AB.", "")),
    )
    for raw_name, expected in expectations:
        outcome = fix_journal_name(raw_name, journal_mappings)
        self.assertEqual(outcome, expected)
def _get_references(self):
    """Yield the bibliographic data of every ``<ref>`` in the document.

    For each ``<ref>`` element of ``self.document`` a tuple
    ``(label, ref_type, text_ref, ext_link, authors, year, source,
    volume, page)`` is yielded. ``ref_type`` is the ``publication-type``
    of the last ``<mixed-citation>`` of the reference, or ``''`` when the
    reference has none.
    """
    for ref in self.document.getElementsByTagName('ref'):
        # Only the digits of the id attribute are kept as label.
        label = sub(r'\D', '', ref.getAttribute('id'))
        # Reset for every reference: otherwise a <ref> without a
        # <mixed-citation> would silently reuse the type of the previous
        # reference (or hit an unbound name on the very first one).
        ref_type = ''
        text_ref = ''
        ext_link = ''
        for mixed in ref.getElementsByTagName('mixed-citation'):
            ref_type = mixed.getAttribute('publication-type')
            if ref_type in ('thesis', 'conf-proc'):
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type in ('other', 'web'):
                text_ref = get_value_in_tag(ref, 'mixed-citation')
                ext_link = get_value_in_tag(mixed, 'ext-link')
            elif ref_type == 'book':
                text_ref = xml_to_text(mixed)

        authors = []
        for auth in ref.getElementsByTagName('string-name'):
            surname = get_value_in_tag(auth, 'surname')
            given_names = collapse_initials(
                get_value_in_tag(auth, 'given-names'))
            authors.append('%s, %s' % (surname, given_names))

        year = get_value_in_tag(ref, 'year')
        source = get_value_in_tag(ref, 'source')
        volume = get_value_in_tag(ref, 'volume')
        page = get_value_in_tag(ref, 'fpage')
        if ref_type == 'journal':
            # fix_journal_name may split a volume prefix off the journal
            # name; when it does, that prefix goes in front of the volume.
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield (label, ref_type, text_ref, ext_link, authors, year,
               source, volume, page)
def _get_references(self):
    """Yield the bibliographic data of every ``<ref>`` in the document.

    For each ``<ref>`` element of ``self.document`` a tuple
    ``(label, ref_type, text_ref, ext_link, authors, year, source,
    volume, page)`` is yielded. ``ref_type`` is the ``publication-type``
    of the last ``<mixed-citation>`` of the reference, or ``""`` when the
    reference has none.
    """
    for ref in self.document.getElementsByTagName("ref"):
        # Only the digits of the id attribute are kept as label.
        label = sub(r"\D", "", ref.getAttribute("id"))
        # Reset for every reference: otherwise a <ref> without a
        # <mixed-citation> would silently reuse the type of the previous
        # reference (or hit an unbound name on the very first one).
        ref_type = ""
        text_ref = ""
        ext_link = ""
        for mixed in ref.getElementsByTagName("mixed-citation"):
            ref_type = mixed.getAttribute("publication-type")
            if ref_type in ("thesis", "conf-proc"):
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type in ("other", "web"):
                text_ref = get_value_in_tag(ref, "mixed-citation")
                ext_link = get_value_in_tag(mixed, "ext-link")
            elif ref_type == "book":
                text_ref = xml_to_text(mixed)

        authors = []
        for auth in ref.getElementsByTagName("string-name"):
            surname = get_value_in_tag(auth, "surname")
            given_names = collapse_initials(
                get_value_in_tag(auth, "given-names"))
            authors.append("%s, %s" % (surname, given_names))

        year = get_value_in_tag(ref, "year")
        source = get_value_in_tag(ref, "source")
        volume = get_value_in_tag(ref, "volume")
        page = get_value_in_tag(ref, "fpage")
        if ref_type == "journal":
            # fix_journal_name may split a volume prefix off the journal
            # name; when it does, that prefix goes in front of the volume.
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                volume = vol + volume
        yield (label, ref_type, text_ref, ext_link, authors, year,
               source, volume, page)
def _get_publition_information(self):
    """Gather the publication data of the current document.

    :returns: tuple ``(journal, volume, issue, year, date, doi,
        article_id)``; the volume is the prefix returned by
        ``fix_journal_name`` followed by the ``volume`` tag value, and
        the year is the ``copyright-year`` tag value.
    """
    raw_journal = self._get_journal()
    date = self._get_date()
    doi = self._get_doi()
    journal, volume_prefix = fix_journal_name(raw_journal,
                                              self.journal_mappings)

    def read_tag(name):
        # Shorthand for reading one tag value from the document.
        return get_value_in_tag(self.document, name)

    article_id = read_tag('elocation-id')
    volume = volume_prefix + read_tag('volume')
    issue = read_tag('issue')
    year = read_tag('copyright-year')
    return (journal, volume, issue, year, date, doi, article_id)
def get_publication_information(self, xml_doc, path='', timeout=60):
    """Return the publication metadata of an article as a 9-tuple.

    The tuple is ``(journal, issn, volume, issue, first_page, last_page,
    year, start_date, doi)``.

    :param xml_doc: parsed XML document of the article.
    :param path: path of the article; its last component is used to build
        the ScienceDirect URL queried when the XML carries no volume.
    :param timeout: timeout handed to the HTTP request of that fallback.
    :returns: the 9-tuple described above. Outside CONSYN mode the first
        eight values come from ``self._dois`` (empty strings when the DOI
        is not listed there).
    """
    if not self.CONSYN:
        doi = self._get_doi(xml_doc)
        try:
            return self._dois[doi] + (doi, )
        except KeyError:
            return ('', '', '', '', '', '', '', '', doi)

    publication = get_value_in_tag(xml_doc, "prism:publicationName")
    issn = get_value_in_tag(xml_doc, "prism:issn")
    issue = get_value_in_tag(xml_doc, "prism:number")
    first_page = get_value_in_tag(xml_doc, "prism:startingPage")
    last_page = get_value_in_tag(xml_doc, "prism:endingPage")

    # The publication name may look like "<journal>, Section <X>".
    journal = publication.split(",")[0]
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    try:
        vol = publication.split(",")[1].strip()
        if vol.startswith("Section"):
            vol = vol[7:].strip()
        if vol and not volume:
            volume = vol
    except IndexError:
        pass

    vol = get_value_in_tag(xml_doc, "prism:volume")
    # Compare by value: ``is ""`` only worked by accident of string
    # interning and is False for an empty unicode string.
    if vol == "" and path != "":
        # if volume is not present try to harvest it
        try:
            session = requests.session()
            # NOTE(review): there is no "/" between "pii" and the file
            # name, unlike the URL built in get_record -- confirm.
            url = 'http://www.sciencedirect.com/science/article/pii'\
                + path.split('/')[-1]
            headers = {'user-agent': make_user_agent()}
            r = session.get(url, headers=headers, timeout=timeout)
            parsed_html = BeautifulSoup(r.text)
            info = parsed_html.body.find(
                'p', attrs={'class': 'volIssue'}).text.split()
            for s in info:
                # Presumably u'\xe2' ... u'\x93' is a mis-decoded en dash
                # separating first and last page -- TODO confirm.
                if unicode(s).find(u'\xe2') > 0:
                    first_page = s.rsplit(u'\xe2')[0]
                    last_page = s.rsplit(u'\x93')[1]
            if info[1].lower() != 'online':
                vol = info[1][:-1]
        except Exception:
            # Harvesting is best effort: any failure leaves the values
            # read from the XML untouched.
            pass
    if vol:
        volume += vol

    start_date = self.get_publication_date(xml_doc)
    year = start_date.split("-")[0]
    doi = get_value_in_tag(xml_doc, "ce:doi")
    return (journal, issn, volume, issue, first_page, last_page, year,
            start_date, doi)
def _get_publication_information(self):
    """Gather the publication data of the current document.

    :returns: tuple ``(journal, volume, issue, year, date, doi, page,
        fpage, lpage)``; the year is the first four characters of the
        date and ``page`` is the ``elocation-id`` tag value.
    """
    raw_journal = self._get_journal()
    date = self._get_date()
    doi = self._get_doi()

    def read_tag(name):
        # Shorthand for reading one tag value from the document.
        return get_value_in_tag(self.document, name)

    issue = read_tag('issue')
    journal, volume_prefix = fix_journal_name(raw_journal,
                                              self.journal_mappings)
    volume = volume_prefix + read_tag('volume')
    page = read_tag('elocation-id')
    fpage = read_tag('fpage')
    lpage = read_tag('lpage')
    return (journal, volume, issue, date[:4], date, doi, page, fpage, lpage)
def get_publication_information(self, xml_doc, path='', timeout=60):
    """Return the publication metadata of an article as a 9-tuple.

    The tuple is ``(journal, issn, volume, issue, first_page, last_page,
    year, start_date, doi)``.

    :param xml_doc: parsed XML document of the article.
    :param path: path of the article; its last component is used to build
        the ScienceDirect URL queried when the XML carries no volume.
    :param timeout: timeout handed to the HTTP request of that fallback.
    :returns: the 9-tuple described above. Outside CONSYN mode the first
        eight values come from ``self._dois`` (empty strings when the DOI
        is not listed there).
    """
    if not self.CONSYN:
        doi = self._get_doi(xml_doc)
        try:
            return self._dois[doi] + (doi, )
        except KeyError:
            return ('', '', '', '', '', '', '', '', doi)

    publication = get_value_in_tag(xml_doc, "prism:publicationName")
    issn = get_value_in_tag(xml_doc, "prism:issn")
    issue = get_value_in_tag(xml_doc, "prism:number")
    first_page = get_value_in_tag(xml_doc, "prism:startingPage")
    last_page = get_value_in_tag(xml_doc, "prism:endingPage")

    # The publication name may look like "<journal>, Section <X>".
    name_parts = publication.split(",")
    journal, volume = fix_journal_name(name_parts[0], self.journal_mappings)
    if len(name_parts) > 1:
        section = name_parts[1].strip()
        if section.startswith("Section"):
            section = section[7:].strip()
        if section and not volume:
            volume = section

    vol = get_value_in_tag(xml_doc, "prism:volume")
    # Compare by value: ``is ""`` only worked by accident of string
    # interning and is False for an empty unicode string.
    if vol == "" and path != "":
        # if volume is not present try to harvest it
        try:
            session = requests.session()
            # NOTE(review): there is no "/" between "pii" and the file
            # name, unlike the URL built in get_record -- confirm.
            url = 'http://www.sciencedirect.com/science/article/pii'\
                + path.split('/')[-1]
            headers = {'user-agent': make_user_agent()}
            r = session.get(url, headers=headers, timeout=timeout)
            parsed_html = BeautifulSoup(r.text)
            info = parsed_html.body.find(
                'p', attrs={'class': 'volIssue'}).text.split()
            for s in info:
                # Presumably u'\xe2' ... u'\x93' is a mis-decoded en dash
                # separating first and last page -- TODO confirm.
                if unicode(s).find(u'\xe2') > 0:
                    first_page = s.rsplit(u'\xe2')[0]
                    last_page = s.rsplit(u'\x93')[1]
            if info[1].lower() != 'online':
                vol = info[1][:-1]
        except Exception:
            # Harvesting is best effort: any failure leaves the values
            # read from the XML untouched.
            pass
    if vol:
        volume += vol

    start_date = self.get_publication_date(xml_doc)
    year = start_date.split("-")[0]
    doi = get_value_in_tag(xml_doc, "ce:doi")
    return (journal, issn, volume, issue, first_page, last_page, year,
            start_date, doi)
def _add_references(self, xml_doc, rec):
    """Add one ``999 C5`` field to ``rec`` per reference of ``xml_doc``.

    In CONSYN mode ``self.get_references`` yields 16-tuples; references
    that only have raw text (no authors) are parsed with
    ``extract_references_from_string_xml``, the others are mapped from
    their structured values. Outside CONSYN mode the references are
    10-tuples and are mapped directly.

    :param xml_doc: parsed XML document of the article.
    :param rec: record structure the fields are added to (modified in
        place through ``record_add_field``).
    """
    def add(subfields, code, value):
        # Append the subfield only when it carries a value.
        if value:
            subfields.append((code, value))

    def clean_label(label):
        # Strip brackets, dots and closing parentheses from the label.
        return re.sub(r"[\[\].)]", "", label)

    if self.CONSYN:
        for (label, authors, doi, issue, page, title, volume, year,
             textref, ext_link, isjournal, comment, journal, publisher,
             editors, book_title) in self.get_references(xml_doc):
            subfields = []
            if textref and not authors:
                # Unstructured reference: parse the raw text.
                textref = textref.replace('"', "'")
                ref_xml = extract_references_from_string_xml(textref)
                dom = xml.dom.minidom.parseString(ref_xml)
                datafield = dom.getElementsByTagName("datafield")[0]
                for field in datafield.getElementsByTagName("subfield"):
                    if field.firstChild is None:
                        # An empty <subfield/> has no text node to read.
                        continue
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code != 's':
                        subfields.append((code, data))
                        continue
                    # 's' holds "journal,volume[,page]": normalise the
                    # journal name and rebuild the value.
                    parts = data.split(',')
                    name, vol = fix_journal_name(parts[0],
                                                 self.journal_mappings)
                    if len(parts) < 2:
                        subfields.append(('s', data))
                    elif len(parts) < 3:
                        subfields.append(('s', name + "," + vol + parts[1]))
                    else:
                        subfields.append(
                            ('s', name + "," + vol + parts[1] + "," + parts[2]))
                if label:
                    subfields.append(('o', clean_label(label)))
            else:
                add(subfields, 'a', doi)
                subfields.extend(('h', author) for author in authors)
                add(subfields, 'n', issue)
                add(subfields, 'r', ext_link)
                if title:
                    subfields.append(('t', title))
                elif textref:
                    subfields.append(('m', textref))
                add(subfields, 'p', publisher)
                add(subfields, 'v', volume)
                add(subfields, 'y', year)
                add(subfields, 'm', comment)
                subfields.extend(('e', editor) for editor in editors)
                add(subfields, 'q', book_title)
                if label:
                    subfields.append(('o', clean_label(label)))
                if journal:
                    journal, vol = fix_journal_name(journal,
                                                    self.journal_mappings)
                    volume = vol + volume
                    if volume and page:
                        journal = journal + "," + volume + "," + page
                    elif volume:
                        journal = journal + "," + volume
                    subfields.append(('s', journal))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)
    else:
        for (label, authors, doi, issue, page, title, volume, year,
             textref, ext_link) in self.get_references(xml_doc):
            subfields = []
            add(subfields, 'a', doi)
            subfields.extend(('h', author) for author in authors)
            add(subfields, 'n', issue)
            add(subfields, 'o', label)
            add(subfields, 'p', page)
            add(subfields, 'r', ext_link)
            if title and volume and year and page:
                subfields.append(
                    ('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif textref:
                subfields.append(('m', textref))
            add(subfields, 't', title)
            add(subfields, 'v', volume)
            add(subfields, 'y', year)
            if subfields:
                record_add_field(
                    rec, '999', ind1='C', ind2='5', subfields=subfields)
def _add_references(self, xml_doc, rec, refextract_callback=None):
    """Add one ``999 C5`` field to ``rec`` per reference of ``xml_doc``.

    References that only have raw text (no authors) are handed to
    ``refextract_callback`` when one is given, otherwise the raw text is
    stored in subfield ``m``. All other references are mapped from the
    structured values yielded by ``self.get_references``.

    :param xml_doc: parsed XML document of the article.
    :param rec: record structure the fields are added to (modified in
        place through ``record_add_field``).
    :param refextract_callback: optional callable taking the raw
        reference text and returning an XML string with a ``datafield``
        element containing ``subfield`` elements.
    """
    def add(subfields, code, value):
        # Append the subfield only when it carries a value.
        if value:
            subfields.append((code, value))

    def clean_label(label):
        # Strip brackets, dots and closing parentheses from the label.
        return re.sub(r"[\[\].)]", "", label)

    for (label, authors, doi, issue, page, title, volume, year,
         textref, ext_link, isjournal, comment, journal, publisher,
         editors, book_title) in self.get_references(xml_doc):
        subfields = []
        if textref and not authors:
            textref = textref.replace('"', "'")
            # NOTE(review): the source was whitespace-flattened; the
            # ``else`` below is read as belonging to
            # ``if refextract_callback`` -- confirm.
            if refextract_callback:
                ref_xml = refextract_callback(textref)
                dom = xml.dom.minidom.parseString(ref_xml)
                datafield = dom.getElementsByTagName("datafield")[0]
                fields = datafield.getElementsByTagName("subfield")
                for field in fields:
                    if field.firstChild is None:
                        # An empty <subfield/> has no text node to read.
                        continue
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 'r':
                        data = fix_dashes(data)
                    subfields.append((code, data))
                if fields:
                    # Mark the reference as produced by refextract.
                    subfields.append(('9', 'refextract'))
            else:
                subfields.append(('m', textref))
            if label:
                subfields.append(('o', clean_label(label)))
        else:
            add(subfields, 'a', doi)
            subfields.extend(('h', author) for author in authors)
            if ext_link:
                ext_link = fix_dashes(ext_link)
                subfields.append(('r', ext_link))
            if title:
                subfields.append(('t', title))
            elif textref:
                subfields.append(('m', textref))
            add(subfields, 'p', publisher)
            add(subfields, 'v', volume)
            add(subfields, 'y', year)
            add(subfields, 'm', comment)
            subfields.extend(('e', editor) for editor in editors)
            add(subfields, 'q', book_title)
            if label:
                subfields.append(('o', clean_label(label)))
            if journal:
                journal, vol = fix_journal_name(journal,
                                                self.journal_mappings)
                volume = vol + volume
                if volume and page:
                    journal = journal + "," + volume + "," + page
                elif volume:
                    journal = journal + "," + volume
                subfields.append(('s', journal))
            # NOTE(review): without a title the raw text was already
            # added above, so it ends up twice in 'm' -- confirm intent.
            if textref:
                subfields.append(('m', textref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
def get_record(self, path=None, no_pdf=False,
               test=False, refextract_callback=None):
    """Convert a record to MARCXML format.

    The record is built field by field from the article XML and then
    serialised with ``record_xml_output``. Which fields are added depends
    on ``self.CONSYN``.

    :param path: path to a record.
    :type path: string
    :param no_pdf: when true, the file-attachment block of the
                   non-CONSYN branch is skipped.
    :type no_pdf: bool
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formatted string
                                of the reference.
    :type refextract_callback: callable

    :returns: marcxml formatted string, or an empty string when the
              serialisation raises UnicodeDecodeError.
    """
    # NOTE(review): this block was recovered from a whitespace-flattened
    # source; confirm the extent of the ``if not test`` and
    # ``if not no_pdf`` blocks against the repository.
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    # Without a publication date, today's date is used instead.
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        record_add_field(
            rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540', subfields=[('a', license),
                                                ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))
    authors = self.get_authors(xml_doc)
    # The first author goes to field 100, every other author to 700.
    first_author = True
    for author in authors:
        author_name = (author['surname'], author.get(
            'given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        # Only the normalised journal name is kept here; the second value
        # returned by fix_journal_name is discarded.
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        # The page count is only added when both page values parse as
        # integers.
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:  # do nothing
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' %
                              (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license:
                # The last four characters of the file name are dropped
                # (presumably the ".xml" extension -- confirm).
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                      + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT', subfields=[('a', path),
                                                        ('t', 'INSPIRE-PUBLIC'),
                                                        ('d', 'Fulltext')])
            else:
                record_add_field(rec, 'FFT', subfields=[('a', path),
                                                        ('t', 'Elsevier'),
                                                        ('o', 'HIDDEN')])
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'Citeable')])
        record_add_field(rec, '980', subfields=[('a', 'Published')])
        self._add_references(xml_doc, rec, refextract_callback)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec,
                         '540',
                         subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(
                    rec, '653', ind1='1',
                    subfields=[('a', keyword), ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        # Drop subfields whose value is empty or a lone dash.
        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('n', issue),
                                                            ('c', pages),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # NOTE(review): the query ends with an unbalanced double
            # quote after DELETED -- confirm whether it is intended.
            query = '0247_a:"%s" AND NOT 980:DELETED"' % (doi,)
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                # Re-attach the PDF/A of the first matching record; any
                # failure here is silently ignored.
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(
                        ".pdf;pdfa", exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: '
                               + doi)
                    self.logger.info(message)
                except:
                    pass
            # Prefer the delivered PDF/A, then the plain PDF; a missing
            # PDF is only reported when no previous PDF/A was kept.
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi,))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                else:
                    if not old_pdf:
                        message = "Record " + doi
                        message += " doesn't contain PDF file."
                        self.logger.warning(message)
                        raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi,)
                register_exception(alert_admin=True, prefix=message)
            version = self.get_elsevier_version(find_package_name(path))
            record_add_field(rec, '583', subfields=[('l', version)])
            xml_path = join(path, 'main.xml')
            record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
            record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                    ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, path=None, no_pdf=False,
               test=False, refextract_callback=None):
    """Convert a record to MARCXML format.

    The record is built field by field from the article XML and then
    serialised with ``record_xml_output``. Which fields are added depends
    on ``self.CONSYN``.

    :param path: path to a record.
    :type path: string
    :param no_pdf: when true, the file-attachment block of the
                   non-CONSYN branch is skipped.
    :type no_pdf: bool
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formatted string
                                of the reference.
    :type refextract_callback: callable

    :returns: marcxml formatted string, or an empty string when the
              serialisation raises UnicodeDecodeError.
    """
    # NOTE(review): this block was recovered from a whitespace-flattened
    # source; confirm the extent of the ``if not test`` and
    # ``if not no_pdf`` blocks against the repository.
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    # Without a publication date, today's date is used instead.
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        record_add_field(rec, '260',
                         subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540', subfields=[('a', license),
                                                ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))
    authors = self.get_authors(xml_doc)
    # The first author goes to field 100, every other author to 700.
    first_author = True
    for author in authors:
        author_name = (author['surname'],
                       author.get('given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword),
                                        ('9', 'author')])
        # Only the normalised journal name is kept here; the second value
        # returned by fix_journal_name is discarded.
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        # The page count is only added when both page values parse as
        # integers.
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:  # do nothing
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' %
                              (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license:
                # The last four characters of the file name are dropped
                # (presumably the ".xml" extension -- confirm).
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                      + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT', subfields=[('a', path),
                                                        ('t', 'INSPIRE-PUBLIC'),
                                                        ('d', 'Fulltext')])
            else:
                record_add_field(rec, 'FFT', subfields=[('a', path),
                                                        ('t', 'Elsevier'),
                                                        ('o', 'HIDDEN')])
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'Citeable')])
        record_add_field(rec, '980', subfields=[('a', 'Published')])
        self._add_references(xml_doc, rec, refextract_callback)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec,
                         '540',
                         subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        # Drop subfields whose value is empty or a lone dash.
        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('n', issue),
                                                            ('c', pages),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # NOTE(review): the query ends with an unbalanced double
            # quote after DELETED -- confirm whether it is intended.
            query = '0247_a:"%s" AND NOT 980:DELETED"' % (doi, )
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                # Re-attach the PDF/A of the first matching record; any
                # failure here is silently ignored.
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(".pdf;pdfa",
                                                 exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: ' +
                               doi)
                    self.logger.info(message)
                except:
                    pass
            # Prefer the delivered PDF/A, then the plain PDF; a missing
            # PDF is only reported when no previous PDF/A was kept.
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s' %
                                      (doi, ))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                else:
                    if not old_pdf:
                        message = "Record " + doi
                        message += " doesn't contain PDF file."
                        self.logger.warning(message)
                        raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi, )
                register_exception(alert_admin=True, prefix=message)
            version = self.get_elsevier_version(find_package_name(path))
            record_add_field(rec, '583', subfields=[('l', version)])
            xml_path = join(path, 'main.xml')
            record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
            record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                    ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def test_fix_journal_name(self):
    """Check the pair fix_journal_name returns for several raw names."""
    # Maps each raw journal string to the tuple expected back.
    raw_names = ["A&A", "A&A B", "A&A.B", "A&AB."]
    expected_pairs = [
        ('Astron.Astrophys.', ""),
        ('Astron.Astrophys.', "B"),
        ('Astron.Astrophys.', "B"),
        ("A&AB.", ""),
    ]
    for raw_name, expected in zip(raw_names, expected_pairs):
        outcome = fix_journal_name(raw_name, journal_mappings)
        self.assertEqual(outcome, expected)
def _get_reference(self, ref):
    """Retrieve the data for a reference.

    Yields one 16-tuple per ``mixed-citation`` element found in ``ref``:
    ``(ref_type, doi, authors, collaboration, journal, volume, page,
    year, label, arxiv, publisher, institution, unstructured_text,
    external_link, report_no, editors)``.

    ``unstructured_text`` is a space-joined string when loose text was
    collected and an empty list otherwise.
    """
    # NOTE(review): nesting was recovered from a whitespace-flattened
    # source; confirm against the repository.
    label = get_value_in_tag(ref, 'label')
    # Keep only the digits of the label.
    label = re.sub('\D', '', label)
    for innerref in ref.getElementsByTagName('mixed-citation'):
        ref_type = innerref.getAttribute('publication-type')
        institution = get_value_in_tag(innerref, 'institution')
        # pub-id elements carry the report number, the DOI and the arXiv
        # id, told apart by their pub-id-type attribute.
        report_no = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'other':
                if tag.hasChildNodes():
                    report_no = get_all_text(tag)
        doi = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'doi':
                doi = xml_to_text(tag)
        collaboration = get_value_in_tag(innerref, 'collab')
        authors = []
        person_groups = innerref.getElementsByTagName('person-group')
        for author_group in person_groups:
            if author_group.getAttribute('person-group-type') == 'author':
                for author in author_group.getElementsByTagName(
                        'string-name'):
                    if author.hasChildNodes():
                        authors.append(get_all_text(author))
        editors = []
        for editor_group in person_groups:
            if editor_group.getAttribute('person-group-type') == 'editor':
                for editor in editor_group.getElementsByTagName(
                        'string-name'):
                    if editor.hasChildNodes():
                        editors.append(get_all_text(editor))
        journal = get_value_in_tag(innerref, 'source')
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        volume += get_value_in_tag(innerref, 'volume')
        if journal == 'J.High Energy Phys.' or journal == 'JHEP':
            issue = get_value_in_tag(innerref, 'issue')
            # NOTE(review): the first two characters of the volume are
            # dropped before the issue is appended -- presumably to
            # shorten a four-digit year; confirm.
            volume = volume[2:] + issue
            journal = 'JHEP'
        page = get_value_in_tag(innerref, 'page-range')
        year = get_value_in_tag(innerref, 'year')
        external_link = get_value_in_tag(innerref, 'ext-link')
        arxiv = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            if tag.getAttribute('pub-id-type') == 'arxiv':
                if tag.hasChildNodes():
                    arxiv = get_all_text(tag)
        arxiv = format_arxiv_id(arxiv)
        publisher = get_value_in_tag(innerref, 'publisher-name')
        publisher_location = get_value_in_tag(innerref, 'publisher-loc')
        if publisher_location:
            publisher = publisher_location + ': ' + publisher
        # Collect the loose text nodes that sit between the tagged
        # elements of the citation.
        unstructured_text = []
        for child in innerref.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text = child.nodeValue.strip()
                # Remove brackets, parentheses, dots and semicolons.
                text = re.sub(r'[\[\]\(\.;\)]', '', text).strip()
                if text.startswith(','):
                    text = text[1:].strip()
                # Several fragments are completed with the institution,
                # which is then cleared so it is not reported twice.
                if text.endswith('Report No'):
                    text = institution + " " + text
                    institution = ''
                    text = text.strip()
                elif text.endswith(' ed'):
                    text += '.'
                elif text.endswith('PhD thesis,'):
                    if institution:
                        text += ' ' + institution
                        institution = ''
                    else:
                        # No institution follows: drop the trailing comma.
                        text = text[:-1]
                elif text.startswith('Seminar,'):
                    article_title = get_value_in_tag(
                        innerref, 'article-title')
                    text = institution + " Seminar, \"" + article_title + "\""
                    institution = ''
                elif text == u'\u201d':
                    # A lone right double quotation mark is discarded.
                    text = ''
                ignore_text = ['in', 'pp', 'edited by']
                if text.startswith('Vol'):
                    # Digits of a "Vol ..." fragment extend the volume.
                    temp = re.sub(r'\D', '', text)
                    if temp:
                        volume += temp
                elif len(text) > 1 and text not in ignore_text\
                        and not (text.isdigit() or text[:-1].isdigit()):
                    # Skip one-character fragments, the ignored words and
                    # fragments that are a number, optionally followed by
                    # a single extra character.
                    unstructured_text.append(text)
        if unstructured_text:
            unstructured_text = " ".join(unstructured_text)
        if ref_type == 'book':
            if volume and not volume.lower().startswith('vol'):
                volume = 'Vol ' + volume
            if volume and page:
                volume = volume + ', pp ' + page
        yield ref_type, doi, authors, collaboration, journal, volume, page, year,\
            label, arxiv, publisher, institution, unstructured_text, external_link,\
            report_no, editors
def get_record(self, path=None, no_pdf=False):
    """Convert the article found at ``path`` into a serialised record.

    The record is built field by field from the article XML and then
    serialised with ``record_xml_output``. Which fields are added depends
    on ``self.CONSYN``.

    :param path: path to the article (CONSYN) or to the directory holding
        ``main.xml`` and the PDF files (non-CONSYN).
    :param no_pdf: when true, the file-attachment block of the
        non-CONSYN branch is skipped.
    :returns: output of ``record_xml_output``, or an empty string when it
        raises UnicodeDecodeError.
    """
    # NOTE(review): this block was recovered from a whitespace-flattened
    # source; confirm the extent of the ``if not no_pdf`` block.
    xml_doc = self.get_article(path)
    rec = {}
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    # Without a publication date, today's date is used instead.
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date)])
    else:
        record_add_field(
            rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    self.logger.info("Creating record: %s %s" % (path, doi))

    # The first author goes to field 100, every other author to 700.
    first_author = True
    for author in self.get_authors(xml_doc):
        author_name = (author['surname'],
                       author.get('given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)

    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    # Test the extracted value: the previous ``if copyright:`` looked at
    # the ``copyright`` builtin and was therefore always true.
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])

    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        if keywords:
            for keyword in keywords:
                record_add_field(
                    rec, '653', ind1='1',
                    subfields=[('a', keyword), ('9', 'author')])
        # Only the normalised journal name is kept here.
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if issue:
            subfields.append(('n', issue))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540',
                         subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(
                    rec, '653', ind1='1',
                    subfields=[('a', keyword), ('9', 'author')])
        record_add_field(rec, '773',
                         subfields=[('p', journal),
                                    ('v', volume),
                                    ('n', issue),
                                    ('c', '%s-%s' % (first_page, last_page)),
                                    ('y', year)])

    self._add_references(xml_doc, rec)

    if self.CONSYN:
        record_add_field(rec, 'FFT', subfields=[('a', path),
                                                ('t', 'Elsevier'),
                                                ('o', 'HIDDEN')])
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'Citeable')])
        record_add_field(rec, '980', subfields=[('a', 'Published')])
        if doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
    else:
        if not no_pdf:
            from invenio.search_engine import search_pattern
            # NOTE(review): the query ends with an unbalanced double
            # quote after DELETED -- confirm whether it is intended.
            query = '0247_a:"%s" AND NOT 980:DELETED"' % (doi,)
            prev_version = search_pattern(p=query)
            from invenio.bibdocfile import BibRecDocs
            old_pdf = False
            if prev_version:
                prev_rec = BibRecDocs(prev_version[0])
                # Re-attach the PDF/A of the first matching record; this
                # is best effort, so failures are ignored.
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(
                        ".pdf;pdfa", exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: '
                               + doi)
                    self.logger.info(message)
                except Exception:
                    pass
            # ``path`` must keep pointing at the package directory: it
            # is still needed below for the version lookup and main.xml,
            # so the PDF location gets its own variable.
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi,))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                elif not old_pdf:
                    message = "Record " + doi
                    message += " doesn't contain PDF file."
                    self.logger.warning(message)
                    raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi,)
                register_exception(alert_admin=True, prefix=message)
            version = self.get_elsevier_version(find_package_name(path))
            record_add_field(rec, '583', subfields=[('l', version)])
            xml_path = join(path, 'main.xml')
            record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
            record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                    ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_publication_information(self, xml_doc):
    """Collect the publication metadata of an article.

    When ``self.CONSYN`` is true the values are read from the ``prism:*``
    and ``ce:*`` elements of ``xml_doc``; if no ``prism:volume`` is present
    the volume (and page range) is looked up on the DOI landing page on a
    best-effort basis.  Otherwise the values are taken from ``self._dois``,
    keyed by the DOI returned by ``self._get_doi(xml_doc)``.

    :param xml_doc: parsed XML document of the article.

    :returns: tuple ``(journal, issn, volume, issue, first_page, last_page,
              year, start_date, doi)``; in the non-CONSYN case the first
              eight items are empty strings when the DOI is not in
              ``self._dois``.
    """
    if self.CONSYN:
        publication = get_value_in_tag(xml_doc, "prism:publicationName")
        doi = get_value_in_tag(xml_doc, "prism:doi")
        issn = get_value_in_tag(xml_doc, "prism:issn")
        issue = get_value_in_tag(xml_doc, "prism:number")
        first_page = get_value_in_tag(xml_doc, "prism:startingPage")
        last_page = get_value_in_tag(xml_doc, "prism:endingPage")
        journal = publication.split(",")[0]
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        try:
            # The text after the first comma of the publication name may
            # carry the volume letter, optionally prefixed by "Section".
            vol = publication.split(",")[1].strip()
            if vol.startswith("Section"):
                vol = vol[7:].strip()
            if vol and not volume:
                volume = vol
        except IndexError:
            pass
        vol = get_value_in_tag(xml_doc, "prism:volume")
        # Compare by value: `is ""` only worked through string interning.
        if vol == "":
            # if volume is not present try to harvest it
            try:
                session = requests.session()
                r = session.get("http://dx.doi.org/" + doi)
                parsed_html = BeautifulSoup(r.text)
                info = parsed_html.body.find(
                    'p', attrs={'class': 'volIssue'}).text.split()
                for s in info:
                    if unicode(s).find(u'\xe2') > 0:
                        first_page = s.rsplit(u'\xe2')[0]
                        last_page = s.rsplit(u'\x93')[1]
                if info[1].lower() != 'online':
                    vol = info[1][:-1]
            except Exception:
                # Best-effort lookup: any failure leaves the volume empty,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                pass
        if vol:
            volume += vol
        year = xml_doc.getElementsByTagName(
            'ce:copyright')[0].getAttribute("year")
        year = year.encode('utf-8')
        start_date = get_value_in_tag(xml_doc, "prism:coverDate")
        if len(xml_doc.getElementsByTagName('ce:date-accepted')) > 0:
            # The acceptance date, when present, wins over the cover date.
            full_date = xml_doc.getElementsByTagName('ce:date-accepted')[0]
            y = full_date.getAttribute('year').encode('utf-8')
            m = full_date.getAttribute('month').encode('utf-8').zfill(2)
            d = full_date.getAttribute('day').encode('utf-8').zfill(2)
            start_date = "%s-%s-%s" % (y, m, d)
        elif len(start_date) == 8:
            start_date = time.strftime(
                '%Y-%m-%d', time.strptime(start_date, '%Y%m%d'))
        elif len(start_date) == 6:
            start_date = time.strftime(
                '%Y-%m', time.strptime(start_date, '%Y%m'))
        doi = get_value_in_tag(xml_doc, "ce:doi")
        return (journal, issn, volume, issue, first_page, last_page,
                year, start_date, doi)
    else:
        doi = self._get_doi(xml_doc)
        try:
            return self._dois[doi] + (doi, )
        except KeyError:
            return ('', '', '', '', '', '', '', '', doi)
def get_record_rich(self, filename, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_rich directory

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callable that is given the plain
        text of a reference and returns an XML string whose first
        ``datafield`` element holds the ``subfield`` elements to copy into
        the reference; when omitted the raw text is stored in subfield 'm'.

    :returns: a string with the marc xml version of the file; an empty
        string if an ArticleID is not of type 'Article' or if serializing
        the record raises UnicodeDecodeError.
    """
    self.document = parse(filename)
    rec = create_record()
    # Default so the 773 'y' subfield is skipped, instead of the function
    # failing on an unbound local, when no element below provides a year.
    year = ''
    articles = self.document.getElementsByTagName('ArticleID')
    for article in articles:
        article_type = article.getAttribute('Type')
        if not article_type == 'Article':
            return ''
        doi = get_value_in_tag(self.document, 'DOI')
        date = ''
        for tag in self.document.getElementsByTagName('Accepted'):
            year = get_value_in_tag(tag, 'Year')
            month = get_value_in_tag(tag, 'Month').zfill(2)
            day = get_value_in_tag(tag, 'Day').zfill(2)
            date = "%s-%s-%s" % (year, month, day)
        if not date:
            # No acceptance date: fall back to the online date.
            for tag in self.document.getElementsByTagName('OnlineDate'):
                year = get_value_in_tag(tag, 'Year')
                month = get_value_in_tag(tag, 'Month').zfill(2)
                day = get_value_in_tag(tag, 'Day').zfill(2)
                date = "%s-%s-%s" % (year, month, day)
        first_page = get_value_in_tag(article, 'FirstPage')
        last_page = get_value_in_tag(article, 'LastPage')
        subjects = article.getElementsByTagName('Keyword')
        subject = ', '.join(map(xml_to_text, subjects))
        copyright_statement = get_value_in_tag(article, 'Copyright')
    journal = get_value_in_tag(self.document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    for issue in self.document.getElementsByTagName('IssueID'):
        volume += get_value_in_tag(issue, 'Volume')
        # The issue year replaces any year taken from the dates above.
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(self.document, 'Title')
    # Affiliation text keyed by the 'ID' attribute of each Affiliation.
    affiliations = dict(
        (aff.getAttribute('ID'),
         get_value_in_tag(aff, 'UnstructuredAffiliation'))
        for aff in self.document.getElementsByTagName('Affiliation'))

    def author_pair(a):
        """Return (formatted name, affiliation text or '') for an Author."""
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                'AffiliationID')[0].getAttribute('Label')
            affiliation = affiliations[affid]
        except (IndexError, KeyError):
            # No AffiliationID element, or its label is unknown.
            affiliation = ''
        return name, affiliation

    authors = [author_pair(a)
               for a in self.document.getElementsByTagName('Author')]
    abstract = get_value_in_tag(self.document, 'Abstract')
    references = self.document.getElementsByTagName('Bibliomixed')
    for reference in references:
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(
                    ('s', '%s,%s,%s' % (ref_journal, ref_volume, ref_page)))
            # Drop the structured part so only free text is left below.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            ref_xml = ref_extract_callback(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
            for field in fields:
                if field.firstChild is None:
                    # Empty <subfield/>: nothing to copy (previously this
                    # raised AttributeError on ``None.data``).
                    continue
                data = field.firstChild.data
                code = field.getAttribute("code")
                # The misc text is redundant when a Biblioset already
                # supplied structured data for this reference.
                if not (code == 'm' and bibliosets):
                    subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    for position, author in enumerate(authors):
        subfields = [('a', author[0])]
        if author[1]:
            subfields.append(('v', author[1]))
        # First author goes to 100, every other author to 700.
        record_add_field(rec, '100' if position == 0 else '700',
                         subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # NOTE(review): this is last - first, not an inclusive page
            # count (which would be +1) - confirm which one is intended.
            number_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, '300',
                             subfields=[('a', str(number_of_pages))])
        except ValueError:
            # Non-numeric page labels: no 300 field.
            pass
        subfields.append(('c', '%s-%s' % (first_page, last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542', subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'), ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def _get_reference(self, ref):
    """Retrieve the data for a reference.

    Generator: for every ``mixed-citation`` element inside ``ref`` it
    yields the tuple ``(ref_type, doi, authors, collaboration, journal,
    volume, page, year, label, arxiv, publisher, institution,
    unstructured_text, external_link, report_no, editors)``.

    ``label`` is the reference label with every non-digit removed.
    ``unstructured_text`` is the space-joined free text of the citation,
    or an empty list when there is none.

    :param ref: DOM element of a single reference.
    """
    # Free-text fragments that carry no information on their own.
    ignore_text = ('in', 'pp', 'edited by')
    label = get_value_in_tag(ref, 'label')
    label = re.sub(r'\D', '', label)
    for innerref in ref.getElementsByTagName('mixed-citation'):
        ref_type = innerref.getAttribute('publication-type')
        institution = get_value_in_tag(innerref, 'institution')
        # One pass over the pub-id elements; for each id type the last
        # matching element wins.
        report_no = ''
        doi = ''
        arxiv = ''
        for tag in innerref.getElementsByTagName('pub-id'):
            id_type = tag.getAttribute('pub-id-type')
            if id_type == 'other':
                if tag.hasChildNodes():
                    report_no = get_all_text(tag)
            elif id_type == 'doi':
                doi = xml_to_text(tag)
            elif id_type == 'arxiv':
                if tag.hasChildNodes():
                    arxiv = get_all_text(tag)
        arxiv = format_arxiv_id(arxiv)
        collaboration = get_value_in_tag(innerref, 'collab')
        # One pass over the person groups, split into authors and editors.
        authors = []
        editors = []
        for group in innerref.getElementsByTagName('person-group'):
            group_type = group.getAttribute('person-group-type')
            if group_type == 'author':
                people = authors
            elif group_type == 'editor':
                people = editors
            else:
                continue
            for person in group.getElementsByTagName('string-name'):
                if person.hasChildNodes():
                    people.append(get_all_text(person))
        journal = get_value_in_tag(innerref, 'source')
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        volume += get_value_in_tag(innerref, 'volume')
        if journal == 'J.High Energy Phys.' or journal == 'JHEP':
            issue = get_value_in_tag(innerref, 'issue')
            # Drops the first two characters of the volume and appends the
            # issue - presumably a four-digit year cut to two digits;
            # TODO confirm.
            volume = volume[2:] + issue
            journal = 'JHEP'
        page = get_value_in_tag(innerref, 'page-range')
        year = get_value_in_tag(innerref, 'year')
        external_link = get_value_in_tag(innerref, 'ext-link')
        publisher = get_value_in_tag(innerref, 'publisher-name')
        publisher_location = get_value_in_tag(innerref, 'publisher-loc')
        if publisher_location:
            publisher = publisher_location + ': ' + publisher
        unstructured_text = []
        for child in innerref.childNodes:
            if child.nodeType != child.TEXT_NODE:
                continue
            text = child.nodeValue.strip()
            # Strip brackets, parentheses, dots and semicolons.
            text = re.sub(r'[\[\]\(\.;\)]', '', text).strip()
            if text.startswith(','):
                text = text[1:].strip()
            if text.endswith('Report No'):
                text = institution + " " + text
                institution = ''
                text = text.strip()
            elif text.endswith(' ed'):
                text += '.'
            elif text.endswith('PhD thesis,'):
                if institution:
                    text += ' ' + institution
                    institution = ''
                else:
                    # No institution follows: drop the trailing comma.
                    text = text[:-1]
            elif text.startswith('Seminar,'):
                article_title = get_value_in_tag(innerref, 'article-title')
                text = institution + " Seminar, \"" + article_title + "\""
                institution = ''
            elif text == u'\u201d':
                text = ''
            if text.startswith('Vol'):
                # "Vol ..." text contributes its digits to the volume.
                temp = re.sub(r'\D', '', text)
                if temp:
                    volume += temp
            elif len(text) > 1 and text not in ignore_text\
                    and not (text.isdigit() or text[:-1].isdigit()):
                unstructured_text.append(text)
        if unstructured_text:
            unstructured_text = " ".join(unstructured_text)
        if ref_type == 'book':
            if volume and not volume.lower().startswith('vol'):
                volume = 'Vol ' + volume
            if volume and page:
                volume = volume + ', pp ' + page
        yield ref_type, doi, authors, collaboration, journal, volume, page, year,\
            label, arxiv, publisher, institution, unstructured_text, external_link,\
            report_no, editors
def get_record_rich(self, filename, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_rich directory

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callable that is given the plain
        text of a reference and returns an XML string whose first
        ``datafield`` element holds the ``subfield`` elements to copy into
        the reference; when omitted the raw text is stored in subfield 'm'.

    :returns: a string with the marc xml version of the file; an empty
        string if an ArticleID is not of type 'Article' or if serializing
        the record raises UnicodeDecodeError.
    """
    self.document = parse(filename)
    rec = create_record()
    # Without this default, ``if year:`` further down hits an unbound
    # local when the file has no Accepted, OnlineDate or IssueID element.
    year = ''
    articles = self.document.getElementsByTagName('ArticleID')
    for article in articles:
        article_type = article.getAttribute('Type')
        if not article_type == 'Article':
            return ''
        doi = get_value_in_tag(self.document, 'DOI')
        date = ''
        for tag in self.document.getElementsByTagName('Accepted'):
            year = get_value_in_tag(tag, 'Year')
            month = get_value_in_tag(tag, 'Month').zfill(2)
            day = get_value_in_tag(tag, 'Day').zfill(2)
            date = "%s-%s-%s" % (year, month, day)
        if not date:
            # Acceptance date missing: use the online date instead.
            for tag in self.document.getElementsByTagName('OnlineDate'):
                year = get_value_in_tag(tag, 'Year')
                month = get_value_in_tag(tag, 'Month').zfill(2)
                day = get_value_in_tag(tag, 'Day').zfill(2)
                date = "%s-%s-%s" % (year, month, day)
        first_page = get_value_in_tag(article, 'FirstPage')
        last_page = get_value_in_tag(article, 'LastPage')
        subjects = article.getElementsByTagName('Keyword')
        subject = ', '.join(map(xml_to_text, subjects))
        copyright_statement = get_value_in_tag(article, 'Copyright')
    journal = get_value_in_tag(self.document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    for issue in self.document.getElementsByTagName('IssueID'):
        volume += get_value_in_tag(issue, 'Volume')
        # An issue year overrides the year read from the dates above.
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(self.document, 'Title')
    # Map each Affiliation 'ID' attribute to its unstructured text.
    affiliations = dict(
        (aff.getAttribute('ID'),
         get_value_in_tag(aff, 'UnstructuredAffiliation'))
        for aff in self.document.getElementsByTagName('Affiliation'))

    def author_pair(a):
        """Return (formatted name, affiliation text or '') for an Author."""
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                'AffiliationID'
            )[0].getAttribute('Label')
            affiliation = affiliations[affid]
        except (IndexError, KeyError):
            # Author without AffiliationID, or label not in the mapping.
            affiliation = ''
        return name, affiliation

    authors = [author_pair(a)
               for a in self.document.getElementsByTagName('Author')]
    abstract = get_value_in_tag(self.document, 'Abstract')
    references = self.document.getElementsByTagName('Bibliomixed')
    for reference in references:
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings
            )
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(('s', '%s,%s,%s' % (ref_journal,
                                                     ref_volume,
                                                     ref_page)))
            # Remove the structured part; the remaining text is handled
            # below.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            ref_xml = ref_extract_callback(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
            for field in fields:
                if field.firstChild is None:
                    # An empty <subfield/> has no text node; skip it
                    # rather than fail on ``None.data``.
                    continue
                data = field.firstChild.data
                code = field.getAttribute("code")
                # Skip the misc text when a Biblioset already gave
                # structured data for this reference.
                if not (code == 'm' and bibliosets):
                    subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    for position, author in enumerate(authors):
        subfields = [('a', author[0])]
        if author[1]:
            subfields.append(('v', author[1]))
        # Tag 100 for the first author, 700 for all following ones.
        record_add_field(rec, '100' if position == 0 else '700',
                         subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal,
                                             volume,
                                             first_page)))
    if first_page and last_page:
        try:
            # NOTE(review): last - first is one less than an inclusive
            # page count - confirm which value field 300 should hold.
            number_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, '300',
                             subfields=[('a', str(number_of_pages))])
        except ValueError:
            # Page labels that are not integers: leave 300 out.
            pass
        subfields.append(('c', '%s-%s' % (first_page,
                                          last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542', subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'), ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def test_fix_journal_name(self):
    """Check the pair returned by fix_journal_name for several inputs."""
    # (input string, expected result) pairs, checked in this order.
    expectations = (
        ("A&A", ('Astron.Astrophys.', "")),
        ("A&A B", ('Astron.Astrophys.', "B")),
        ("A&A.B", ('A&A.', "B")),
        ("A&AB.", ("A&AB.", "")),
    )
    for raw_name, expected in expectations:
        self.assertEqual(fix_journal_name(raw_name, journal_mappings),
                         expected)
def get_record_rich(self, filename):
    """
    Gets the Marc xml of the files in xaml_rich directory

    :param filename: the name of the file to parse.
    :type filename: string

    :returns: a string with the marc xml version of the file; an empty
        string if an ArticleID is not of type "Article" or if serializing
        the record raises UnicodeDecodeError.
    """
    self.document = parse(filename)
    rec = create_record()
    # Default so the 773 "y" subfield is skipped, instead of the function
    # failing on an unbound local, when no element below provides a year.
    year = ""
    articles = self.document.getElementsByTagName("ArticleID")
    for article in articles:
        article_type = article.getAttribute("Type")
        if not article_type == "Article":
            return ""
        doi = get_value_in_tag(self.document, "DOI")
        date = ""
        for tag in self.document.getElementsByTagName("Accepted"):
            year = get_value_in_tag(tag, "Year")
            month = get_value_in_tag(tag, "Month").zfill(2)
            day = get_value_in_tag(tag, "Day").zfill(2)
            date = "%s-%s-%s" % (year, month, day)
        if not date:
            # No acceptance date: fall back to the online date.
            for tag in self.document.getElementsByTagName("OnlineDate"):
                year = get_value_in_tag(tag, "Year")
                month = get_value_in_tag(tag, "Month").zfill(2)
                day = get_value_in_tag(tag, "Day").zfill(2)
                date = "%s-%s-%s" % (year, month, day)
        first_page = get_value_in_tag(article, "FirstPage")
        last_page = get_value_in_tag(article, "LastPage")
        subjects = article.getElementsByTagName("Keyword")
        subject = ", ".join(map(xml_to_text, subjects))
        copyright_statement = get_value_in_tag(article, "Copyright")
    journal = get_value_in_tag(self.document, "JournalTitle")
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    for issue in self.document.getElementsByTagName("IssueID"):
        volume += get_value_in_tag(issue, "Volume")
        # The issue year replaces any year taken from the dates above.
        year = get_value_in_tag(issue, "Year")
    title = get_value_in_tag(self.document, "Title")
    # Affiliation text keyed by the "ID" attribute of each Affiliation.
    affiliations = dict(
        (aff.getAttribute("ID"),
         get_value_in_tag(aff, "UnstructuredAffiliation"))
        for aff in self.document.getElementsByTagName("Affiliation"))

    def author_pair(a):
        """Return (formatted name, affiliation text or "") for an Author."""
        surname = get_value_in_tag(a, "LastName")
        first_name = get_value_in_tag(a, "FirstName")
        middle_name = get_value_in_tag(a, "MiddleName")
        if middle_name:
            name = "%s, %s %s" % (surname, first_name, middle_name)
        else:
            name = "%s, %s" % (surname, first_name)
        try:
            affid = a.getElementsByTagName("AffiliationID")[0].getAttribute("Label")
            affiliation = affiliations[affid]
        except (IndexError, KeyError):
            # No AffiliationID element, or its label is unknown.
            affiliation = ""
        return name, affiliation

    authors = [author_pair(a)
               for a in self.document.getElementsByTagName("Author")]
    abstract = get_value_in_tag(self.document, "Abstract")
    references = self.document.getElementsByTagName("Bibliomixed")
    for reference in references:
        subfields = []
        label = reference.getAttribute("N")
        if label:
            subfields.append(("o", label))
        bibliosets = reference.getElementsByTagName("Biblioset")
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, "Date")
            ref_journal = get_value_in_tag(tag, "JournalShortTitle")
            ref_journal, ref_volume = fix_journal_name(ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, "Volume")
            ref_page = get_value_in_tag(tag, "ArtPageNums")
            if ref_year:
                subfields.append(("y", ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(("s", "%s,%s,%s" % (ref_journal, ref_volume, ref_page)))
            # Drop the structured part so only free text is left below.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        ref_xml = extract_references_from_string_xml(text_ref)
        dom = parseString(ref_xml)
        fields = dom.getElementsByTagName("datafield")[0]
        fields = fields.getElementsByTagName("subfield")
        if fields:
            subfields.append(("9", "refextract"))
        for field in fields:
            if field.firstChild is None:
                # Empty <subfield/>: nothing to copy (previously this
                # raised AttributeError on ``None.data``).
                continue
            data = field.firstChild.data
            code = field.getAttribute("code")
            # The misc text is redundant when a Biblioset already supplied
            # structured data for this reference.
            if not (code == "m" and bibliosets):
                subfields.append((code, data))
        if subfields:
            record_add_field(rec, "999", ind1="C", ind2="5", subfields=subfields)
    if title:
        record_add_field(rec, "245", subfields=[("a", title)])
    if date:
        record_add_field(rec, "260", subfields=[("c", date), ("t", "published")])
    if doi:
        record_add_field(rec, "024", ind1="7", subfields=[("a", doi), ("2", "DOI")])
    if abstract:
        record_add_field(rec, "520", subfields=[("a", abstract), ("9", "EDPSciences")])
    for position, author in enumerate(authors):
        subfields = [("a", author[0])]
        if author[1]:
            subfields.append(("v", author[1]))
        # First author goes to 100, every other author to 700.
        record_add_field(rec, "100" if position == 0 else "700", subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(("s", "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # NOTE(review): this is last - first, not an inclusive page
            # count (which would be +1) - confirm which one is intended.
            number_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, "300", subfields=[("a", str(number_of_pages))])
        except ValueError:
            # Non-numeric page labels: no 300 field.
            pass
        subfields.append(("c", "%s-%s" % (first_page, last_page)))
    if year:
        subfields.append(("y", year))
    record_add_field(rec, "773", subfields=subfields)
    record_add_field(rec, "980", subfields=[("a", "HEP")])
    if copyright_statement:
        record_add_field(rec, "542", subfields=[("f", copyright_statement)])
    if subject:
        record_add_field(rec, "650", ind1="1", ind2="7", subfields=[("2", "EDPSciences"), ("a", subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""