def get_record(self, record):
    """Read a DOM XML element in oaidc format and return the bibrecord object.

    :param record: DOM XML element in oaidc format; it is stored on
        ``self.document`` before the ``_get_*`` helpers are called.

    :returns: the record structure built with ``create_record`` and
        ``record_add_field`` (it is not serialized here).
    """
    self.document = record
    rec = create_record()

    language = self._get_language()
    if language and language != 'en':
        record_add_field(rec, '041', subfields=[('a', language)])

    # Field 260 carries whichever of publisher/date is available.
    publisher = self._get_publisher()
    date = self._get_date()
    imprint = []
    if publisher:
        imprint.append(('b', publisher))
    if date:
        imprint.append(('c', date))
    if imprint:
        record_add_field(rec, '260', subfields=imprint)

    title = self._get_title()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])

    record_copyright = self._get_copyright()
    if record_copyright:
        record_add_field(rec, '540', subfields=[('a', record_copyright)])

    subject = self._get_subject()
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('a', subject), ('2', 'PoS')])

    # The first author goes to 100, every following one to 700.
    for position, author in enumerate(self._get_authors()):
        subfields = [('a', author[0])]
        subfields.extend(('v', affiliation) for affiliation in author[1])
        record_add_field(rec, '100' if position == 0 else '700',
                         subfields=subfields)

    # The third ':'-separated part of the identifier is read as
    # "<conference>/<contribution>"; an identifier of another shape still
    # raises IndexError, as before.
    location = self.get_identifier().split(':')[2].split('/')
    conference = location[0]
    contribution = location[1]
    journal_info = [('p', 'PoS'),
                    ('v', conference.replace(' ', '')),
                    ('c', contribution)]
    # The date is optional (see field 260 above); slicing a missing date
    # used to raise TypeError, so the year is only added when it is known.
    if date:
        journal_info.append(('y', date[:4]))
    record_add_field(rec, '773', subfields=journal_info)

    record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    return rec
def get_pdfa_record(self, path=None):
    """Build the MARCXML that attaches the PDF/A of a package to its record.

    The DOI of the article found at ``path`` is searched among the
    non-deleted records. On a match the output targets that record (001),
    otherwise the DOI itself is emitted (024). The PDF/A (or, failing
    that, the plain PDF) of the package is added as FFT, followed by the
    latest files of the matched record that are not PDF/A.

    :param path: directory of the package containing the article.

    :returns: the result of ``record_xml_output`` for the built record.
    """
    from invenio.search_engine import perform_request_search

    xml_doc = self.get_article(path)
    rec = create_record()
    # Only the DOI, the last of the nine returned values, is used here.
    (_, _, _, _, _, _, _, _, doi) = self.get_publication_information(xml_doc)

    query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi, )
    recid = perform_request_search(p=query)
    if not recid:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
        self.logger.error(
            'Adding PDF/A. No paper with this DOI: %s. Trying to add it anyway.'
            % (doi, ))
    else:
        record_add_field(rec, '001', controlfield_value=recid[0])

    try:
        pdfa_path = join(path, 'main_a-2b.pdf')
        if exists(pdfa_path):
            record_add_field(rec, 'FFT', subfields=[('a', pdfa_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
            self.logger.debug('Adding PDF/A to record: %s' % (doi, ))
        else:
            pdf_path = join(path, 'main.pdf')
            if not exists(pdf_path):
                # Raised and caught right below so that register_exception
                # is called from inside an exception handler.
                raise MissingFFTError(
                    "Record %s doesn't contain PDF file." % (doi, ))
            record_add_field(rec, 'FFT', subfields=[('a', pdf_path)])
            self.logger.debug('No PDF/A in VTEX package for record: ' + doi)
    except MissingFFTError:
        alert = "Elsevier paper: %s is missing PDF." % (doi, )
        register_exception(alert_admin=True, prefix=alert)
        self.logger.warning(alert)

    # Copy the other formats to the bibupload file.
    if recid:
        from invenio.bibdocfile import BibRecDocs
        docs = BibRecDocs(recid[0])
        for bibfile in docs.list_latest_files():
            if bibfile.get_format() == '.pdf;pdfa':
                continue
            record_add_field(rec, 'FFT',
                             subfields=[('a', bibfile.get_full_path()),
                                        ('n', bibfile.get_name()),
                                        ('f', bibfile.get_format())])
    return record_xml_output(rec)
def get_pdfa_record(self, path=None):
    """Return the MARCXML used to upload the PDF/A found under ``path``.

    The article's DOI is looked up among the non-deleted records: a hit
    puts the record id in 001, a miss puts the DOI in 024 and logs it.
    ``main_a-2b.pdf`` is attached as PDF/A when present, else ``main.pdf``
    is attached as is; when neither exists the problem is reported through
    ``register_exception`` and a warning. Finally every latest file of the
    matched record whose format is not ``.pdf;pdfa`` is re-attached.

    :param path: directory of the package containing the article.

    :returns: the result of ``record_xml_output`` for the built record.
    """
    from invenio.search_engine import perform_request_search

    xml_doc = self.get_article(path)
    rec = create_record()
    # get_publication_information returns nine values; the DOI is the last.
    publication_info = self.get_publication_information(xml_doc)
    (_, _, _, _, _, _, _, _, doi) = publication_info

    matches = perform_request_search(
        p='0247_a:"%s" AND NOT 980:"DELETED"' % (doi,))
    if matches:
        record_add_field(rec, '001', controlfield_value=matches[0])
    else:
        doi_field = [('a', doi), ('2', 'DOI')]
        record_add_field(rec, '024', ind1='7', subfields=doi_field)
        not_found = ('Adding PDF/A. No paper with this DOI: '
                     '%s. Trying to add it anyway.') % (doi,)
        self.logger.error(not_found)

    try:
        if exists(join(path, 'main_a-2b.pdf')):
            pdfa_field = [('a', join(path, 'main_a-2b.pdf')),
                          ('n', 'main'),
                          ('f', '.pdf;pdfa')]
            record_add_field(rec, 'FFT', subfields=pdfa_field)
            self.logger.debug('Adding PDF/A to record: %s' % (doi,))
        elif exists(join(path, 'main.pdf')):
            pdf_field = [('a', join(path, 'main.pdf'))]
            record_add_field(rec, 'FFT', subfields=pdf_field)
            self.logger.debug('No PDF/A in VTEX package for record: ' + doi)
        else:
            # The exception is raised only to be handled just below, where
            # register_exception runs inside the handler.
            raise MissingFFTError(
                "Record %s doesn't contain PDF file." % (doi,))
    except MissingFFTError:
        missing = "Elsevier paper: %s is missing PDF." % (doi,)
        register_exception(alert_admin=True, prefix=missing)
        self.logger.warning(missing)

    # Copy the other formats to the bibupload file.
    if matches:
        from invenio.bibdocfile import BibRecDocs
        for bibfile in BibRecDocs(matches[0]).list_latest_files():
            if bibfile.get_format() != '.pdf;pdfa':
                other_format = [('a', bibfile.get_full_path()),
                                ('n', bibfile.get_name()),
                                ('f', bibfile.get_format())]
                record_add_field(rec, 'FFT', subfields=other_format)
    return record_xml_output(rec)
def get_record(self, filename, ref_extract_callback=None):
    """Get the MARCXML of the files in xaml_jp directory.

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: callback meant to extract unstructured
        references; it is accepted but not used by this method.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file; "" when the
        article type is not handled or when the serialization raises
        UnicodeDecodeError.
    """
    self.document = parse(filename)
    article_type = self._get_article_type()
    handled_types = (
        'research-article', 'corrected-article', 'original-article',
        'introduction', 'letter', 'correction', 'addendum',
        'review-article', 'rapid-communications',
    )
    if article_type not in handled_types:
        return ""

    rec = create_record()

    title, subtitle, notes = self._get_title()
    title_fields = []
    if subtitle:
        title_fields.append(('b', subtitle))
    if title:
        # NOTE(review): SOURCE had no indentation; 245 is taken to be added
        # only when a title exists -- confirm against the upstream file.
        title_fields.append(('a', fix_title_capitalization(title)))
        record_add_field(rec, '245', subfields=title_fields)

    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            record_add_field(rec, '500', subfields=[('a', note)])

    for keyword in self._get_keywords():
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', keyword), ('9', 'author')])

    (journal, volume, issue, year, date, doi, page,
     fpage, lpage) = self._get_publication_information()
    if date:
        record_add_field(rec, '260',
                         subfields=[('c', date), ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    abstract = self._get_abstract()
    if abstract:
        record_add_field(
            rec, '520',
            subfields=[('a', convert_html_subscripts_to_latex(abstract)),
                       ('9', 'World Scientific')])

    license_name, license_type, license_url = self._get_license()
    license_fields = []
    if license_name:
        license_fields.append(('a', license_name))
    if license_url:
        license_fields.append(('u', license_url))
    if license_fields:
        record_add_field(rec, '540', subfields=license_fields)
    if license_type == 'open-access':
        self._attach_fulltext(rec, doi)

    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, '300', subfields=[('a', number_of_pages)])

    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542', subfields=[('d', c_holder),
                                                ('g', c_year),
                                                ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542', subfields=[('f', c_statement),
                                                ('e', 'Article')])

    # Publication info (773), in the order p, n, v, c, y, m.
    journal_fields = []
    if journal:
        journal_fields.append(('p', journal))
    if issue:
        journal_fields.append(('n', issue))
    if volume:
        journal_fields.append(('v', volume))
    if fpage and lpage:
        journal_fields.append(('c', '%s-%s' % (fpage, lpage)))
    elif page:
        journal_fields.append(('c', page))
    if year:
        journal_fields.append(('y', year))
    material = {'correction': 'Erratum',
                'addendum': 'Addendum'}.get(article_type)
    if material:
        journal_fields.append(('m', material))
    record_add_field(rec, '773', subfields=journal_fields)

    for collection in self.get_collection(journal):
        record_add_field(rec, '980', subfields=[collection])

    self._add_authors(rec)

    if article_type in ('correction', 'addendum'):
        related_article = self._get_related_article()
        if related_article:
            record_add_field(rec, '024', ind1='7',
                             subfields=[('a', related_article),
                                        ('2', 'DOI')])

    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        sys.stderr.write(
            "Found a bad char in the file for the article " + doi)
        return ""
def get_record(self, path=None, no_pdf=False, test=False,
               refextract_callback=None):
    """Convert a record to MARCXML format.

    :param path: path to a record.
    :type path: string
    :param no_pdf: when true, the lookup and attachment of the package
        files is skipped (only read when ``self.CONSYN`` is false).
    :type no_pdf: bool
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
        unstructured references. It should return a marcxml formated
        string of the reference.
    :type refextract_callback: callable

    :returns: marcxml formated string ("" when the serialization raises
        UnicodeDecodeError).
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        # No publication date in the package: fall back to today's date.
        record_add_field(rec, '260',
                         subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540', subfields=[('a', license),
                                                ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))

    # The first author goes to 100, every following one to 700.
    first_author = True
    for author in self.get_authors(xml_doc):
        author_name = (author['surname'],
                       author.get('given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            # NOTE(review): SOURCE had no indentation; the nations lookup
            # is taken to apply only to authors with affiliations.
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)

    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)

    # NOTE(review): the nesting below was rebuilt from a source without
    # indentation. The trailing ``else`` is paired with ``if self.CONSYN``
    # because both halves emit their own keywords and 773 field; confirm
    # the placement of the 980/583/FFT statements against upstream.
    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])

        # We add subjects also as author keywords
        for subject in xml_doc.getElementsByTagName('dct:subject'):
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])

        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:
            # Non-numeric page numbers: no page count can be derived.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)

        if not test:
            if license:
                # The last path component minus its 4-character extension
                # is used as the PII in the ScienceDirect URL.
                url = ('http://www.sciencedirect.com/science/article/pii/'
                       + path.split('/')[-1][:-4])
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                record_add_field(rec, 'FFT', subfields=[('a', path),
                                                        ('t', 'Elsevier'),
                                                        ('o', 'HIDDEN')])
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'Citeable')])
        record_add_field(rec, '980', subfields=[('a', 'Published')])
        self._add_references(xml_doc, rec, refextract_callback)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540',
                         subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])

        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page

        # Keep only the 773 subfields that carry a real value.
        candidates = [('p', journal), ('v', volume), ('n', issue),
                      ('c', pages), ('y', year)]
        subfields = [field for field in candidates
                     if field[1] and field[1] != '-']
        record_add_field(rec, '773', subfields=subfields)

        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # The closing quote was previously unbalanced (980:DELETED");
            # the value is now quoted as in get_pdfa_record.
            query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi,)
            prev_version = perform_request_search(p=query)

            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(".pdf;pdfa",
                                                 exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.info(
                        'Leaving previously delivered PDF/A for: %s'
                        % (doi,))
                except Exception as err:
                    # Best effort: the previous record may have no PDF/A.
                    # Still not fatal, but no longer silent and no longer
                    # catching KeyboardInterrupt/SystemExit.
                    self.logger.debug(
                        'No previously delivered PDF/A reused for %s: %s'
                        % (doi, err))
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi,))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                elif not old_pdf:
                    message = ("Record %s doesn't contain PDF file."
                               % (doi,))
                    self.logger.warning(message)
                    # Raised and caught right below so that
                    # register_exception runs inside a handler.
                    raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi,)
                register_exception(alert_admin=True, prefix=message)
            # NOTE(review): version and XML attachment are assumed to be
            # part of the ``not no_pdf`` branch -- confirm.
            version = self.get_elsevier_version(find_package_name(path))
            record_add_field(rec, '583', subfields=[('l', version)])
            xml_path = join(path, 'main.xml')
            record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
        # NOTE(review): assumed to be added whatever ``no_pdf`` says.
        record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        # %-formatting keeps the handler from failing on a missing DOI.
        message = ("Found a bad char in the file for the article %s"
                   % (doi,))
        sys.stderr.write(message)
        return ""
def get_record(self, filename, ref_extract_callback=None):
    """Parse one file of the xaml_jp directory and return its MARCXML.

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: callback meant to extract unstructured
        references; this method accepts it but does not call it.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file, or "" for
        an article type that is not handled or when the serialization
        raises UnicodeDecodeError.
    """
    self.document = parse(filename)
    article_type = self._get_article_type()
    if article_type not in ('research-article', 'corrected-article',
                            'original-article', 'introduction', 'letter',
                            'correction', 'addendum', 'review-article',
                            'rapid-communications'):
        return ""

    rec = create_record()

    def add(tag, fields, **indicators):
        """Append one field with the given subfields to the record."""
        record_add_field(rec, tag, subfields=fields, **indicators)

    title, subtitle, notes = self._get_title()
    title_parts = [('b', subtitle)] if subtitle else []
    if title:
        # NOTE(review): SOURCE had no indentation; 245 is taken to be added
        # only when a title exists -- confirm against the upstream file.
        title = fix_title_capitalization(title)
        title_parts.append(('a', title))
        add('245', title_parts)

    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            add('500', [('a', note)])

    keywords = self._get_keywords()
    for keyword in keywords:
        add('653', [('a', keyword), ('9', 'author')], ind1='1')

    publication = self._get_publication_information()
    (journal, volume, issue, year, date, doi, page, fpage,
     lpage) = publication
    if date:
        add('260', [('c', date), ('t', 'published')])
    if doi:
        add('024', [('a', doi), ('2', 'DOI')], ind1='7')

    abstract = self._get_abstract()
    if abstract:
        abstract = convert_html_subscripts_to_latex(abstract)
        add('520', [('a', abstract), ('9', 'World Scientific')])

    license_text, license_type, license_url = self._get_license()
    license_parts = []
    if license_text:
        license_parts.append(('a', license_text))
    if license_url:
        license_parts.append(('u', license_url))
    if license_parts:
        add('540', license_parts)
    if license_type == 'open-access':
        self._attach_fulltext(rec, doi)

    number_of_pages = self._get_page_count()
    if number_of_pages:
        add('300', [('a', number_of_pages)])

    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        add('542', [('d', c_holder), ('g', c_year), ('e', 'Article')])
    elif c_statement:
        add('542', [('f', c_statement), ('e', 'Article')])

    # 773: journal, issue, volume, page range (or single page), year and
    # the erratum/addendum marker, in that order.
    if fpage and lpage:
        page_range = '%s-%s' % (fpage, lpage)
    elif page:
        page_range = page
    else:
        page_range = None
    publication_parts = []
    if journal:
        publication_parts.append(('p', journal))
    if issue:
        publication_parts.append(('n', issue))
    if volume:
        publication_parts.append(('v', volume))
    if page_range is not None:
        publication_parts.append(('c', page_range))
    if year:
        publication_parts.append(('y', year))
    if article_type == 'correction':
        publication_parts.append(('m', 'Erratum'))
    elif article_type == 'addendum':
        publication_parts.append(('m', 'Addendum'))
    add('773', publication_parts)

    for collection in self.get_collection(journal):
        add('980', [collection])

    self._add_authors(rec)

    if article_type in ('correction', 'addendum'):
        related_article = self._get_related_article()
        if related_article:
            add('024', [('a', related_article), ('2', 'DOI')], ind1='7')

    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        warning = "Found a bad char in the file for the article " + doi
        sys.stderr.write(warning)
        return ""
def get_record(self, path=None, no_pdf=False, test=False,
               refextract_callback=None):
    """Convert a record to MARCXML format.

    :param path: path to a record.
    :type path: string
    :param no_pdf: when true, the lookup and attachment of the package
        files is skipped (only read when ``self.CONSYN`` is false).
    :type no_pdf: bool
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
        unstructured references. It should return a marcxml formated
        string of the reference.
    :type refextract_callback: callable

    :returns: marcxml formated string ("" when the serialization raises
        UnicodeDecodeError).
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        # No publication date in the package: fall back to today's date.
        record_add_field(
            rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540', subfields=[('a', license),
                                                ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))

    # The first author goes to 100, every following one to 700.
    authors = self.get_authors(xml_doc)
    first_author = True
    for author in authors:
        author_name = (author['surname'],
                       author.get('given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            # NOTE(review): SOURCE had no indentation; the nations lookup
            # is taken to apply only to authors with affiliations.
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)

    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)

    # NOTE(review): the nesting below was rebuilt from a source without
    # indentation. The trailing ``else`` is paired with ``if self.CONSYN``
    # because both halves emit their own keywords and 773 field; confirm
    # the placement of the 980/583/FFT statements against upstream.
    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])

        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])

        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:
            # Non-numeric page numbers: no page count can be derived.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)

        if not test:
            if license:
                # The last path component minus its 4-character extension
                # is used as the PII in the ScienceDirect URL.
                url = ('http://www.sciencedirect.com/science/article/pii/'
                       + path.split('/')[-1][:-4])
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                record_add_field(rec, 'FFT', subfields=[('a', path),
                                                        ('t', 'Elsevier'),
                                                        ('o', 'HIDDEN')])
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'Citeable')])
        record_add_field(rec, '980', subfields=[('a', 'Published')])
        self._add_references(xml_doc, rec, refextract_callback)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540',
                         subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(
                    rec, '653', ind1='1',
                    subfields=[('a', keyword), ('9', 'author')])

        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page

        # Keep only the 773 subfields that carry a real value.
        candidates = [('p', journal), ('v', volume), ('n', issue),
                      ('c', pages), ('y', year)]
        subfields = [field for field in candidates
                     if field[1] and field[1] != '-']
        record_add_field(rec, '773', subfields=subfields)

        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # The closing quote was previously unbalanced (980:DELETED");
            # the value is now quoted as in get_pdfa_record.
            query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi,)
            prev_version = perform_request_search(p=query)

            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(
                        ".pdf;pdfa", exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.info(
                        'Leaving previously delivered PDF/A for: %s'
                        % (doi,))
                except Exception as err:
                    # Best effort: the previous record may have no PDF/A.
                    # Still not fatal, but no longer silent and no longer
                    # catching KeyboardInterrupt/SystemExit.
                    self.logger.debug(
                        'No previously delivered PDF/A reused for %s: %s'
                        % (doi, err))
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi,))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                elif not old_pdf:
                    message = ("Record %s doesn't contain PDF file."
                               % (doi,))
                    self.logger.warning(message)
                    # Raised and caught right below so that
                    # register_exception runs inside a handler.
                    raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi,)
                register_exception(alert_admin=True, prefix=message)
            # NOTE(review): version and XML attachment are assumed to be
            # part of the ``not no_pdf`` branch -- confirm.
            version = self.get_elsevier_version(find_package_name(path))
            record_add_field(rec, '583', subfields=[('l', version)])
            xml_path = join(path, 'main.xml')
            record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
        # NOTE(review): assumed to be added whatever ``no_pdf`` says.
        record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        # %-formatting keeps the handler from failing on a missing DOI.
        message = ("Found a bad char in the file for the article %s"
                   % (doi,))
        sys.stderr.write(message)
        return ""
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML of the article stored at ``f_path``.

    :param f_path: path of the article XML file; the PDF is looked up in
        ``BodyRef/PDF`` next to it, under the same base name.
    :param publisher: value of subfield ``b`` of the 980 field.
    :param collection: value of subfield ``a`` of the 980 field.
    :param logger: optional logger; nothing is logged when it is None.

    :returns: the result of ``record_xml_output`` for the built record.
    """
    xml = self.get_article(f_path)
    rec = create_record()
    title = self.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])

    publication_date = self.get_publication_date(xml)
    if publication_date:
        record_add_field(rec, '260', subfields=[('c', publication_date)])
    (journal, issn, volume, issue, first_page, pages, year,
     doi) = self.get_publication_information(xml)
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    arxiv_id = self.get_arxiv_id(xml)
    if arxiv_id:
        record_add_field(rec, '037',
                         subfields=[('a', arxiv_id), ('9', 'arXiv')])

    if logger:
        logger.info("Creating record: %s %s" % (f_path, doi))

    # The first author goes to 100, every following one to 700.
    first_author = True
    for author in self.get_authors(xml):
        subfields = [('a', '%s, %s' % (
            author['surname'],
            author.get('given_name') or author.get('initials')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            # NOTE(review): SOURCE had no indentation; the nations lookup
            # is taken to apply only to authors with affiliations.
            if self.extract_nations:
                add_nations_field(subfields)
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)

    abstract = self.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract)])
    record_add_field(rec, '540', subfields=[
        ('a', 'CC-BY-4.0'),
        ('u', 'http://creativecommons.org/licenses/by/4.0/')])
    copyright_statement = self.get_copyright(xml)
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])
    keywords = self.get_keywords(xml)
    if keywords:
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
    record_add_field(rec, "300", subfields=[('a', pages)])

    # Keep only the 773 subfields that carry a real value.
    candidates = [('p', journal), ('v', volume), ('c', first_page),
                  ('y', year)]
    subfields = [field for field in candidates
                 if field[1] and field[1] != '-']
    record_add_field(rec, '773', subfields=subfields)

    references = self.get_references(xml)
    for (label, ref_authors, ref_doi, ref_issue, page, ref_title,
         ref_volume, ref_year) in references:
        subfields = []
        if ref_doi:
            subfields.append(('a', ref_doi))
        for ref_author in ref_authors:
            subfields.append(('h', ref_author))
        if ref_issue:
            subfields.append(('n', ref_issue))
        if label:
            subfields.append(('o', label))
        if page:
            subfields.append(('p', page))
        # The combined reference string is always emitted, as before.
        subfields.append(('s', '%s %s (%s) %s' % (
            ref_title, ref_volume, ref_year, page)))
        if ref_title:
            subfields.append(('t', ref_title))
        if ref_volume:
            subfields.append(('v', ref_volume))
        if ref_year:
            subfields.append(('y', ref_year))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    folder_name = join('/', *(f_path.split('/')[0:-1]))
    # str.rstrip('.xml.scoap') removes a *set of characters*, so it also
    # ate trailing letters of the base name (e.g. "data.xml.scoap" ->
    # "dat"). Remove the extension as a suffix instead.
    # NOTE(review): a bare ".xml" extension is handled too -- confirm.
    base_name = f_path.split('/')[-1]
    for suffix in ('.xml.scoap', '.xml'):
        if base_name.endswith(suffix):
            base_name = base_name[:-len(suffix)]
            break
    pdf_path = join(folder_name, 'BodyRef/PDF', base_name + '.pdf')
    # Replaces a leftover debugging ``print`` statement.
    if logger:
        logger.debug("Looking for PDF file: %s" % (pdf_path,))
    if exists(pdf_path):
        record_add_field(rec, 'FFT', subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
    elif logger:
        logger.error("Record %s doesn't contain PDF file." % (doi,))
    record_add_field(rec, 'FFT', subfields=[('a', self.get_body_ref(xml)),
                                            ('n', 'main')])
    record_add_field(rec, '980', subfields=[('a', collection),
                                            ('b', publisher)])
    return record_xml_output(rec)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML of the NLM article stored at ``f_path``.

    :param f_path: path of the article ``.xml`` file. The PDF is expected
        next to it with a ``.pdf`` extension, and the PDF/A in the
        ``archival_pdfs`` sub-directory with a ``-hires.pdf`` suffix.
    :param publisher: stored with the abstract (520 ``9``) and, when set,
        in subfield ``b`` of the 980 field.
    :param collection: when set, stored in subfield ``a`` of the 980 field.
    :param logger: optional logger; nothing is logged when it is None.

    :returns: the result of ``record_xml_output`` for the built record.
    """
    parent = super(NLMParser, self)
    xml = parent.get_article(f_path)
    rec = create_record()
    title = parent.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(rec, '260', subfields=[
        ('c', parent.get_publication_date(xml, logger))])
    (journal, issn, volume, issue, first_page, last_page, year,
     doi) = parent.get_publication_information(xml)
    journal = "PTEP"  # Let's override the journal information

    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    page_count = parent.get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])

    arxiv = self.get_arxiv_id(xml)
    if arxiv:
        record_add_field(rec, '037',
                         subfields=[('9', 'arXiv'),
                                    ('a', format_arxiv_id(arxiv))])

    # The first author goes to 100, every following one to 700.
    first_author = True
    for author in parent.get_authors(xml):
        if author.get('surname'):
            subfields = [('a', '%s, %s' % (
                author.get('surname'),
                author.get('given_name') or author.get('initials', '')))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            # NOTE(review): SOURCE had no indentation; the nations lookup
            # is taken to apply only to authors with affiliations.
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)

    abstract = parent.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])

    record_add_field(rec, '540', subfields=[
        ('a', 'CC-BY-3.0'),
        ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright_statement = parent.get_copyright(xml, logger)
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])

    keywords = parent.get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    # Oxford is giving us bad keywords (keywords['other']). Better ignore
    # them: they are deliberately not exported as 653 fields.

    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        # No page numbers: use the electronic location id instead.
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")

    # Keep only the 773 subfields that carry a real value.
    candidates = [('p', journal), ('v', volume), ('n', issue),
                  ('c', pages), ('y', year)]
    subfields = [field for field in candidates
                 if field[1] and field[1] != '-']
    record_add_field(rec, '773', subfields=subfields)

    # get_references fills self.references.
    self.get_references(xml)
    for (label, ref_authors, ref_doi, ref_issue, page, page_last,
         ref_title, ref_volume, ref_year, ext_link,
         plain_text) in self.references:
        subfields = []
        if ref_doi:
            subfields.append(('a', ref_doi))
        for ref_author in ref_authors:
            subfields.append(('h', ref_author))
        if ref_issue:
            subfields.append(('n', ref_issue))
        if label:
            subfields.append(('o', label))
        if ref_year:
            subfields.append(('y', ref_year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if ref_title and ref_volume and ref_year and page:
            subfields.append(('s', '%s %s (%s) %s' % (
                ref_title, ref_volume, ref_year, page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s' % (
                ref_title, ref_volume, ref_year, page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')

    if exists(f_path_pdf):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdf), ('n', 'main')])
    else:
        try:
            # Raised only so that register_exception runs inside a handler.
            raise MissingFFTError
        except MissingFFTError:
            register_exception(
                alert_admin=True,
                prefix="Oxford paper: %s is missing PDF." % (doi, ))
        # ``logger`` defaults to None; the unguarded call used to raise
        # AttributeError exactly when a file was missing.
        if logger:
            logger.warning("Record %s doesn't contain PDF file."
                           % (doi, ))

    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT', subfields=[('a', f_path_pdfa),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
    else:
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(
                alert_admin=True,
                prefix="Oxford paper: %s is missing PDF/A." % (doi, ))
        if logger:
            logger.warning("Record %s doesn't contain PDF/A file."
                           % (doi, ))

    # The XML itself is always attached.
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])

    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML record of one PTEP article from its NLM file.

    :param f_path: path of the article's ``.xml`` file; the PDF and PDF/A
                   locations are derived from it.
    :type f_path: string
    :param publisher: value stored in 520__9 and 980__b, if any.
    :param collection: value stored in 980__a, if any.
    :param logger: optional logger; nothing is logged when it is None.

    :returns: the result of ``record_xml_output`` for the built record.
    """
    parser = super(NLMParser, self)
    xml = parser.get_article(f_path)
    rec = create_record()

    title = parser.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c', parser.get_publication_date(xml, logger))])

    (_journal, _issn, volume, issue, first_page, last_page, year,
     doi) = parser.get_publication_information(xml)
    journal = "PTEP"  # Let's override the journal information
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    page_count = parser.get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])

    arxiv = self.get_arxiv_id(xml)
    if arxiv:
        record_add_field(rec, '037',
                         subfields=[('9', 'arXiv'),
                                    ('a', format_arxiv_id(arxiv))])

    # The first author goes to 100, every following one to 700.
    author_tag = '100'
    for author in parser.get_authors(xml):
        if author.get('surname'):
            first_names = (author.get('given_name') or
                           author.get('initials', ''))
            subfields = [('a', '%s, %s' % (author.get('surname'),
                                           first_names))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        record_add_field(rec, author_tag, subfields=subfields)
        author_tag = '700'

    abstract = parser.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])
    record_add_field(
        rec, '540',
        subfields=[('a', 'CC-BY-3.0'),
                   ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright_statement = parser.get_copyright(xml, logger)
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])

    # Only PACS keywords are kept: Oxford is giving us bad free keywords.
    keywords = parser.get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])

    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    candidates = [('p', journal), ('v', volume), ('n', issue),
                  ('c', pages), ('y', year)]
    # Drop empty values and the bare '-' left by two missing page numbers.
    subfields = [pair for pair in candidates
                 if pair[1] and pair[1] != '-']
    record_add_field(rec, '773', subfields=subfields)

    self.get_references(xml)
    # The loop variables carry a ref_ prefix so that they do not overwrite
    # the article's own doi, which is still needed for the reports below.
    for (ref_label, ref_authors, ref_doi, ref_issue, ref_page,
         _ref_page_last, ref_title, ref_volume, ref_year, ref_link,
         ref_text) in self.references:
        subfields = []
        if ref_doi:
            subfields.append(('a', ref_doi))
        for ref_author in ref_authors:
            subfields.append(('h', ref_author))
        if ref_issue:
            subfields.append(('n', ref_issue))
        if ref_label:
            subfields.append(('o', ref_label))
        if ref_year:
            subfields.append(('y', ref_year))
        if ref_link:
            subfields.append(('r', ref_link))
        # should we be strict about it?
        if ref_title and ref_volume and ref_year and ref_page:
            subfields.append(('s', '%s %s (%s) %s' % (
                ref_title, ref_volume, ref_year, ref_page)))
        elif not ref_text:
            subfields.append(('m', '%s %s %s %s' % (
                ref_title, ref_volume, ref_year, ref_page)))
        if ref_text:
            subfields.append(('m', ref_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    def report_missing(prefix, warning):
        # An exception is raised and caught on purpose; presumably
        # register_exception() reports the exception being handled.
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(alert_admin=True, prefix=prefix)
        if logger:
            logger.warning(warning)

    f_path_pdf = f_path[:-len('.xml')] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')
    if exists(f_path_pdf):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdf), ('n', 'main')])
    else:
        report_missing(
            "Oxford paper: %s is missing PDF." % (doi,),
            "Record %s doesn't contain PDF file." % (doi,))
    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdfa), ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    else:
        report_missing(
            "Oxford paper: %s is missing PDF/A." % (doi,),
            "Record %s doesn't contain PDF/A file." % (doi,))
    # The source XML itself is always attached.
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])

    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def get_record(self, xml_file):
    """
    Reads a xml file in JATS format and returns
    a xml string in marc format

    :param xml_file: the file to parse.

    :returns: the marc xml of the article, or an empty string when the
              record cannot be serialised because of a bad character.
    :raises ApsPackageXMLError: when the document has a value in a
                                "meta" tag.
    """
    self.document = parse(xml_file)
    if get_value_in_tag(self.document, "meta"):
        raise ApsPackageXMLError("The XML format of %s is not correct"
                                 % (xml_file, ))
    page_count = self._get_page_count()
    rec = create_record()
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    for pacscode in self._get_pacscodes():
        record_add_field(rec, '084',
                         subfields=[('2', 'PACS'), ('a', pacscode)])
    subject = self._get_subject()
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'APS'), ('a', subject)])
    keywords = self._get_keywords()
    if keywords:
        # All keywords share a single, comma separated 653 field.
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', ', '.join(keywords)),
                                    ('9', 'author')])

    title, subtitle, _ = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        subfields.append(('a', title))
    record_add_field(rec, '245', subfields=subfields)

    (journal, volume, issue, year, start_date, doi,
     article_id, _, _) = self._get_publication_information()
    if start_date:
        record_add_field(rec, '260',
                         subfields=[('c', start_date), ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    abstract = self._get_abstract()
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'APS')])

    # The middle value (license type) is not used here.
    license_text, _, license_url = self._get_license()
    subfields = []
    if license_text:
        subfields.append(('a', license_text))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)

    # Queried once; the result used to be fetched twice in a row.
    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542',
                         subfields=[('d', c_holder), ('g', c_year),
                                    ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542',
                         subfields=[('f', c_statement), ('e', 'Article')])

    record_add_field(rec, '773',
                     subfields=[('p', journal), ('v', volume),
                                ('n', issue), ('y', year),
                                ('c', article_id)])
    for collection in ('HEP', 'Citeable', 'Published'):
        record_add_field(rec, '980', subfields=[('a', collection)])
    self._add_authors(rec)
    self._add_references(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        sys.stderr.write(
            "Found a bad char in the file for the article " + doi)
        return ""
def get_record_rich(self, filename, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_rich directory

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callable that receives the
                                 text of a reference and returns a
                                 marcxml string for it.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file. It is
              empty when an ArticleID element is not of type 'Article'
              or when the record contains a bad character.
    """
    self.document = parse(filename)
    rec = create_record()
    # Without any ArticleID element the page, keyword and copyright
    # lookups fall back to the whole document instead of failing on an
    # unbound name.
    article = self.document
    for article in self.document.getElementsByTagName('ArticleID'):
        if article.getAttribute('Type') != 'Article':
            return ''
    doi = get_value_in_tag(self.document, 'DOI')
    # 'year' is only assigned inside loops; give it a value up front so
    # the 773 field can still be built when none of them runs.
    year = ''
    date = ''
    for tag in self.document.getElementsByTagName('Accepted'):
        year = get_value_in_tag(tag, 'Year')
        month = get_value_in_tag(tag, 'Month').zfill(2)
        day = get_value_in_tag(tag, 'Day').zfill(2)
        date = "%s-%s-%s" % (year, month, day)
    if not date:
        for tag in self.document.getElementsByTagName('OnlineDate'):
            year = get_value_in_tag(tag, 'Year')
            month = get_value_in_tag(tag, 'Month').zfill(2)
            day = get_value_in_tag(tag, 'Day').zfill(2)
            date = "%s-%s-%s" % (year, month, day)
    first_page = get_value_in_tag(article, 'FirstPage')
    last_page = get_value_in_tag(article, 'LastPage')
    subject = ', '.join(
        [xml_to_text(node)
         for node in article.getElementsByTagName('Keyword')])
    copyright_statement = get_value_in_tag(article, 'Copyright')
    journal = get_value_in_tag(self.document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    for issue in self.document.getElementsByTagName('IssueID'):
        volume += get_value_in_tag(issue, 'Volume')
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(self.document, 'Title')

    # Affiliation text by the ID attribute of its element.
    affiliations = dict(
        (node.getAttribute('ID'),
         get_value_in_tag(node, 'UnstructuredAffiliation'))
        for node in self.document.getElementsByTagName('Affiliation'))

    def author_pair(a):
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                'AffiliationID')[0].getAttribute('Label')
            affiliation = affiliations[affid]
        except (IndexError, KeyError):
            # No AffiliationID element, or a label that matches nothing.
            affiliation = ''
        return name, affiliation

    authors = [author_pair(node)
               for node in self.document.getElementsByTagName('Author')]
    abstract = get_value_in_tag(self.document, 'Abstract')

    for reference in self.document.getElementsByTagName('Bibliomixed'):
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(('s', '%s,%s,%s' % (
                    ref_journal, ref_volume, ref_page)))
            # The structured part is dropped so that it is not repeated
            # in the text taken from the reference below.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            ref_xml = ref_extract_callback(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                # Extracted 'm' text is skipped when the reference had
                # Biblioset elements.
                if code == 'm' and bibliosets:
                    continue
                subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260',
                         subfields=[('c', date), ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract),
                                    ('9', 'EDPSciences')])

    # The first author goes to 100, every following one to 700.
    author_tag = '100'
    for name, affiliation in authors:
        subfields = [('a', name)]
        if affiliation:
            subfields.append(('v', affiliation))
        record_add_field(rec, author_tag, subfields=subfields)
        author_tag = '700'

    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # Both ends of the range are pages of the article, hence +1.
            number_of_pages = int(last_page) - int(first_page) + 1
        except ValueError:
            # Non numeric page labels: no page count is recorded.
            pass
        else:
            record_add_field(rec, '300',
                             subfields=[('a', str(number_of_pages))])
        subfields.append(('c', '%s-%s' % (first_page, last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'), ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, fileName, ref_extract_callback=None):
    """
    Converts one file of the xaml_jp directory to Marc xml.

    :param fileName: the name of the file to parse.
    :type fileName: string
    :param ref_extract_callback: callback handed on to the reference
                                 handling for unstructured references.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file. It is
              empty for article types other than 'research-article',
              'introduction' and 'letter', and when the record contains
              a bad character.
    """
    self.document = parse(fileName)
    accepted_types = ('research-article', 'introduction', 'letter')
    if self._get_article_type() not in accepted_types:
        return ''
    rec = create_record()

    title, subtitle, notes = self._get_title()
    # Subtitle first, then title; empty values are left out.
    title_subfields = [pair for pair in (('b', subtitle), ('a', title))
                       if pair[1]]
    record_add_field(rec, '245', subfields=title_subfields)

    subjects = [xml_to_text(node)
                for node in self.document.getElementsByTagName('kwd')]
    for note_id in notes:
        note = self._get_note(note_id)
        if not note:
            continue
        record_add_field(rec, '500', subfields=[('a', note)])
    for subject in subjects:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'), ('a', subject)])
    for keyword in self._get_keywords():
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', keyword), ('9', 'author')])

    (journal, volume, issue, year, date, doi, page,
     fpage, lpage) = self._get_publication_information()
    if journal in ('EAS Publ.Ser.', 'Astron.Astrophys.'):
        # Articles of the astronomy journals get an extra subject.
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'INSPIRE'),
                                    ('a', 'Astrophysics')])
    if date:
        record_add_field(rec, '260',
                         subfields=[('c', date), ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    abstract = self._format_abstract(self._get_abstract())
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract),
                                    ('9', 'EDPSciences')])

    license_text, license_type, license_url = self._get_license()
    license_subfields = [pair for pair in (('a', license_text),
                                           ('u', license_url))
                         if pair[1]]
    if license_subfields:
        record_add_field(rec, '540', subfields=license_subfields)
    if license_type == 'open-access':
        self._attach_fulltext(rec, doi)

    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, '300', subfields=[('a', number_of_pages)])

    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        copyright_subfields = [('d', c_holder), ('g', c_year),
                               ('e', 'Article')]
    elif c_statement:
        copyright_subfields = [('f', c_statement), ('e', 'Article')]
    else:
        copyright_subfields = None
    if copyright_subfields:
        record_add_field(rec, '542', subfields=copyright_subfields)

    # A first/last page pair is preferred over the single page value.
    if fpage and lpage:
        page_range = '%s-%s' % (fpage, lpage)
    else:
        page_range = page
    publication = (('p', journal), ('n', issue), ('v', volume),
                   ('c', page_range), ('y', year))
    record_add_field(rec, '773',
                     subfields=[pair for pair in publication if pair[1]])
    record_add_field(rec, '980', subfields=[('a', 'HEP')])

    # The text of the last 'conference' element is the one that is kept.
    conference = ''
    for tag in self.document.getElementsByTagName('conference'):
        conference = xml_to_text(tag)
    if conference:
        record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
        record_add_field(rec, '500', subfields=[('a', conference)])

    self._add_references(rec, ref_extract_callback)
    self._add_authors(rec)
    try:
        marcxml = record_xml_output(rec)
    except UnicodeDecodeError:
        sys.stderr.write(
            "Found a bad char in the file for the article " + doi)
        return ""
    return marcxml
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML record of one article from its NLM file.

    :param f_path: path of the article's ``_nlm.xml`` file; the PDF is
                   looked for in the ``BodyRef/PDF`` directory next to it.
    :type f_path: string
    :param publisher: value stored in 520__9 and 980__b, if any.
    :param collection: value stored in 980__a, if any.
    :param logger: optional logger; nothing is logged when it is None.

    :returns: the result of ``record_xml_output`` for the built record.
    """
    xml = self.get_article(f_path)
    rec = create_record()
    title = self.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c', self.get_publication_date(xml, logger))])
    (journal, _issn, volume, issue, first_page, last_page, year,
     doi) = self.get_publication_information(xml)
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    # The first author goes to 100, every following one to 700.
    author_tag = '100'
    for author in self.get_authors(xml):
        if author.get('surname'):
            first_names = (author.get('given_name') or
                           author.get('initials', ''))
            subfields = [('a', '%s, %s' % (author.get('surname'),
                                           first_names))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        record_add_field(rec, author_tag, subfields=subfields)
        author_tag = '700'

    page_count = self.get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    abstract = self.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])
    record_add_field(
        rec, '540',
        subfields=[('a', 'CC-BY-3.0'),
                   ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright_statement = self.get_copyright(xml, logger)
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])

    keywords = self.get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    if keywords['other']:
        for keyword in keywords['other']:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])

    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    candidates = [('p', journal), ('v', volume), ('n', issue),
                  ('c', pages), ('y', year)]
    # Drop empty values and the bare '-' left by two missing page numbers.
    subfields = [pair for pair in candidates
                 if pair[1] and pair[1] != '-']
    record_add_field(rec, '773', subfields=subfields)

    self.get_references(xml)
    # The loop variables carry a ref_ prefix so that they do not overwrite
    # the article's own doi, which is still needed for the log below.
    for (ref_label, ref_authors, ref_doi, ref_issue, ref_page,
         _ref_page_last, ref_title, ref_volume, ref_year, ref_link,
         ref_text) in self.references:
        subfields = []
        if ref_doi:
            subfields.append(('a', ref_doi))
        for ref_author in ref_authors:
            subfields.append(('h', ref_author))
        if ref_issue:
            subfields.append(('n', ref_issue))
        if ref_label:
            subfields.append(('o', ref_label))
        if ref_year:
            subfields.append(('y', ref_year))
        if ref_link:
            subfields.append(('r', ref_link))
        # should we be strict about it?
        if ref_title and ref_volume and ref_year and ref_page:
            subfields.append(('s', '%s %s (%s) %s' % (
                ref_title, ref_volume, ref_year, ref_page)))
        elif not ref_text:
            subfields.append(('m', '%s %s %s %s' % (
                ref_title, ref_volume, ref_year, ref_page)))
        if ref_text:
            subfields.append(('m', ref_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    pdf_path = join(dirname(f_path), 'BodyRef', 'PDF',
                    basename(f_path)[:-len('_nlm.xml')] + '.pdf')
    try:
        # Opening the file only proves that it is readable; the handle is
        # closed again right away.
        with open(pdf_path):
            pass
    except (IOError, OSError):
        register_exception(alert_admin=True)
        if logger:
            logger.error("No PDF for paper: %s" % (doi,))
    else:
        record_add_field(rec, 'FFT',
                         subfields=[('a', pdf_path), ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    # The source XML itself is always attached.
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])

    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def get_record_rich(self, filename, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_rich directory

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callable; it is given the text
                                 of each reference and has to return a
                                 marcxml string.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file, or an
              empty string when an ArticleID element is not of type
              'Article' or the record contains a bad character.
    """
    document = parse(filename)
    self.document = document
    rec = create_record()

    def read_date(tag_name):
        # (date, year) of the last `tag_name` element, ('', '') if none.
        found_date = found_year = ''
        for node in document.getElementsByTagName(tag_name):
            found_year = get_value_in_tag(node, 'Year')
            month = get_value_in_tag(node, 'Month').zfill(2)
            day = get_value_in_tag(node, 'Day').zfill(2)
            found_date = "%s-%s-%s" % (found_year, month, day)
        return found_date, found_year

    # When the document has no ArticleID element the lookups that need
    # one use the whole document, rather than an unbound name.
    article = document
    for article in document.getElementsByTagName('ArticleID'):
        if article.getAttribute('Type') != 'Article':
            return ''
    doi = get_value_in_tag(document, 'DOI')

    # The acceptance date is preferred, the online date is the fallback.
    # 'year' stays empty, instead of unbound, when neither is present.
    date, year = read_date('Accepted')
    if not date:
        date, year = read_date('OnlineDate')

    first_page = get_value_in_tag(article, 'FirstPage')
    last_page = get_value_in_tag(article, 'LastPage')
    keyword_nodes = article.getElementsByTagName('Keyword')
    subject = ', '.join([xml_to_text(node) for node in keyword_nodes])
    copyright_statement = get_value_in_tag(article, 'Copyright')
    journal = get_value_in_tag(document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    for issue in document.getElementsByTagName('IssueID'):
        volume += get_value_in_tag(issue, 'Volume')
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(document, 'Title')

    affiliations = {}
    for node in document.getElementsByTagName('Affiliation'):
        affiliations[node.getAttribute('ID')] = get_value_in_tag(
            node, 'UnstructuredAffiliation')

    def author_pair(a):
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        affiliation_ids = a.getElementsByTagName('AffiliationID')
        if not affiliation_ids:
            return name, ''
        # A label that matches no Affiliation gives an empty affiliation.
        label = affiliation_ids[0].getAttribute('Label')
        return name, affiliations.get(label, '')

    authors = [author_pair(node)
               for node in document.getElementsByTagName('Author')]
    abstract = get_value_in_tag(document, 'Abstract')

    for reference in document.getElementsByTagName('Bibliomixed'):
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(
                    ('s', '%s,%s,%s' % (ref_journal, ref_volume,
                                        ref_page)))
            # Removed so the text extracted below does not repeat it.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            dom = parseString(ref_extract_callback(text_ref))
            datafield = dom.getElementsByTagName("datafield")[0]
            fields = datafield.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                # Extracted 'm' text is not kept for references that had
                # Biblioset elements.
                if not (code == 'm' and bibliosets):
                    subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260',
                         subfields=[('c', date), ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract),
                                    ('9', 'EDPSciences')])

    # Tag 100 for the first author, 700 for all the others.
    for position, (name, affiliation) in enumerate(authors):
        subfields = [('a', name)]
        if affiliation:
            subfields.append(('v', affiliation))
        record_add_field(rec, '700' if position else '100',
                         subfields=subfields)

    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # First and last page both belong to the article, hence +1.
            number_of_pages = int(last_page) - int(first_page) + 1
        except ValueError:
            # Page labels that are not numbers: no 300 field is added.
            number_of_pages = None
        if number_of_pages is not None:
            record_add_field(rec, '300',
                             subfields=[('a', str(number_of_pages))])
        subfields.append(('c', '%s-%s' % (first_page, last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'), ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, xml_file):
    """
    Reads a xml file in JATS format and returns
    a xml string in marc format

    :param xml_file: the file to parse.

    :returns: the marc xml string; it is empty when serialising the
              record fails on a bad character.
    :raises ApsPackageXMLError: when a "meta" tag of the document has a
                                value.
    """
    self.document = parse(xml_file)
    if get_value_in_tag(self.document, "meta"):
        raise ApsPackageXMLError("The XML format of %s is not correct"
                                 % (xml_file,))
    page_count = self._get_page_count()
    rec = create_record()

    def add(tag, subfields, **indicators):
        # Shorthand for adding one field to the record being built.
        record_add_field(rec, tag, subfields=subfields, **indicators)

    if page_count:
        add('300', [('a', page_count)])
    for pacscode in self._get_pacscodes():
        add('084', [('2', 'PACS'), ('a', pacscode)])
    subject = self._get_subject()
    if subject:
        add('650', [('2', 'APS'), ('a', subject)], ind1='1', ind2='7')
    keywords = self._get_keywords()
    if keywords:
        # One 653 field holds every keyword, separated by commas.
        add('653', [('a', ', '.join(keywords)), ('9', 'author')], ind1='1')

    title, subtitle, _ = self._get_title()
    # Subtitle first, then title; empty values are left out.
    add('245', [pair for pair in (('b', subtitle), ('a', title))
                if pair[1]])

    (journal, volume, issue, year, start_date, doi,
     article_id, _, _) = self._get_publication_information()
    if start_date:
        add('260', [('c', start_date), ('t', 'published')])
    if doi:
        add('024', [('a', doi), ('2', 'DOI')], ind1='7')
    abstract = self._get_abstract()
    if abstract:
        add('520', [('a', abstract), ('9', 'APS')])

    # Only the license text and its URL are stored; the type is unused.
    license_text, _, license_url = self._get_license()
    license_subfields = [pair for pair in (('a', license_text),
                                           ('u', license_url))
                         if pair[1]]
    if license_subfields:
        add('540', license_subfields)

    # A single call: the copyright used to be looked up twice in a row.
    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        add('542', [('d', c_holder), ('g', c_year), ('e', 'Article')])
    elif c_statement:
        add('542', [('f', c_statement), ('e', 'Article')])

    add('773', [('p', journal), ('v', volume), ('n', issue),
                ('y', year), ('c', article_id)])
    add('980', [('a', 'HEP')])
    add('980', [('a', 'Citeable')])
    add('980', [('a', 'Published')])
    self._add_authors(rec)
    self._add_references(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        sys.stderr.write(
            "Found a bad char in the file for the article " + doi)
        return ""