def _attach_fulltext(self, rec, doi):
    """Attach an INSPIRE-PUBLIC fulltext FFT pointing at the DOI URL."""
    fulltext_url = os.path.join(self.url_prefix, doi)
    fft_subfields = [
        ('a', fulltext_url),
        ('t', 'INSPIRE-PUBLIC'),
        ('d', 'Fulltext'),
    ]
    record_add_field(rec, 'FFT', subfields=fft_subfields)
def test_no_journal_substring(self):
    """Check journal names are mapped as whole titles, not by substring."""
    from harvestingkit.bibrecord import record_add_field, record_get_field_values
    from harvestingkit.inspire_cds_package.from_inspire import Inspire2CDS

    rec = {}
    # Two spellings of the same journal; neither must be mistaken for
    # a longer title that merely contains "J. Phys." as a substring.
    for journal_title in ('J. Phys.', 'J.Phys.'):
        record_add_field(rec, tag='773', subfields=[('p', journal_title)])
    converter = Inspire2CDS(rec)
    converter.update_journals()
    values = record_get_field_values(rec, '773', code='p')
    self.assertNotEqual(values, ['J. Phys.', 'Czechoslov. J. Phys.'])
    self.assertEqual(values, ['J. Phys.', 'J. Phys.'])
def get_pdfa_record(self, path=None):
    """Build MARCXML attaching the PDF/A fulltext found under *path*.

    Looks up the record by DOI in the local Invenio instance.  If it is
    found the FFTs are attached to the existing recid (001); otherwise a
    stub record carrying only the DOI (024) is produced and the FFTs are
    uploaded anyway.

    :param path: directory of the unpacked VTEX package.
    :returns: MARCXML string ready for bibupload.
    """
    from invenio.search_engine import perform_request_search
    xml_doc = self.get_article(path)
    rec = create_record()
    # Only the DOI (last element) is needed here; the rest is discarded.
    dummy, dummy, dummy, dummy, dummy, dummy, dummy,\
        dummy, doi = self.get_publication_information(xml_doc)
    recid = perform_request_search(p='0247_a:"%s" AND NOT 980:"DELETED"' % (doi, ))
    if recid:
        record_add_field(rec, '001', controlfield_value=recid[0])
    else:
        # No matching record: key the upload on the DOI instead.
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
        message = ('Adding PDF/A. No paper with this DOI: '
                   '%s. Trying to add it anyway.') % (doi, )
        self.logger.error(message)
    try:
        # 'main_a-2b.pdf' is the PDF/A variant delivered by VTEX;
        # plain 'main.pdf' is the fallback when no PDF/A exists.
        if exists(join(path, 'main_a-2b.pdf')):
            record_add_field(rec, 'FFT',
                             subfields=[('a', join(path, 'main_a-2b.pdf')),
                                        ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
            self.logger.debug('Adding PDF/A to record: %s' % (doi, ))
        elif exists(join(path, 'main.pdf')):
            record_add_field(rec, 'FFT',
                             subfields=[('a', join(path, 'main.pdf'))])
            message = 'No PDF/A in VTEX package for record: ' + doi
            self.logger.debug(message)
        else:
            message = "Record %s doesn't contain PDF file." % (doi, )
            raise MissingFFTError(message)
    except MissingFFTError:
        # Best-effort: log and alert, but still emit the record.
        message = "Elsevier paper: %s is missing PDF." % (doi, )
        register_exception(alert_admin=True, prefix=message)
        self.logger.warning(message)
    ## copy other formats to bibupload file
    if recid:
        from invenio.bibdocfile import BibRecDocs
        record = BibRecDocs(recid[0])
        for bibfile in record.list_latest_files():
            # Re-attach everything except the PDF/A we just replaced.
            if bibfile.get_format() != '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', bibfile.get_full_path()),
                                            ('n', bibfile.get_name()),
                                            ('f', bibfile.get_format())])
    return record_xml_output(rec)
def get_pdfa_record(self, path=None):
    """Return MARCXML attaching the PDF/A fulltext of the article at *path*."""
    from invenio.search_engine import perform_request_search
    xml_doc = self.get_article(path)
    rec = create_record()
    # Only the DOI (ninth element) of the publication info is used here.
    doi = self.get_publication_information(xml_doc)[8]
    query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi,)
    recid = perform_request_search(p=query)
    if recid:
        record_add_field(rec, '001', controlfield_value=recid[0])
    else:
        # Unknown DOI: build a stub keyed on the DOI and upload anyway.
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
        self.logger.error(('Adding PDF/A. No paper with this DOI: '
                           '%s. Trying to add it anyway.') % (doi,))
    pdfa_path = join(path, 'main_a-2b.pdf')
    plain_pdf_path = join(path, 'main.pdf')
    try:
        if exists(pdfa_path):
            # PDF/A variant shipped by VTEX.
            record_add_field(rec, 'FFT',
                             subfields=[('a', pdfa_path),
                                        ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
            self.logger.debug('Adding PDF/A to record: %s' % (doi,))
        elif exists(plain_pdf_path):
            record_add_field(rec, 'FFT', subfields=[('a', plain_pdf_path)])
            self.logger.debug('No PDF/A in VTEX package for record: ' + doi)
        else:
            raise MissingFFTError(
                "Record %s doesn't contain PDF file." % (doi,))
    except MissingFFTError:
        # Best-effort: alert the admins but still emit the record.
        message = "Elsevier paper: %s is missing PDF." % (doi,)
        register_exception(alert_admin=True, prefix=message)
        self.logger.warning(message)
    ## copy other formats to bibupload file
    if recid:
        from invenio.bibdocfile import BibRecDocs
        for bibfile in BibRecDocs(recid[0]).list_latest_files():
            if bibfile.get_format() != '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', bibfile.get_full_path()),
                                            ('n', bibfile.get_name()),
                                            ('f', bibfile.get_format())])
    return record_xml_output(rec)
def _attach_fulltext(self, rec, doi):
    """Scrape the publisher landing page for PDFs and attach them.

    Resolves the DOI, locates PDF links on the resulting page and, for
    each one, adds an 856 URL field plus an FFT for the downloaded file.
    Returns silently when no recognizable link container is found.

    :param rec: bibrecord structure to update in place.
    :param doi: DOI of the article, used to resolve the landing page.
    """
    import errno  # local import: only needed for the EEXIST check below

    url = 'http://dx.doi.org/' + doi
    page = requests.get(url)
    # url after redirect
    url = page.url
    page = page.text
    parsed_uri = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    page = BeautifulSoup(page)
    try:
        if 'epjconf' in doi:
            # EPJ Web of Conferences keeps its links in the page header.
            div = page.body.find('div', attrs={'id': 'header'})
        else:
            div = page.body.find('div', attrs={
                'class': 'module_background files'
            })
        links = div.findAll('a')
    except AttributeError:
        # No link container on this landing page: nothing to attach.
        return
    for pdf in links:
        if pdf['href'].endswith('pdf'):
            link_to_pdf = domain + pdf['href']
            record_add_field(rec, '856', ind1='4',
                             subfields=[('u', link_to_pdf),
                                        ('y', 'EDP Sciences server')])
            out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, "fulltexts")
            try:
                makedirs(out_folder)
                filename = join(out_folder, link_to_pdf.split('/')[-1])
            except (IOError, OSError) as exc:
                # BUGFIX: an already-existing folder used to be treated
                # as a creation failure, forcing filename to None on
                # every run after the first.  Re-use the folder on
                # EEXIST; fall back to None only on genuine errors.
                if getattr(exc, 'errno', None) == errno.EEXIST:
                    filename = join(out_folder,
                                    link_to_pdf.split('/')[-1])
                else:
                    filename = None
            filename = download_file(from_url=link_to_pdf,
                                     to_filename=filename,
                                     retry_count=5)
            record_add_field(rec, 'FFT',
                             subfields=[('a', filename),
                                        ('t', 'INSPIRE-PUBLIC'),
                                        ('d', 'Fulltext')])
def _attach_fulltext(self, rec, doi):
    """Find PDF links on the article's publisher page and attach them.

    Follows the DOI redirect, parses the landing page for anchors that
    end in "pdf", then records each as an 856 URL field and downloads
    it into an FFT.  Silently returns when the expected link container
    is missing from the page.

    :param rec: bibrecord structure to update in place.
    :param doi: DOI used to resolve the publisher landing page.
    """
    import errno  # local import: needed only for the EEXIST test below

    url = 'http://dx.doi.org/' + doi
    page = requests.get(url)
    # url after redirect
    url = page.url
    page = page.text
    parsed_uri = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    page = BeautifulSoup(page)
    try:
        if 'epjconf' in doi:
            # EPJ Web of Conferences pages carry the links in the header.
            div = page.body.find('div', attrs={'id': 'header'})
        else:
            div = page.body.find(
                'div', attrs={'class': 'module_background files'})
        links = div.findAll('a')
    except AttributeError:
        # Expected container not present: nothing to attach.
        return
    for pdf in links:
        if pdf['href'].endswith('pdf'):
            link_to_pdf = domain + pdf['href']
            record_add_field(rec, '856', ind1='4',
                             subfields=[('u', link_to_pdf),
                                        ('y', 'EDP Sciences server')])
            out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, "fulltexts")
            try:
                makedirs(out_folder)
                filename = join(out_folder, link_to_pdf.split('/')[-1])
            except (IOError, OSError) as exc:
                # BUGFIX: makedirs raises when the folder already
                # exists, which previously discarded the target name on
                # every run after the first.  Keep the name on EEXIST;
                # only fall back to None for real creation problems.
                if getattr(exc, 'errno', None) == errno.EEXIST:
                    filename = join(out_folder,
                                    link_to_pdf.split('/')[-1])
                else:
                    filename = None
            filename = download_file(from_url=link_to_pdf,
                                     to_filename=filename,
                                     retry_count=5)
            record_add_field(rec, 'FFT',
                             subfields=[('a', filename),
                                        ('t', 'INSPIRE-PUBLIC'),
                                        ('d', 'Fulltext')])
def _add_authors(self, rec):
    """Add author fields: MARC 100 for the first author, 700 for the rest.

    Subfields: a = name, v = affiliation text, m = e-mail address.
    """
    authors = self._get_authors()
    affiliations = self._get_affiliations()
    emails_by_note = self._get_author_emails()
    for position, author in enumerate(authors):
        subfields = [('a', author[0])]
        if author[1]:
            # author[1] holds affiliation ids; resolve them to text.
            for aff_id in author[1]:
                subfields.append(('v', affiliations[aff_id]))
        if author[2]:
            # author[2] holds footnote ids mapping to e-mail lists.
            for note_id in author[2]:
                subfields.extend(
                    ('m', email)
                    for email in emails_by_note.get(note_id, [])
                    if email)
        tag = '100' if position == 0 else '700'
        record_add_field(rec, tag, subfields=subfields)
def _add_authors(self, rec):
    """Add author fields: MARC 100 for the first author, 700 for the rest.

    Subfields: a = name, v = resolved affiliation, m = e-mail address.
    """
    authors = self._get_authors()
    affiliations = self._get_affiliations()
    author_emails = self._get_author_emails()
    first_author = True
    for author in authors:
        subfields = [('a', author[0])]
        if author[1]:
            # author[1] holds affiliation ids; resolve them to text.
            for aff in author[1]:
                subfields.append(('v', affiliations[aff]))
        if author[2]:
            for note in author[2]:
                # BUGFIX: a footnote id with no e-mail entry used to
                # raise KeyError here; default to an empty list instead
                # (matching the sibling implementation of this method).
                for email in author_emails.get(note, []):
                    if email:
                        subfields.append(('m', email))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
def _add_references(self, rec, ref_extract_callback=None):
    """Add one 999C5 field per reference from ``_get_references``.

    When *ref_extract_callback* is given, free-text references are run
    through it and the resulting MARC subfields are copied verbatim
    (tagged with 9=refextract); otherwise the raw text goes into 'm'.
    """
    for (label, ref_type, text_ref, ext_link, authors, year,
            source, volume, page) in self._get_references():
        subfields = []
        if label:
            subfields.append(('o', label))
        if text_ref:
            if ref_extract_callback:
                parsed = parseString(ref_extract_callback(text_ref))
                datafield = parsed.getElementsByTagName("datafield")[0]
                extracted = datafield.getElementsByTagName("subfield")
                for subfield in extracted:
                    subfields.append((subfield.getAttribute("code"),
                                      subfield.firstChild.data))
                if extracted:
                    subfields.append(('9', 'refextract'))
            else:
                subfields.append(('m', text_ref))
        if ref_type:
            subfields.append(('d', ref_type))
        if ext_link:
            subfields.append(('u', ext_link))
        subfields.extend(('h', author) for author in authors)
        if year:
            subfields.append(('y', year))
        if source:
            # Publication note: comma-join whichever of source, volume
            # and page are present (source is always first).
            pubnote = ",".join(
                part for part in (source, volume, page) if part)
            subfields.append(('s', pubnote))
        record_add_field(rec, '999', ind1='C', ind2='5',
                         subfields=subfields)
def _add_authors(self, rec):
    """Add 100/700 author fields plus 710 collaboration entries.

    Collaborations are taken from the first author carrying any, with a
    leading "for the" removed and duplicates skipped.
    """
    collaboration_added = False
    for position, author in enumerate(self._get_authors()):
        subfields = [('a', author[0])]
        for aff in (author[1] or []):
            subfields.append(('v', aff))
        for email in (author[2] or []):
            subfields.append(('m', email))
        record_add_field(rec, '100' if position == 0 else '700',
                         subfields=subfields)
        if author[3] and not collaboration_added:
            seen = []
            for collab in author[3]:
                cleaned = collab.replace("for the", "").strip()
                if cleaned not in seen:
                    seen.append(cleaned)
                    record_add_field(rec, '710',
                                     subfields=[("g", cleaned)])
            collaboration_added = True
def get_record(self, path=None, no_pdf=False,
               test=False, refextract_callback=None):
    """Convert a record to MARCXML format.

    :param path: path to a record.
    :type path: string
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: marcxml formated string.
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        # No publication date found: fall back to today.
        record_add_field(rec, '260',
                         subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540',
                         subfields=[('a', license), ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))
    authors = self.get_authors(xml_doc)
    first_author = True
    for author in authors:
        author_name = (author['surname'], author.get('given_name') or
                       author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword),
                                        ('9', 'author')])
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:
            # Non-numeric page numbers: skip the page count.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license:
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                    + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                # No license: keep the fulltext hidden.
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'Elsevier'),
                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540',
                         subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        # Keep only populated subfields (and drop a bare '-').
        subfields = [x for x in [('p', journal), ('v', volume),
                                 ('n', issue), ('c', pages), ('y', year)]
                     if x[1] and x[1] != '-']
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # BUGFIX: the query used to read 980:DELETED" with a
            # mismatched quote; match the correct phrase-quoted form
            # used by get_pdfa_record.
            query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi, )
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(".pdf;pdfa",
                                                 exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: '
                               + doi)
                    self.logger.info(message)
                except Exception:
                    # Best-effort: no previous PDF/A to carry over.
                    pass
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi, ))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                else:
                    if not old_pdf:
                        message = "Record " + doi
                        message += " doesn't contain PDF file."
                        self.logger.warning(message)
                        raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi, )
                register_exception(alert_admin=True, prefix=message)
            version = self.get_elsevier_version(find_package_name(path))
            record_add_field(rec, '583', subfields=[('l', version)])
            xml_path = join(path, 'main.xml')
            record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
            record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                    ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def _add_references(self, rec): """ Adds the reference to the record """ for ref in self.document.getElementsByTagName('ref'): for ref_type, doi, authors, collaboration, journal, volume, page, year,\ label, arxiv, publisher, institution, unstructured_text,\ external_link, report_no, editors in self._get_reference(ref): subfields = [] if doi: subfields.append(('a', doi)) for author in authors: subfields.append(('h', author)) for editor in editors: subfields.append(('e', editor)) if year: subfields.append(('y', year)) if unstructured_text: if page: subfields.append( ('m', unstructured_text + ', ' + page)) else: subfields.append(('m', unstructured_text)) if collaboration: subfields.append(('c', collaboration)) if institution: subfields.append(('m', institution)) if publisher: subfields.append(('p', publisher)) if arxiv: subfields.append(('r', arxiv)) if report_no: subfields.append(('r', report_no)) if external_link: subfields.append(('u', external_link)) if label: subfields.append(('o', label)) if ref_type == 'book': if journal: subfields.append(('t', journal)) if volume: subfields.append(('m', volume)) elif page and not unstructured_text: subfields.append(('m', page)) else: if volume and page: subfields.append( ('s', journal + "," + volume + "," + page)) elif journal: subfields.append(('t', journal)) if ref_type: subfields.append(('d', ref_type)) if not subfields: #misc-type references try: r = ref.getElementsByTagName('mixed-citation')[0] text = xml_to_text(r) label = text.split()[0] text = " ".join(text.split()[1:]) subfields.append(('s', text)) record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields) except IndexError: #references without 'mixed-citation' tag try: r = ref.getElementsByTagName('note')[0] subfields.append(('s', xml_to_text(r))) record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields) except IndexError: #references without 'note' tag subfields.append(('s', xml_to_text(ref))) record_add_field(rec, '999', ind1='C', ind2='5', 
subfields=subfields) else: record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
def get_record(self, fileName, ref_extract_callback=None):
    """ Gets the Marc xml of the files in xaml_jp directory

    :param fileName: the name of the file to parse.
    :type fileName: string
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: a string with the marc xml version of the file.
    """
    self.document = parse(fileName)
    article_type = self._get_article_type()
    # Only convert article types we harvest; everything else is skipped.
    if article_type not in ['research-article',
                            'introduction',
                            'letter']:
        return ''
    rec = create_record()
    title, subtitle, notes = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        subfields.append(('a', title))
    record_add_field(rec, '245', subfields=subfields)
    subjects = self.document.getElementsByTagName('kwd')
    subjects = map(xml_to_text, subjects)
    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            record_add_field(rec, '500', subfields=[('a', note)])
    for subject in subjects:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'),
                                    ('a', subject)])
    keywords = self._get_keywords()
    for keyword in keywords:
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', keyword), ('9', 'author')])
    journal, volume, issue, year, date, doi, page,\
        fpage, lpage = self._get_publication_information()
    # Articles from these journals get an extra Astrophysics subject.
    astronomy_journals = ['EAS Publ.Ser.', 'Astron.Astrophys.']
    if journal in astronomy_journals:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'INSPIRE'),
                                    ('a', 'Astrophysics')])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    abstract = self._get_abstract()
    abstract = self._format_abstract(abstract)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    license, license_type, license_url = self._get_license()
    subfields = []
    if license:
        subfields.append(('a', license))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)
    # Fulltext is only attached for open-access articles.
    if license_type == 'open-access':
        self._attach_fulltext(rec, doi)
    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, '300', subfields=[('a', number_of_pages)])
    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542', subfields=[('d', c_holder),
                                                ('g', c_year),
                                                ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542', subfields=[('f', c_statement),
                                                ('e', 'Article')])
    # 773: journal publication note (p=journal, n=issue, v=volume,
    # c=pages, y=year).
    subfields = []
    if journal:
        subfields.append(('p', journal))
    if issue:
        subfields.append(('n', issue))
    if volume:
        subfields.append(('v', volume))
    if fpage and lpage:
        subfields.append(('c', '%s-%s' % (fpage,
                                          lpage)))
    elif page:
        subfields.append(('c', page))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    conference = ''
    # Keeps the text of the last <conference> element, if any.
    for tag in self.document.getElementsByTagName('conference'):
        conference = xml_to_text(tag)
    if conference:
        record_add_field(rec, '980',
                         subfields=[('a', 'ConferencePaper')])
        record_add_field(rec, '500',
                         subfields=[('a', conference)])
    self._add_references(rec, ref_extract_callback)
    self._add_authors(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, path=None, no_pdf=False,
               test=False, refextract_callback=None):
    """Convert a record to MARCXML format.

    :param path: path to a record.
    :type path: string
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: marcxml formated string.
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        # No publication date found: fall back to today.
        record_add_field(
            rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540',
                         subfields=[('a', license), ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))
    authors = self.get_authors(xml_doc)
    first_author = True
    for author in authors:
        author_name = (author['surname'], author.get(
            'given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542',
                         subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:
            # Non-numeric page numbers: skip the page count.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license:
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                    + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                # No license: keep the fulltext hidden.
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'Elsevier'),
                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540',
                         subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(
                    rec, '653', ind1='1',
                    subfields=[('a', keyword), ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        # Keep only populated subfields (and drop a bare '-').
        subfields = [x for x in [('p', journal), ('v', volume),
                                 ('n', issue), ('c', pages), ('y', year)]
                     if x[1] and x[1] != '-']
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # BUGFIX: the query used to read 980:DELETED" with a
            # mismatched quote; match the correct phrase-quoted form
            # used by get_pdfa_record.
            query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi,)
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(
                        ".pdf;pdfa", exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: '
                               + doi)
                    self.logger.info(message)
                except Exception:
                    # Best-effort: no previous PDF/A to carry over.
                    pass
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi,))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                else:
                    if not old_pdf:
                        message = "Record " + doi
                        message += " doesn't contain PDF file."
                        self.logger.warning(message)
                        raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi,)
                register_exception(alert_admin=True, prefix=message)
            version = self.get_elsevier_version(find_package_name(path))
            record_add_field(rec, '583', subfields=[('l', version)])
            xml_path = join(path, 'main.xml')
            record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
            record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                    ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def _add_references(self, xml_doc, rec, refextract_callback=None):
    """Add one MARC 999C5 field per reference from ``get_references``.

    Unstructured references (free text, no author list) are either run
    through *refextract_callback* — whose subfields are copied over and
    tagged 9=refextract — or stored verbatim in subfield 'm'.
    Structured references are mapped subfield by subfield (a=DOI,
    h=authors, r=link/arXiv, t=title, p=publisher, v=volume, y=year,
    m=misc, e=editors, q=book title, o=label, s=pubnote).
    """
    for label, authors, doi, issue, page, title, volume, year,\
            textref, ext_link, isjournal, comment, journal, publisher,\
            editors, book_title in self.get_references(xml_doc):
        subfields = []
        if textref and not authors:
            # Unstructured reference: only free text is available.
            textref = textref.replace('\"', '\'')
            if refextract_callback:
                ref_xml = refextract_callback(textref)
                dom = xml.dom.minidom.parseString(ref_xml)
                fields = dom.getElementsByTagName("datafield")[0]
                fields = fields.getElementsByTagName("subfield")
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 'r':
                        # Normalize unicode dashes in report numbers.
                        data = fix_dashes(data)
                    subfields.append((code, data))
                if fields:
                    subfields.append(('9', 'refextract'))
            else:
                subfields.append(('m', textref))
            if label:
                # Strip brackets/dots/parens from the citation label.
                label = re.sub("[\[\].)]", "", label)
                subfields.append(('o', label))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)
        else:
            # Structured reference: map each known element.
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if ext_link:
                ext_link = fix_dashes(ext_link)
                subfields.append(('r', ext_link))
            if title:
                subfields.append(('t', title))
            elif textref:
                subfields.append(('m', textref))
            if publisher:
                subfields.append(('p', publisher))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            if comment:
                subfields.append(('m', comment))
            for editor in editors:
                subfields.append(('e', editor))
            if book_title:
                subfields.append(('q', book_title))
            if label:
                label = re.sub("[\[\].)]", "", label)
                subfields.append(('o', label))
            if journal:
                # Normalize the journal name; the mapping may also
                # prepend a volume-letter prefix (e.g. Phys.Lett. B).
                journal, vol = fix_journal_name(journal,
                                                self.journal_mappings)
                volume = vol + volume
                if volume and page:
                    journal = journal + "," + volume + "," + page
                    subfields.append(('s', journal))
                elif volume:
                    journal = journal + "," + volume
                    subfields.append(('s', journal))
                else:
                    subfields.append(('s', journal))
            if textref:
                # NOTE(review): when title is empty, textref was already
                # appended as 'm' above, so it appears twice here —
                # confirm this duplication is intended.
                subfields.append(('m', textref))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)
def get_record_rich(self, filename, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_rich directory

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callable turning an
        unstructured reference string into refextract MARCXML.

    :returns: a string with the marc xml version of the file.
    """
    self.document = parse(filename)
    rec = create_record()
    articles = self.document.getElementsByTagName('ArticleID')
    for article in articles:
        article_type = article.getAttribute('Type')
        if not article_type == 'Article':
            # Anything that is not a regular article is skipped entirely.
            return ''
        doi = get_value_in_tag(self.document, 'DOI')
        date = ''
        # Prefer the acceptance date; fall back to the online date.
        for tag in self.document.getElementsByTagName('Accepted'):
            year = get_value_in_tag(tag, 'Year')
            month = get_value_in_tag(tag, 'Month').zfill(2)
            day = get_value_in_tag(tag, 'Day').zfill(2)
            date = "%s-%s-%s" % (year, month, day)
        if not date:
            for tag in self.document.getElementsByTagName('OnlineDate'):
                year = get_value_in_tag(tag, 'Year')
                month = get_value_in_tag(tag, 'Month').zfill(2)
                day = get_value_in_tag(tag, 'Day').zfill(2)
                date = "%s-%s-%s" % (year, month, day)
        first_page = get_value_in_tag(article, 'FirstPage')
        last_page = get_value_in_tag(article, 'LastPage')
        subjects = article.getElementsByTagName('Keyword')
        subjects = map(xml_to_text, subjects)
        subject = ', '.join(subjects)
        copyright_statement = get_value_in_tag(article, 'Copyright')
    journal = get_value_in_tag(self.document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    issues = self.document.getElementsByTagName('IssueID')
    for issue in issues:
        volume += get_value_in_tag(issue, 'Volume')
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(self.document, 'Title')
    authors = self.document.getElementsByTagName('Author')
    affiliations = self.document.getElementsByTagName('Affiliation')

    def affiliation_pair(a):
        # Map an Affiliation node to (ID attribute, affiliation text).
        return a.getAttribute('ID'), get_value_in_tag(
            a, 'UnstructuredAffiliation'
        )

    affiliations = map(affiliation_pair, affiliations)
    affiliations = dict(affiliations)

    def author_pair(a):
        # Build ("Last, First Middle", affiliation-or-empty) for an
        # Author node, resolving the affiliation via its Label.
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                'AffiliationID'
            )[0].getAttribute('Label')
            affiliation = affiliations[affid]
        except IndexError:
            affiliation = ''
        except KeyError:
            affiliation = ''
        return name, affiliation

    authors = map(author_pair, authors)
    abstract = get_value_in_tag(self.document, 'Abstract')
    references = self.document.getElementsByTagName('Bibliomixed')
    for reference in references:
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings
            )
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(('s', '%s,%s,%s' % (ref_journal,
                                                     ref_volume,
                                                     ref_page)))
            # Remove the structured part so xml_to_text below only sees
            # the remaining free text of the reference.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            ref_xml = ref_extract_callback(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                if code == 'm' and bibliosets:
                    # Structured data already captured above; skip the
                    # duplicate free-text subfield.
                    continue
                else:
                    subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    # First author goes to 100, the rest to 700.
    first_author = True
    for author in authors:
        if first_author:
            subfields = [('a', author[0])]
            if author[1]:
                subfields.append(('v', author[1]))
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            subfields = [('a', author[0])]
            if author[1]:
                subfields.append(('v', author[1]))
            record_add_field(rec, '700', subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # NOTE(review): variable name is misspelled ("nuber") and the
            # count is last - first without +1 — confirm whether the
            # off-by-one is intended before changing it.
            nuber_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, '300',
                             subfields=[('a', str(nuber_of_pages))])
        except ValueError:
            # Non-numeric page labels: skip the page count silently.
            pass
        subfields.append(('c', '%s-%s' % (first_page, last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'),
                                    ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, record):
    """Convert a parsed OAI-DC ``record`` DOM element to a bibrecord.

    Fills language (041), imprint (260), title (245), license (540),
    subject (650), authors (100/700), the PoS journal reference (773)
    and collection tags (980), then returns the bibrecord structure.
    """
    self.document = record
    rec = create_record()

    language = self._get_language()
    if language and language != 'en':
        record_add_field(rec, '041', subfields=[('a', language)])

    publisher = self._get_publisher()
    date = self._get_date()
    imprint = []
    if publisher:
        imprint.append(('b', publisher))
    if date:
        imprint.append(('c', date))
    if imprint:
        record_add_field(rec, '260', subfields=imprint)

    title = self._get_title()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])

    record_copyright = self._get_copyright()
    if record_copyright:
        record_add_field(rec, '540', subfields=[('a', record_copyright)])

    subject = self._get_subject()
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('a', subject), ('2', 'PoS')])

    # First author goes to 100, every following author to 700.
    for position, author in enumerate(self._get_authors()):
        author_subfields = [('a', author[0])]
        for affiliation in author[1]:
            author_subfields.append(('v', affiliation))
        marc_tag = '100' if position == 0 else '700'
        record_add_field(rec, marc_tag, subfields=author_subfields)

    # The identifier presumably looks like
    # "oai:...:CONFERENCE/CONTRIBUTION" — verify against the harvester.
    identifier = self.get_identifier()
    id_tail = identifier.split(':')[2]
    conference = id_tail.split('/')[0]
    contribution = id_tail.split('/')[1]
    record_add_field(rec, '773',
                     subfields=[('p', 'PoS'),
                                ('v', conference.replace(' ', '')),
                                ('c', contribution),
                                ('y', date[:4])])
    record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    return rec
def _add_references(self, rec):
    """Add a MARC 999C5 reference field to ``rec`` for every <ref> in
    the parsed document, using ``self._get_reference`` to split each
    <ref> into its structured parts.
    """
    for ref in self.document.getElementsByTagName('ref'):
        for ref_type, doi, authors, collaboration, journal, volume, page, year,\
                label, arxiv, publisher, institution, unstructured_text,\
                external_link, report_no, editors in self._get_reference(ref):
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            for editor in editors:
                subfields.append(('e', editor))
            if year:
                subfields.append(('y', year))
            if unstructured_text:
                if page:
                    subfields.append(('m', unstructured_text + ', ' + page))
                else:
                    subfields.append(('m', unstructured_text))
            if collaboration:
                subfields.append(('c', collaboration))
            if institution:
                subfields.append(('m', institution))
            if publisher:
                subfields.append(('p', publisher))
            if arxiv:
                subfields.append(('r', arxiv))
            if report_no:
                subfields.append(('r', report_no))
            if external_link:
                subfields.append(('u', external_link))
            if label:
                subfields.append(('o', label))
            if ref_type == 'book':
                # Books: title in $t, volume (or page) as free text in $m.
                if journal:
                    subfields.append(('t', journal))
                if volume:
                    subfields.append(('m', volume))
                elif page and not unstructured_text:
                    subfields.append(('m', page))
            else:
                # Journal articles: "journal,volume,page" in $s when both
                # volume and page are known, otherwise just the title.
                if volume and page:
                    subfields.append(('s', journal + "," + volume
                                      + "," + page))
                elif journal:
                    subfields.append(('t', journal))
            if ref_type:
                subfields.append(('d', ref_type))
            if not subfields:
                # misc-type references: nothing structured was extracted,
                # so fall back to the raw citation text.
                try:
                    r = ref.getElementsByTagName('mixed-citation')[0]
                    text = xml_to_text(r)
                    # First token is treated as the label; the rest is
                    # stored as the citation text.
                    label = text.split()[0]
                    text = " ".join(text.split()[1:])
                    subfields.append(('s', text))
                    record_add_field(rec, '999', ind1='C', ind2='5',
                                     subfields=subfields)
                except IndexError:
                    # references without 'mixed-citation' tag
                    try:
                        r = ref.getElementsByTagName('note')[0]
                        subfields.append(('s', xml_to_text(r)))
                        record_add_field(rec, '999', ind1='C', ind2='5',
                                         subfields=subfields)
                    except IndexError:
                        # references without 'note' tag
                        subfields.append(('s', xml_to_text(ref)))
                        record_add_field(rec, '999', ind1='C', ind2='5',
                                         subfields=subfields)
            else:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Parse an OUP/PTEP NLM article file and return its MARCXML string.

    Fix vs. original: the two ``logger.warning`` calls in the missing-PDF
    branches ran unconditionally although ``logger`` defaults to ``None``
    (the rest of the function guards with ``if logger:``), raising
    ``AttributeError``; they are now guarded. The bare ``except:`` around
    the self-raised ``MissingFFTError`` is narrowed to that exception.

    :param f_path: path to the article ``.xml`` file; the fulltext PDF
        and archival PDF/A are looked up relative to it.
    :param publisher: publisher label stored in 520__9 and 980__b.
    :param collection: collection name stored in 980__a.
    :param logger: optional logger; when ``None``, logging is skipped.
    :returns: MARCXML for the article as a string.
    """
    xml = super(NLMParser, self).get_article(f_path)
    rec = create_record()
    title = super(NLMParser, self).get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c',
                    super(NLMParser, self).get_publication_date(xml,
                                                                logger))])
    journal, issn, volume, issue, first_page, last_page, year, doi = super(
        NLMParser, self).get_publication_information(xml)
    journal = "PTEP"  # Let's override the journal information
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    page_count = super(NLMParser, self).get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    arxiv = self.get_arxiv_id(xml)
    if arxiv:
        record_add_field(rec, '037',
                         subfields=[('9', 'arXiv'),
                                    ('a', format_arxiv_id(arxiv))])
    # Authors: first one goes to 100, the rest to 700.
    authors = super(NLMParser, self).get_authors(xml)
    first_author = True
    for author in authors:
        if author.get('surname'):
            subfields = [('a', '%s, %s'
                          % (author.get('surname'),
                             author.get('given_name')
                             or author.get('initials', '')))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = super(NLMParser, self).get_abstract(xml)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])
    record_add_field(
        rec, '540',
        subfields=[('a', 'CC-BY-3.0'),
                   ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright = super(NLMParser, self).get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = super(NLMParser, self).get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    ## Oxford is giving us bad keywords. Better ignore them.
    #if keywords['other']:
        #for keyword in keywords['other']:
            #record_add_field(rec, '653', ind1='1',
            #                 subfields=[('a', keyword), ('9', 'author')])
    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        # No page range: fall back to the electronic location id.
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    # Keep only non-empty, non-placeholder subfields for 773.
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal), ('v', volume), ('n', issue),
                        ('c', pages), ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    # get_references() populates self.references; one 999C5 per entry.
    # NOTE: the loop below intentionally reuses/shadows doi, issue,
    # title, volume and year — later messages use the last values.
    self.get_references(xml)
    for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
        subfields = []
        if doi:
            subfields.append(('a', doi))
        for author in authors:
            subfields.append(('h', author))
        if issue:
            subfields.append(('n', issue))
        if label:
            subfields.append(('o', label))
        if year:
            subfields.append(('y', year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if title and volume and year and page:
            subfields.append(('s', '%s %s (%s) %s'
                              % (title, volume, year, page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s'
                                    % (title, volume, year, page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    # Fulltext attachments: plain PDF next to the XML, hires PDF/A in
    # the archival_pdfs sibling directory.
    f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')
    if exists(f_path_pdf):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdf), ('n', 'main')])
    else:
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF."
                               % (doi,))
            if logger:
                logger.warning("Record %s doesn't contain PDF file."
                               % (doi,))
    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdfa), ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    else:
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF/A."
                               % (doi,))
            if logger:
                logger.warning("Record %s doesn't contain PDF/A file."
                               % (doi,))
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Parse an OUP/PTEP NLM article file and return its MARCXML string.

    Fix vs. original: the two ``logger.warning`` calls in the missing-PDF
    branches ran unconditionally although ``logger`` defaults to ``None``
    (the rest of the function guards with ``if logger:``), raising
    ``AttributeError``; they are now guarded. The bare ``except:`` around
    the self-raised ``MissingFFTError`` is narrowed to that exception.

    :param f_path: path to the article ``.xml`` file; the fulltext PDF
        and archival PDF/A are looked up relative to it.
    :param publisher: publisher label stored in 520__9 and 980__b.
    :param collection: collection name stored in 980__a.
    :param logger: optional logger; when ``None``, logging is skipped.
    :returns: MARCXML for the article as a string.
    """
    xml = super(NLMParser, self).get_article(f_path)
    rec = create_record()
    title = super(NLMParser, self).get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c',
                    super(NLMParser, self).get_publication_date(xml,
                                                                logger))])
    journal, issn, volume, issue, first_page, last_page, year, doi = super(
        NLMParser, self).get_publication_information(xml)
    journal = "PTEP"  # Let's override the journal information
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    page_count = super(NLMParser, self).get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    arxiv = self.get_arxiv_id(xml)
    if arxiv:
        record_add_field(rec, '037',
                         subfields=[('9', 'arXiv'),
                                    ('a', format_arxiv_id(arxiv))])
    # Authors: first one goes to 100, the rest to 700.
    authors = super(NLMParser, self).get_authors(xml)
    first_author = True
    for author in authors:
        if author.get('surname'):
            subfields = [('a', '%s, %s'
                          % (author.get('surname'),
                             author.get('given_name')
                             or author.get('initials', '')))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = super(NLMParser, self).get_abstract(xml)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])
    record_add_field(
        rec, '540',
        subfields=[('a', 'CC-BY-3.0'),
                   ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright = super(NLMParser, self).get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = super(NLMParser, self).get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    ## Oxford is giving us bad keywords. Better ignore them.
    #if keywords['other']:
        #for keyword in keywords['other']:
            #record_add_field(rec, '653', ind1='1',
            #                 subfields=[('a', keyword), ('9', 'author')])
    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        # No page range: fall back to the electronic location id.
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    # Keep only non-empty, non-placeholder subfields for 773.
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal), ('v', volume), ('n', issue),
                        ('c', pages), ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    # get_references() populates self.references; one 999C5 per entry.
    # NOTE: the loop below intentionally reuses/shadows doi, issue,
    # title, volume and year — later messages use the last values.
    self.get_references(xml)
    for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
        subfields = []
        if doi:
            subfields.append(('a', doi))
        for author in authors:
            subfields.append(('h', author))
        if issue:
            subfields.append(('n', issue))
        if label:
            subfields.append(('o', label))
        if year:
            subfields.append(('y', year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if title and volume and year and page:
            subfields.append(
                ('s', '%s %s (%s) %s' % (title, volume, year, page)))
        elif not plain_text:
            subfields.append(
                ('m', ('%s %s %s %s' % (title, volume, year, page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    # Fulltext attachments: plain PDF next to the XML, hires PDF/A in
    # the archival_pdfs sibling directory.
    f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')
    if exists(f_path_pdf):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdf), ('n', 'main')])
    else:
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF."
                               % (doi, ))
            if logger:
                logger.warning("Record %s doesn't contain PDF file."
                               % (doi, ))
    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdfa), ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    else:
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(
                alert_admin=True,
                prefix="Oxford paper: %s is missing PDF/A." % (doi, ))
            if logger:
                logger.warning("Record %s doesn't contain PDF/A file."
                               % (doi, ))
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def get_record_rich(self, filename, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_rich directory

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callable turning an
        unstructured reference string into refextract MARCXML.

    :returns: a string with the marc xml version of the file.
    """
    self.document = parse(filename)
    rec = create_record()
    articles = self.document.getElementsByTagName('ArticleID')
    for article in articles:
        article_type = article.getAttribute('Type')
        if not article_type == 'Article':
            # Anything that is not a regular article is skipped entirely.
            return ''
        doi = get_value_in_tag(self.document, 'DOI')
        date = ''
        # Prefer the acceptance date; fall back to the online date.
        for tag in self.document.getElementsByTagName('Accepted'):
            year = get_value_in_tag(tag, 'Year')
            month = get_value_in_tag(tag, 'Month').zfill(2)
            day = get_value_in_tag(tag, 'Day').zfill(2)
            date = "%s-%s-%s" % (year, month, day)
        if not date:
            for tag in self.document.getElementsByTagName('OnlineDate'):
                year = get_value_in_tag(tag, 'Year')
                month = get_value_in_tag(tag, 'Month').zfill(2)
                day = get_value_in_tag(tag, 'Day').zfill(2)
                date = "%s-%s-%s" % (year, month, day)
        first_page = get_value_in_tag(article, 'FirstPage')
        last_page = get_value_in_tag(article, 'LastPage')
        subjects = article.getElementsByTagName('Keyword')
        subjects = map(xml_to_text, subjects)
        subject = ', '.join(subjects)
        copyright_statement = get_value_in_tag(article, 'Copyright')
    journal = get_value_in_tag(self.document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    issues = self.document.getElementsByTagName('IssueID')
    for issue in issues:
        volume += get_value_in_tag(issue, 'Volume')
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(self.document, 'Title')
    authors = self.document.getElementsByTagName('Author')
    affiliations = self.document.getElementsByTagName('Affiliation')

    def affiliation_pair(a):
        # Map an Affiliation node to (ID attribute, affiliation text).
        return a.getAttribute('ID'), get_value_in_tag(
            a, 'UnstructuredAffiliation')

    affiliations = map(affiliation_pair, affiliations)
    affiliations = dict(affiliations)

    def author_pair(a):
        # Build ("Last, First Middle", affiliation-or-empty) for an
        # Author node, resolving the affiliation via its Label.
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                'AffiliationID')[0].getAttribute('Label')
            affiliation = affiliations[affid]
        except IndexError:
            affiliation = ''
        except KeyError:
            affiliation = ''
        return name, affiliation

    authors = map(author_pair, authors)
    abstract = get_value_in_tag(self.document, 'Abstract')
    references = self.document.getElementsByTagName('Bibliomixed')
    for reference in references:
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(
                    ('s', '%s,%s,%s' % (ref_journal, ref_volume,
                                        ref_page)))
            # Remove the structured part so xml_to_text below only sees
            # the remaining free text of the reference.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            ref_xml = ref_extract_callback(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                if code == 'm' and bibliosets:
                    # Structured data already captured above; skip the
                    # duplicate free-text subfield.
                    continue
                else:
                    subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'EDPSciences')])
    # First author goes to 100, the rest to 700.
    first_author = True
    for author in authors:
        if first_author:
            subfields = [('a', author[0])]
            if author[1]:
                subfields.append(('v', author[1]))
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            subfields = [('a', author[0])]
            if author[1]:
                subfields.append(('v', author[1]))
            record_add_field(rec, '700', subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # NOTE(review): variable name is misspelled ("nuber") and the
            # count is last - first without +1 — confirm whether the
            # off-by-one is intended before changing it.
            nuber_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, '300',
                             subfields=[('a', str(nuber_of_pages))])
        except ValueError:
            # Non-numeric page labels: skip the page count silently.
            pass
        subfields.append(('c', '%s-%s' % (first_page, last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'),
                                    ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, xml_file):
    """Read an APS JATS XML file and return its MARCXML string.

    Fix vs. original: ``self._get_copyright()`` was called twice in a
    row (a duplicated line); the redundant second call is removed.

    :param xml_file: path to the JATS XML file to convert.
    :returns: MARCXML string for the article, or ``""`` when the record
        cannot be serialized due to a bad character.
    :raises ApsPackageXMLError: when the file does not look like JATS
        (it contains a top-level "meta" tag).
    """
    self.document = parse(xml_file)
    # A "meta" tag signals a non-JATS (wrong-format) file.
    if get_value_in_tag(self.document, "meta"):
        raise ApsPackageXMLError("The XML format of %s is not correct"
                                 % (xml_file, ))
    page_count = self._get_page_count()
    rec = create_record()
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    pacscodes = self._get_pacscodes()
    for pacscode in pacscodes:
        record_add_field(rec, '084', subfields=[('2', 'PACS'),
                                                ('a', pacscode)])
    subject = self._get_subject()
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'APS'), ('a', subject)])
    keywords = self._get_keywords()
    if keywords:
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', ', '.join(keywords)),
                                    ('9', 'author')])
    title, subtitle, _ = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        subfields.append(('a', title))
    record_add_field(rec, '245', subfields=subfields)
    journal, volume, issue, year, start_date, doi,\
        article_id, _, _ = self._get_publication_information()
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    abstract = self._get_abstract()
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'APS')])
    license, license_type, license_url = self._get_license()
    subfields = []
    if license:
        subfields.append(('a', license))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)
    # Copyright: prefer structured holder/year, else the free statement.
    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542', subfields=[('d', c_holder),
                                                ('g', c_year),
                                                ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542', subfields=[('f', c_statement),
                                                ('e', 'Article')])
    record_add_field(rec, '773', subfields=[('p', journal),
                                            ('v', volume),
                                            ('n', issue),
                                            ('y', year),
                                            ('c', article_id)])
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'Citeable')])
    record_add_field(rec, '980', subfields=[('a', 'Published')])
    self._add_authors(rec)
    self._add_references(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        sys.stderr.write("""Found a bad char in the file
                         for the article """ + doi)
        return ""
def get_record(self, filename, ref_extract_callback=None):
    """Get the MARCXML of the files in xaml_jp directory.

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: callback to be used to extract
                                 unstructured references. It should
                                 return a marcxml formated string
                                 of the reference.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file.
    """
    self.document = parse(filename)

    # Only convert known article types; everything else is skipped.
    article_type = self._get_article_type()
    if article_type not in [
            'research-article',
            'corrected-article',
            'original-article',
            'introduction',
            'letter',
            'correction',
            'addendum',
            'review-article',
            'rapid-communications'
    ]:
        return ""

    rec = create_record()
    title, subtitle, notes = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        title = fix_title_capitalization(title)
        subfields.append(('a', title))
    record_add_field(rec, '245', subfields=subfields)
    # Title footnotes become 500 general-note fields.
    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            record_add_field(rec, '500', subfields=[('a', note)])
    keywords = self._get_keywords()
    for keyword in keywords:
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', keyword), ('9', 'author')])
    journal, volume, issue, year, date, doi, page,\
        fpage, lpage = self._get_publication_information()
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    abstract = self._get_abstract()
    if abstract:
        abstract = convert_html_subscripts_to_latex(abstract)
        record_add_field(rec, '520',
                         subfields=[('a', abstract),
                                    ('9', 'World Scientific')])
    license, license_type, license_url = self._get_license()
    subfields = []
    if license:
        subfields.append(('a', license))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)
    if license_type == 'open-access':
        # Only open-access articles get a fulltext FFT attached.
        self._attach_fulltext(rec, doi)
    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, '300', subfields=[('a', number_of_pages)])
    # Copyright: prefer structured holder/year, else the free statement.
    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542', subfields=[('d', c_holder),
                                                ('g', c_year),
                                                ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542', subfields=[('f', c_statement),
                                                ('e', 'Article')])
    # Journal reference (773): prefer the explicit page range.
    subfields = []
    if journal:
        subfields.append(('p', journal))
    if issue:
        subfields.append(('n', issue))
    if volume:
        subfields.append(('v', volume))
    if fpage and lpage:
        subfields.append(('c', '%s-%s' % (fpage, lpage)))
    elif page:
        subfields.append(('c', page))
    if year:
        subfields.append(('y', year))
    if article_type == 'correction':
        subfields.append(('m', 'Erratum'))
    elif article_type == 'addendum':
        subfields.append(('m', 'Addendum'))
    record_add_field(rec, '773', subfields=subfields)
    collections = self.get_collection(journal)
    for collection in collections:
        record_add_field(rec, '980', subfields=[collection])
    self._add_authors(rec)
    if article_type in ['correction', 'addendum']:
        # Errata and addenda also record the DOI of the original paper.
        related_article = self._get_related_article()
        if related_article:
            record_add_field(rec, '024', ind1='7',
                             subfields=[('a', related_article),
                                        ('2', 'DOI')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def main(args):
    """Run the filtering.

    Reads an OAI harvest file of PoS records, scrapes the PoS server for
    each contribution's PDF, builds insert/append/error MARCXML batches,
    optionally uploads each record via FTP, and mails a summary.

    :param args: command-line arguments; exactly one element, the input
        harvest filename.
    """
    if len(args) != 1:
        print("Usage: python bibfilter_oaipos2inspire.py input_filename")
        sys.exit(1)
    input_filename = args[0]
    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)
    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []
    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        # Identifier layout: "...:CONFERENCE/CONTRIBUTION" after the
        # second colon.
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        # Look for an existing record to decide insert vs. append.
        query = "773__p:pos 773__v:%s 773__c:%s" % \
            (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query, ))
        results = perform_request_search(p=query, of="id")
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False
        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'PoS server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', filename),
                                            ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001',
                                         controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url, ))
                # Only the first PDF link is used.
                break
        if not found:
            error_records.append(rec)
        # upload to FTP
        if not CFG_POS_DEBUG:
            # NOTE(review): the local name shadows the tempfile module;
            # the mkdtemp directory itself is never removed.
            tempfile_path = join(mkdtemp(), "{0}.xml".format(contribution))
            with open(tempfile_path, 'w') as tempfile:
                tempfile.write(record_xml_output(rec))
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            remove(tempfile_path)
    # Write the three batches next to the input file and copy them into
    # the work folder.
    insert_filename = "%s.insert.xml" % (input_filename, )
    append_filename = "%s.append.xml" % (input_filename, )
    errors_filename = "%s.errors.xml" % (input_filename, )
    created_files = []
    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))
    total_records = len(append_records) + len(insert_records) + len(
        error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
        (total_records, len(insert_records),
         len(append_records), len(error_records),
         "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl, )
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL,
                      subject, body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL, ))
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML string for one NLM article file.

    :param f_path: path to the article's ``*_nlm.xml`` file.
    :param publisher: abstract provenance label stored in 520 $9.
    :param collection: optional 980 $a collection name.
    :param logger: optional logger; all logging is skipped when None.
    :returns: the record serialized with ``record_xml_output``.
    """
    xml = self.get_article(f_path)
    rec = create_record()
    title = self.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(rec, '260', subfields=[('c', self.get_publication_date(xml, logger))])
    journal, issn, volume, issue, first_page, last_page, year, doi = self.get_publication_information(xml)
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    authors = self.get_authors(xml)
    first_author = True
    for author in authors:
        # Prefer "Surname, Given" when a surname is present; fall back
        # to the single 'name' field otherwise.
        if author.get('surname'):
            subfields = [('a', '%s, %s' % (author.get('surname'),
                                           author.get('given_name') or
                                           author.get('initials', '')))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        # First author goes into 100, the rest into 700.
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    page_count = self.get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    abstract = self.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', publisher)])
    record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'),
                                            ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright = self.get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = self.get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    if keywords['other']:
        for keyword in keywords['other']:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
    # Page range when available, otherwise the elocation-id (e-only
    # articles have no page numbers).
    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal),
                        ('v', volume),
                        ('n', issue),
                        ('c', pages),
                        ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    # Populates self.references as a side effect.
    self.get_references(xml)
    for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
        subfields = []
        if doi:
            subfields.append(('a', doi))
        for author in authors:
            subfields.append(('h', author))
        if issue:
            subfields.append(('n', issue))
        if label:
            subfields.append(('o', label))
        if year:
            subfields.append(('y', year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if title and volume and year and page:
            subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s' % (title, volume, year, page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
    # record_add_field(rec, 'FFT', subfields=[('a', join(path, 'main.pdf'))])
    pdf_path = join(dirname(f_path), 'BodyRef', 'PDF',
                    basename(f_path)[:-len('_nlm.xml')] + '.pdf')
    try:
        # FIX: the probe handle was never closed (leaked descriptor)
        # and the bare `except:` could mask unrelated errors.  Only the
        # open() belongs in the try block.
        open(pdf_path).close()
    except IOError:
        register_exception(alert_admin=True)
        # FIX: guard the optional logger as done everywhere else in
        # this method — previously crashed when logger was None.
        if logger:
            logger.error("No PDF for paper: %s" % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', f_path),
                                                ('n', 'main')])
    else:
        record_add_field(rec, 'FFT', subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def main(args):
    """Run the filtering.

    Reads an OAI-PMH XML harvest file of PoS (Proceedings of Science)
    records, resolves each contribution page on the PoS server (with
    timeout and connection-error handling), downloads the fulltext PDF
    and sorts records into insert / append / error MARCXML output
    files.  Optionally uploads per-record MARCXML via FTP and mails a
    summary report.

    :param args: command-line arguments; expects exactly one element,
                 the path of the harvested XML input file.
    """
    if len(args) != 1:
        print("Usage: python bibfilter_oaipos2inspire.py input_filename")
        sys.exit(1)
    input_filename = args[0]
    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)
    insert_records = []   # new records, not yet in the system
    append_records = []   # records matched to an existing recid
    error_records = []    # records whose fulltext could not be fetched
    files_uploaded = []
    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        # Identifier looks like "oai:pos.sissa.it:CONF/CONTRIB";
        # split out conference and contribution parts.
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        # Look for an already-existing record with the same journal
        # (773) publication data.
        query = "773__p:pos 773__v:%s 773__c:%s" % \
            (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")
        url = urljoin(base_url, '/contribution?id=%s' % identifier)
        session = requests.session()
        try:
            r = session.get(url, timeout=60)
        except (ConnectionError, Timeout):
            # Server unreachable: record the failure and move on to
            # the next harvested record.
            register_exception()
            error_records.append(rec)
            continue
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False
        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('/pdf'):
                # handle relative URLs
                url = urljoin(base_url, url)
                found = True
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4', subfields=[
                    ('u', url),
                    ('y', 'PoS server')
                ])
                record_add_field(rec, 'FFT', subfields=[('a', filename),
                                                        ('t', 'PoS'),
                                                        ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        # Existing record: emit an append with its recid.
                        recid = results[0]
                        record_add_field(rec, '001',
                                         controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    # NOTE(review): on download failure the record is
                    # dropped from all output lists (only `found` is
                    # True) — confirm it should not go to error_records.
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)
        # upload to FTP
        if not CFG_POS_DEBUG:
            tempfile_path = join(mkdtemp(), "{0}.xml".format(contribution))
            with open(tempfile_path, 'w') as tempfile:
                tempfile.write(record_xml_output(rec))
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            remove(tempfile_path)
    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)
    created_files = []
    # Only copy output files that actually contain records.
    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))
    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
        (total_records,
         len(insert_records),
         len(append_records),
         len(error_records),
         "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def get_record(self, filename, ref_extract_callback=None):
    """Get the MARCXML of the files in xaml_jp directory.

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: callback to be used to extract
                                 unstructured references. It should
                                 return a marcxml formated string
                                 of the reference.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file, or an
              empty string when the article type is not supported or
              serialization fails.
    """
    self.document = parse(filename)
    # Only convert supported article types; anything else yields "".
    article_type = self._get_article_type()
    if article_type not in ['research-article',
                            'corrected-article',
                            'original-article',
                            'introduction',
                            'letter',
                            'correction',
                            'addendum',
                            'review-article',
                            'rapid-communications']:
        return ""
    rec = create_record()
    title, subtitle, notes = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        title = fix_title_capitalization(title)
        subfields.append(('a', title))
    record_add_field(rec, '245', subfields=subfields)
    # Title footnotes become general notes (500).
    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            record_add_field(rec, '500', subfields=[('a', note)])
    keywords = self._get_keywords()
    for keyword in keywords:
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', keyword), ('9', 'author')])
    journal, volume, issue, year, date, doi, page,\
        fpage, lpage = self._get_publication_information()
    if date:
        record_add_field(rec, '260', subfields=[('c', date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    abstract = self._get_abstract()
    if abstract:
        abstract = convert_html_subscripts_to_latex(abstract)
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'World Scientific')])
    license, license_type, license_url = self._get_license()
    subfields = []
    if license:
        subfields.append(('a', license))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)
    # Fulltext is only attached for open-access articles.
    if license_type == 'open-access':
        self._attach_fulltext(rec, doi)
    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, '300', subfields=[('a', number_of_pages)])
    c_holder, c_year, c_statement = self._get_copyright()
    # Prefer structured holder+year; fall back to the free-text
    # copyright statement.
    if c_holder and c_year:
        record_add_field(rec, '542', subfields=[('d', c_holder),
                                                ('g', c_year),
                                                ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542', subfields=[('f', c_statement),
                                                ('e', 'Article')])
    subfields = []
    if journal:
        subfields.append(('p', journal))
    if issue:
        subfields.append(('n', issue))
    if volume:
        subfields.append(('v', volume))
    # Prefer an explicit first-last page range over the bare page value.
    if fpage and lpage:
        subfields.append(('c', '%s-%s' % (fpage, lpage)))
    elif page:
        subfields.append(('c', page))
    if year:
        subfields.append(('y', year))
    if article_type == 'correction':
        subfields.append(('m', 'Erratum'))
    elif article_type == 'addendum':
        subfields.append(('m', 'Addendum'))
    record_add_field(rec, '773', subfields=subfields)
    collections = self.get_collection(journal)
    for collection in collections:
        # NOTE(review): each `collection` is passed as a ready-made
        # subfield entry — presumably a (code, value) tuple from
        # get_collection(); confirm against that helper.
        record_add_field(rec, '980', subfields=[collection])
    self._add_authors(rec)
    # Corrections/addenda also carry the DOI of the article they amend.
    if article_type in ['correction', 'addendum']:
        related_article = self._get_related_article()
        if related_article:
            record_add_field(rec, '024', ind1='7',
                             subfields=[('a', related_article),
                                        ('2', 'DOI')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML string for one SCOAP3-style article file.

    :param f_path: path to the article's ``*.xml.scoap`` file.
    :param publisher: 980 $b publisher label.
    :param collection: 980 $a collection name.
    :param logger: optional logger; logging is skipped when None.
    :returns: the record serialized with ``record_xml_output``.
    """
    #path = abspath(join(f_path, pardir))
    xml = self.get_article(f_path)
    rec = create_record()
    title = self.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    publication_date = self.get_publication_date(xml)
    if publication_date:
        record_add_field(rec, '260', subfields=[('c', publication_date)])
    journal, issn, volume, issue, first_page, pages, year, doi = self.get_publication_information(xml)
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    arxiv_id = self.get_arxiv_id(xml)
    if arxiv_id:
        record_add_field(rec, '037', subfields=[('a', arxiv_id),
                                                ('9', 'arXiv')])
    if logger:
        logger.info("Creating record: %s %s" % (f_path, doi))
    authors = self.get_authors(xml)
    first_author = True
    for author in authors:
        subfields = [('a', '%s, %s' % (author['surname'],
                                       author.get('given_name') or
                                       author.get('initials')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        # First author goes into 100, the rest into 700.
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract)])
    record_add_field(rec, '540', subfields=[('a', 'CC-BY-4.0'),
                                            ('u', 'http://creativecommons.org/licenses/by/4.0/')])
    copyright = self.get_copyright(xml)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = self.get_keywords(xml)
    if keywords:
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
    record_add_field(rec, "300", subfields=[('a', pages)])
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal),
                        ('v', volume),
                        ('c', first_page),
                        ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    references = self.get_references(xml)
    for label, authors, doi, issue, page, title, volume, year in references:
        subfields = []
        if doi:
            subfields.append(('a', doi))
        for author in authors:
            subfields.append(('h', author))
        if issue:
            subfields.append(('n', issue))
        if label:
            subfields.append(('o', label))
        if page:
            subfields.append(('p', page))
            subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
        if title:
            subfields.append(('t', title))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    folder_name = join('/', *(f_path.split('/')[0:-1]))
    # FIX: was `.rstrip('.xml.scoap')`, which strips any trailing run
    # of the CHARACTERS ".xmlscoap" (a character set, not a suffix) and
    # mangled names ending in those letters.  Remove the exact suffix
    # instead.
    pdf_name = f_path.split('/')[-1]
    suffix = '.xml.scoap'
    if pdf_name.endswith(suffix):
        pdf_name = pdf_name[:-len(suffix)]
    pdf_name += '.pdf'
    pdf_path = join(folder_name, 'BodyRef/PDF', pdf_name)
    # FIX: Python-2-only `print pdf_path` statement — the call form
    # behaves identically for a single argument on both 2 and 3.
    print(pdf_path)
    if exists(pdf_path):
        record_add_field(rec, 'FFT', subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
    else:
        # Don't know why it doesn't work????????????
        # register_exception(alert_admin=True)
        if logger:
            logger.error("Record %s doesn't contain PDF file." % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', self.get_body_ref(xml)),
                                                ('n', 'main')])
    record_add_field(rec, '980', subfields=[('a', collection),
                                            ('b', publisher)])
    return record_xml_output(rec)
def get_record(self, xml_file):
    """
    Reads a xml file in JATS format and returns
    a xml string in marc format

    :param xml_file: path to the APS JATS XML file.
    :returns: the record serialized with ``record_xml_output``, or ""
              when a bad character prevents serialization.
    :raises ApsPackageXMLError: when the file contains a ``meta`` tag,
        which marks a non-JATS/incorrect APS package.
    """
    self.document = parse(xml_file)
    if get_value_in_tag(self.document, "meta"):
        raise ApsPackageXMLError("The XML format of %s is not correct"
                                 % (xml_file,))
    page_count = self._get_page_count()
    rec = create_record()
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    pacscodes = self._get_pacscodes()
    for pacscode in pacscodes:
        record_add_field(rec, '084', subfields=[('2', 'PACS'),
                                                ('a', pacscode)])
    subject = self._get_subject()
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'APS'), ('a', subject)])
    keywords = self._get_keywords()
    if keywords:
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', ', '.join(keywords)),
                                    ('9', 'author')])
    title, subtitle, _ = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        subfields.append(('a', title))
    record_add_field(rec, '245', subfields=subfields)
    journal, volume, issue, year, start_date, doi,\
        article_id, _, _ = self._get_publication_information()
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    abstract = self._get_abstract()
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'APS')])
    license, license_type, license_url = self._get_license()
    subfields = []
    if license:
        subfields.append(('a', license))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)
    # FIX: _get_copyright() was called twice in a row; the duplicate
    # call has been removed.
    c_holder, c_year, c_statement = self._get_copyright()
    # Prefer structured holder+year; fall back to the free-text
    # copyright statement.
    if c_holder and c_year:
        record_add_field(rec, '542', subfields=[('d', c_holder),
                                                ('g', c_year),
                                                ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542', subfields=[('f', c_statement),
                                                ('e', 'Article')])
    record_add_field(rec, '773', subfields=[('p', journal),
                                            ('v', volume),
                                            ('n', issue),
                                            ('y', year),
                                            ('c', article_id)])
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'Citeable')])
    record_add_field(rec, '980', subfields=[('a', 'Published')])
    self._add_authors(rec)
    self._add_references(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        sys.stderr.write("""Found a bad char in the file
                            for the article """ + doi)
        return ""