def check_records(records, empty=False): fields = ['100', '700'] #filepath = "/opt/invenio/var/data/files/g0/" #filepath2 = "/opt/invenio/var/data/files/g1/" filepath = '/opt/invenio/var/data/files/' filepaths = os.listdir(filepath) for record in records: first_author = True if is_elsevier(record): doc_ids = get_doc_ids(int(record.record_id)) for doc_id in doc_ids: # try: # latest_file = get_latest_file(filepath + str(doc_id) + '/') # except: # latest_file = get_latest_file(filepath2 + str(doc_id) + '/') latest_file = None for folder in filepaths: try: latest_file = get_latest_file(filepath + '/' + folder + '/' + str(doc_id) + '/') if latest_file: break except: print "No folder with name %s in %s directory" % (doc_id, folder) try: xml = parse(latest_file) except: record.warn("Problem parssing XML file. Aborting") break authors = get_authors(xml) delete_fields(record, fields) for author in authors: field = '100' if first_author else '700' first_author = False subfields = [] author_name = (author['surname'], author.get( 'given_name') or author.get('initials')) author_name = ('a', '%s, %s' % author_name) subfields.append(author_name) if 'orcid' in author: subfields.append(('j', author['orcid'])) if 'affiliation' in author: for aff in author["affiliation"]: subfields.append(('v', aff)) add_nations_field(subfields) if author.get('email'): subfields.append(('m', author['email'])) record.add_field(field+'__', value='', subfields=subfields)
def check_records(records):
    """Amend author fields (100/700) of Springer records.

    Reads the latest attached XML fulltext of each Springer record,
    parses it with the JATS or NLM parser (depending on the DTD), and
    rebuilds the 100__/700__ author fields.

    :param records: iterable of checker record objects (amended in place).
    """
    for record in records:
        if is_springer(record):
            rec_doc = BibRecDocs(int(record.record_id))
            rec_docs = rec_doc.list_latest_files()
            for doc in rec_docs:
                if doc.get_format() == '.xml':
                    # BUG FIX: the file handle was never closed.
                    f = open(doc.get_full_path())
                    try:
                        content = f.read()
                    finally:
                        f.close()
                    try:
                        del record['100']
                        del record['700']
                        record.amended = True
                    except Exception:
                        # Fields may be absent; best-effort removal.
                        pass
                    first_author = True
                    try:
                        # JATS and NLM share tag names but need different parsers.
                        if "-//NLM//DTD JATS" in content:
                            jats = JATSParser()
                            authors = jats.get_authors(parseString(content))
                        else:
                            app = NLMParser()
                            authors = app.get_authors(parseString(content))
                    except Exception:
                        record.warn('Problem with parsing XML.')
                        continue
                    for author in authors:
                        if author.get('surname'):
                            subfields = [
                                ('a', '%s, %s' % (author.get('surname'),
                                                  author.get('given_name') or
                                                  author.get('initials', '')))]
                        else:
                            # Collaborations etc. only carry a plain name.
                            subfields = [('a', '%s' % (author.get('name', '')))]
                        if 'orcid' in author:
                            subfields.append(('j', author['orcid']))
                        if 'affiliation' in author:
                            for aff in author["affiliation"]:
                                subfields.append(('v', aff))
                            add_nations_field(subfields)
                        if author.get('email'):
                            subfields.append(('m', author['email']))
                        if first_author:
                            record.add_field('100__', value='', subfields=subfields)
                            first_author = False
                        else:
                            record.add_field('700__', value='', subfields=subfields)
def check_records(records):
    """Amend author fields (100/700) of Springer records.

    Same contract as the sibling Springer checker: read the latest XML
    fulltext, pick the JATS or NLM parser by DTD, rebuild 100__/700__.

    :param records: iterable of checker record objects (amended in place).
    """
    for record in records:
        if is_springer(record):
            rec_doc = BibRecDocs(int(record.record_id))
            rec_docs = rec_doc.list_latest_files()
            for doc in rec_docs:
                if doc.get_format() == '.xml':
                    # BUG FIX: close the fulltext handle (was leaked).
                    with open(doc.get_full_path()) as f:
                        content = f.read()
                    try:
                        del record['100']
                        del record['700']
                        record.amended = True
                    except Exception:
                        # Fields may be absent; best-effort removal.
                        pass
                    first_author = True
                    try:
                        if "-//NLM//DTD JATS" in content:
                            jats = JATSParser()
                            authors = jats.get_authors(parseString(content))
                        else:
                            app = NLMParser()
                            authors = app.get_authors(parseString(content))
                    except Exception:
                        record.warn('Problem with parsing XML.')
                        continue
                    for author in authors:
                        if author.get('surname'):
                            subfields = [
                                ('a', '%s, %s' % (author.get('surname'),
                                                  author.get('given_name') or
                                                  author.get('initials', '')))]
                        else:
                            subfields = [('a', '%s' % (author.get('name', '')))]
                        if 'orcid' in author:
                            subfields.append(('j', author['orcid']))
                        if 'affiliation' in author:
                            for aff in author["affiliation"]:
                                subfields.append(('v', aff))
                            add_nations_field(subfields)
                        if author.get('email'):
                            subfields.append(('m', author['email']))
                        if first_author:
                            record.add_field('100__', value='', subfields=subfields)
                            first_author = False
                        else:
                            record.add_field('700__', value='', subfields=subfields)
def check_records(records, empty=False):
    """Amend author fields (100/700) of Elsevier records (g0 store).

    :param records: iterable of checker record objects (amended in place).
    :param empty: unused; kept for checker-plugin interface compatibility.
    """
    fields = ['100', '700']
    filepath = "/opt/invenio/var/data/files/g0/"
    for record in records:
        # BUG FIX: ``first_author`` was initialised once before the
        # records loop, so every record after the first put ALL of its
        # authors into 700__.  It must be reset per record.
        first_author = True
        if is_elsevier(record):
            doc_ids = get_doc_ids(int(record.record_id))
            for doc_id in doc_ids:
                latest_file = get_latest_file(filepath + str(doc_id) + '/')
                xml = parse(latest_file)
                authors = get_authors(xml)
                delete_fields(record, fields)
                for author in authors:
                    field = '100' if first_author else '700'
                    first_author = False
                    subfields = []
                    author_name = (author['surname'],
                                   author.get('given_name') or author.get('initials'))
                    subfields.append(('a', '%s, %s' % author_name))
                    if 'orcid' in author:
                        subfields.append(('j', author['orcid']))
                    if 'affiliation' in author:
                        for aff in author["affiliation"]:
                            subfields.append(('v', aff))
                        add_nations_field(subfields)
                    if author.get('email'):
                        subfields.append(('m', author['email']))
                    record.add_field(field + '__', value='', subfields=subfields)
def check_records(records, empty=False):
    """Amend author fields (100/700) of Elsevier records (g0 store).

    :param records: iterable of checker record objects (amended in place).
    :param empty: unused; kept for checker-plugin interface compatibility.
    """
    fields = ['100', '700']
    filepath = "/opt/invenio/var/data/files/g0/"
    for record in records:
        # BUG FIX: reset per record — the original set this once before
        # the loop, demoting all authors of subsequent records to 700__.
        first_author = True
        if is_elsevier(record):
            doc_ids = get_doc_ids(int(record.record_id))
            for doc_id in doc_ids:
                latest_file = get_latest_file(filepath + str(doc_id) + '/')
                xml = parse(latest_file)
                authors = get_authors(xml)
                delete_fields(record, fields)
                for author in authors:
                    field = '100' if first_author else '700'
                    first_author = False
                    subfields = []
                    author_name = (author['surname'],
                                   author.get('given_name') or author.get('initials'))
                    subfields.append(('a', '%s, %s' % author_name))
                    if 'orcid' in author:
                        subfields.append(('j', author['orcid']))
                    if 'affiliation' in author:
                        for aff in author["affiliation"]:
                            subfields.append(('v', aff))
                        add_nations_field(subfields)
                    if author.get('email'):
                        subfields.append(('m', author['email']))
                    record.add_field(field + '__', value='', subfields=subfields)
def get_record(self, path=None, no_pdf=False,
               test=False, refextract_callback=None):
    """Convert a record to MARCXML format.

    :param path: path to a record.
    :type path: string
    :param no_pdf: when True, skip attaching PDF FFT entries.
    :type no_pdf: bool
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: marcxml formated string.
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        # No explicit publication date in the XML: fall back to today.
        record_add_field(rec, '260',
                         subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540', subfields=[('a', license),
                                                ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))
    authors = self.get_authors(xml_doc)
    first_author = True
    for author in authors:
        author_name = (author['surname'],
                       author.get('given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:
            # Non-numeric page numbers: skip the page count.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license:
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                    + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                # No license: attach the fulltext but keep it hidden.
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'Elsevier'),
                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'),
                                                ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        subfields = filter(lambda x: x[1] and x[1] != '-',
                           [('p', journal), ('v', volume), ('n', issue),
                            ('c', pages), ('y', year)])
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # BUG FIX: the original query ended with an unbalanced quote
            # ('... AND NOT 980:DELETED"').
            query = '0247_a:"%s" AND NOT 980:DELETED' % (doi,)
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(
                        ".pdf;pdfa", exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: '
                               + doi)
                    self.logger.info(message)
                except Exception:
                    # Best-effort: no previous PDF/A to carry over.
                    pass
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi,))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                else:
                    if not old_pdf:
                        message = "Record " + doi
                        message += " doesn't contain PDF file."
                        self.logger.warning(message)
                        raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi,)
                register_exception(alert_admin=True, prefix=message)
        version = self.get_elsevier_version(find_package_name(path))
        record_add_field(rec, '583', subfields=[('l', version)])
        xml_path = join(path, 'main.xml')
        record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
        record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        # A bad character in the XML breaks MARCXML serialisation.
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Convert an Oxford/NLM article XML file to a MARCXML string.

    :param f_path: path to the article XML file.
    :param publisher: publisher name stored in 520__9 / 980__b.
    :param collection: collection name stored in 980__a.
    :param logger: optional logger; log calls are skipped when None.
    :returns: marcxml formated string.
    """
    xml = super(NLMParser, self).get_article(f_path)
    rec = create_record()
    title = super(NLMParser, self).get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c', super(NLMParser, self).get_publication_date(xml, logger))])
    journal, issn, volume, issue, first_page, last_page, year, doi = \
        super(NLMParser, self).get_publication_information(xml)
    journal = "PTEP"  # Let's override the journal information
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    page_count = super(NLMParser, self).get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    arxiv = self.get_arxiv_id(xml)
    if arxiv:
        record_add_field(rec, '037',
                         subfields=[('9', 'arXiv'),
                                    ('a', format_arxiv_id(arxiv))])
    authors = super(NLMParser, self).get_authors(xml)
    first_author = True
    for author in authors:
        if author.get('surname'):
            subfields = [('a', '%s, %s' % (author.get('surname'),
                                           author.get('given_name') or
                                           author.get('initials', '')))]
        else:
            # Collaborations etc. only carry a plain name.
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = super(NLMParser, self).get_abstract(xml)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', publisher)])
    record_add_field(
        rec, '540',
        subfields=[('a', 'CC-BY-3.0'),
                   ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright = super(NLMParser, self).get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = super(NLMParser, self).get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    ## Oxford is giving us bad keywords. Better ignore them.
    #if keywords['other']:
        #for keyword in keywords['other']:
            #record_add_field(rec, '653', ind1='1',
            #                 subfields=[('a', keyword), ('9', 'author')])
    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal), ('v', volume), ('n', issue),
                        ('c', pages), ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    self.get_references(xml)
    # BUG FIX: the reference loop used to rebind ``doi``/``title``/...,
    # so the log messages after the loop reported the *last reference's*
    # DOI instead of the article's. Use ``ref_``-prefixed names.
    for (ref_label, ref_authors, ref_doi, ref_issue, ref_page,
         ref_page_last, ref_title, ref_volume, ref_year, ext_link,
         plain_text) in self.references:
        subfields = []
        if ref_doi:
            subfields.append(('a', ref_doi))
        for author in ref_authors:
            subfields.append(('h', author))
        if ref_issue:
            subfields.append(('n', ref_issue))
        if ref_label:
            subfields.append(('o', ref_label))
        if ref_year:
            subfields.append(('y', ref_year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if ref_title and ref_volume and ref_year and ref_page:
            subfields.append(('s', '%s %s (%s) %s'
                              % (ref_title, ref_volume, ref_year, ref_page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s'
                                    % (ref_title, ref_volume,
                                       ref_year, ref_page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')
    if exists(f_path_pdf):
        record_add_field(rec, 'FFT', subfields=[('a', f_path_pdf),
                                                ('n', 'main')])
    else:
        # raise-then-catch so register_exception has an active exception.
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF."
                               % (doi,))
            if logger:  # BUG FIX: logger may be None
                logger.warning("Record %s doesn't contain PDF file."
                               % (doi,))
    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT', subfields=[('a', f_path_pdfa),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
    else:
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF/A."
                               % (doi,))
            if logger:  # BUG FIX: logger may be None
                logger.warning("Record %s doesn't contain PDF/A file."
                               % (doi,))
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Convert a Springer NLM article XML file to a MARCXML string.

    :param f_path: path to the ``*_nlm.xml`` article file.
    :param publisher: publisher name stored in 520__9 / 980__b.
    :param collection: collection name stored in 980__a.
    :param logger: optional logger; log calls are skipped when None.
    :returns: marcxml formated string.
    """
    xml = self.get_article(f_path)
    rec = create_record()
    title = self.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c', self.get_publication_date(xml, logger))])
    journal, issn, volume, issue, first_page, last_page, year, doi = \
        self.get_publication_information(xml)
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    authors = self.get_authors(xml)
    first_author = True
    for author in authors:
        if author.get('surname'):
            subfields = [('a', '%s, %s' % (author.get('surname'),
                                           author.get('given_name') or
                                           author.get('initials', '')))]
        else:
            # Collaborations etc. only carry a plain name.
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    page_count = self.get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    abstract = self.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', publisher)])
    record_add_field(
        rec, '540',
        subfields=[('a', 'CC-BY-3.0'),
                   ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright = self.get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = self.get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    if keywords['other']:
        for keyword in keywords['other']:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal), ('v', volume), ('n', issue),
                        ('c', pages), ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    self.get_references(xml)
    # BUG FIX: the reference loop used to rebind ``doi``/``title``/...,
    # so the "No PDF" log message below reported the *last reference's*
    # DOI instead of the article's. Use ``ref_``-prefixed names.
    for (ref_label, ref_authors, ref_doi, ref_issue, ref_page,
         ref_page_last, ref_title, ref_volume, ref_year, ext_link,
         plain_text) in self.references:
        subfields = []
        if ref_doi:
            subfields.append(('a', ref_doi))
        for author in ref_authors:
            subfields.append(('h', author))
        if ref_issue:
            subfields.append(('n', ref_issue))
        if ref_label:
            subfields.append(('o', ref_label))
        if ref_year:
            subfields.append(('y', ref_year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if ref_title and ref_volume and ref_year and ref_page:
            subfields.append(('s', '%s %s (%s) %s'
                              % (ref_title, ref_volume, ref_year, ref_page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s'
                                    % (ref_title, ref_volume,
                                       ref_year, ref_page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
    # record_add_field(rec, 'FFT', subfields=[('a', join(path, 'main.pdf'))])
    pdf_path = join(dirname(f_path), 'BodyRef', 'PDF',
                    basename(f_path)[:-len('_nlm.xml')] + '.pdf')
    try:
        # Probe that the PDF is present/readable.
        # BUG FIX: close the probe handle (the original leaked it).
        open(pdf_path).close()
        record_add_field(rec, 'FFT', subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
    except Exception:
        register_exception(alert_admin=True)
        if logger:  # BUG FIX: logger may be None
            logger.error("No PDF for paper: %s" % (doi,))
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def get_record(self, path=None, no_pdf=False,
               test=False, refextract_callback=None):
    """Convert a record to MARCXML format.

    :param path: path to a record.
    :type path: string
    :param no_pdf: when True, skip attaching PDF FFT entries.
    :type no_pdf: bool
    :param test: flag to determine if it is a test call.
    :type test: bool
    :param refextract_callback: callback to be used to extract
                                unstructured references. It should
                                return a marcxml formated string
                                of the reference.
    :type refextract_callback: callable

    :returns: marcxml formated string.
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date),
                                                ('t', 'published')])
    else:
        # No explicit publication date in the XML: fall back to today.
        record_add_field(rec, '260',
                         subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    license, license_url = self.get_license(xml_doc)
    if license and license_url:
        record_add_field(rec, '540', subfields=[('a', license),
                                                ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))
    authors = self.get_authors(xml_doc)
    first_author = True
    for author in authors:
        author_name = (author['surname'],
                       author.get('given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)
    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        # We add subjects also as author keywords
        subjects = xml_doc.getElementsByTagName('dct:subject')
        for subject in subjects:
            for listitem in subject.getElementsByTagName('rdf:li'):
                keyword = xml_to_text(listitem)
                if keyword not in keywords:
                    keywords.append(keyword)
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword),
                                        ('9', 'author')])
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except ValueError:
            # Non-numeric page numbers: skip the page count.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license:
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                    + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                # No license: attach the fulltext but keep it hidden.
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'Elsevier'),
                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'),
                                                ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        subfields = filter(lambda x: x[1] and x[1] != '-',
                           [('p', journal), ('v', volume), ('n', issue),
                            ('c', pages), ('y', year)])
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            from invenio.search_engine import perform_request_search
            # BUG FIX: the original query ended with an unbalanced quote
            # ('... AND NOT 980:DELETED"').
            query = '0247_a:"%s" AND NOT 980:DELETED' % (doi, )
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                from invenio.bibdocfile import BibRecDocs
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    pdf_path = prev_rec.get_bibdoc('main')
                    pdf_path = pdf_path.get_file(".pdf;pdfa",
                                                 exact_docformat=True)
                    pdf_path = pdf_path.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    message = ('Leaving previously delivered PDF/A for: '
                               + doi)
                    self.logger.info(message)
                except Exception:
                    # Best-effort: no previous PDF/A to carry over.
                    pass
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi, ))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                else:
                    if not old_pdf:
                        message = "Record " + doi
                        message += " doesn't contain PDF file."
                        self.logger.warning(message)
                        raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi, )
                register_exception(alert_admin=True, prefix=message)
        version = self.get_elsevier_version(find_package_name(path))
        record_add_field(rec, '583', subfields=[('l', version)])
        xml_path = join(path, 'main.xml')
        record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
        record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                ('b', 'Elsevier')])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        # A bad character in the XML breaks MARCXML serialisation.
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, f_path, publisher=None, collection=None, logger=None): #path = abspath(join(f_path, pardir)) xml = self.get_article(f_path) rec = create_record() title = self.get_title(xml) if title: record_add_field(rec, '245', subfields=[('a', title)]) publication_date = self.get_publication_date(xml) if publication_date: record_add_field(rec, '260', subfields=[('c', publication_date)]) journal, issn, volume, issue, first_page, pages, year, doi = self.get_publication_information(xml) if doi: record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')]) arxiv_id = self.get_arxiv_id(xml) if arxiv_id: record_add_field(rec, '037', subfields=[('a', arxiv_id), ('9', 'arXiv')]) if logger: logger.info("Creating record: %s %s" % (f_path, doi)) authors = self.get_authors(xml) first_author = True for author in authors: subfields = [('a', '%s, %s' % (author['surname'], author.get('given_name') or author.get('initials')))] if 'orcid' in author: subfields.append(('j', author['orcid'])) if 'affiliation' in author: for aff in author["affiliation"]: subfields.append(('v', aff)) if self.extract_nations: add_nations_field(subfields) if first_author: record_add_field(rec, '100', subfields=subfields) first_author = False else: record_add_field(rec, '700', subfields=subfields) abstract = self.get_abstract(xml) if abstract: record_add_field(rec, '520', subfields=[('a', abstract)]) record_add_field(rec, '540', subfields=[('a', 'CC-BY-4.0'), ('u', 'http://creativecommons.org/licenses/by/4.0/')]) copyright = self.get_copyright(xml) if copyright: record_add_field(rec, '542', subfields=[('f', copyright)]) keywords = self.get_keywords(xml) if keywords: for keyword in keywords: record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')]) record_add_field(rec, "300", subfields=[('a', pages)]) record_add_field(rec, '773', subfields=[('p', journal), ('v', volume), ('c', first_page), ('y', year)]) references = self.get_references(xml) for label, authors, 
doi, issue, page, title, volume, year in references: subfields = [] if doi: subfields.append(('a', doi)) for author in authors: subfields.append(('h', author)) if issue: subfields.append(('n', issue)) if label: subfields.append(('o', label)) if page: subfields.append(('p', page)) subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page))) if title: subfields.append(('t', title)) if volume: subfields.append(('v', volume)) if year: subfields.append(('y', year)) if subfields: record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields) folder_name = join('/', *(f_path.split('/')[0:-1])) pdf_name = f_path.split('/')[-1].rstrip('.xml.scoap') + '.pdf' pdf_path = join(folder_name, 'BodyRef/PDF', pdf_name) print pdf_path if exists(pdf_path): record_add_field(rec, 'FFT', subfields=[('a', pdf_path), ('n', 'main'), ('f', '.pdf;pdfa')]) else: # Don't know why it doesn't work???????????? # register_exception(alert_admin=True) if logger: logger.error("Record %s doesn't contain PDF file." % (doi,)) record_add_field(rec, 'FFT', subfields=[('a', self.get_body_ref(xml)), ('n', 'main')]) record_add_field(rec, '980', subfields=[('a', collection), ('b', publisher)]) return record_xml_output(rec)
def get_record(self, f_path, publisher=None, collection=None, logger=None): #path = abspath(join(f_path, pardir)) xml = self.get_article(f_path) rec = create_record() title = self.get_title(xml) if title: record_add_field(rec, '245', subfields=[('a', title)]) publication_date = self.get_publication_date(xml) if publication_date: record_add_field(rec, '260', subfields=[('c', publication_date)]) journal, issn, volume, issue, first_page, pages, year, doi = self.get_publication_information(xml) if doi: record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')]) arxiv_id = self.get_arxiv_id(xml) if arxiv_id: record_add_field(rec, '037', subfields=[('a', arxiv_id), ('9', 'arXiv')]) if logger: logger.info("Creating record: %s %s" % (f_path, doi)) authors = self.get_authors(xml) first_author = True for author in authors: subfields = [('a', '%s, %s' % (author['surname'], author.get('given_name') or author.get('initials')))] if 'orcid' in author: subfields.append(('j', author['orcid'])) if 'affiliation' in author: for aff in author["affiliation"]: subfields.append(('v', aff)) if self.extract_nations: add_nations_field(subfields) if first_author: record_add_field(rec, '100', subfields=subfields) first_author = False else: record_add_field(rec, '700', subfields=subfields) abstract = self.get_abstract(xml) if abstract: record_add_field(rec, '520', subfields=[('a', abstract)]) record_add_field(rec, '540', subfields=[('a', 'CC-BY-4.0'), ('u', 'http://creativecommons.org/licenses/by/4.0/')]) copyright = self.get_copyright(xml) if copyright: record_add_field(rec, '542', subfields=[('f', copyright)]) keywords = self.get_keywords(xml) if keywords: for keyword in keywords: record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')]) record_add_field(rec, "300", subfields=[('a', pages)]) subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal), ('v', volume), ('c', first_page), ('y', year)]) record_add_field(rec, '773', 
subfields=subfields) references = self.get_references(xml) for label, authors, doi, issue, page, title, volume, year in references: subfields = [] if doi: subfields.append(('a', doi)) for author in authors: subfields.append(('h', author)) if issue: subfields.append(('n', issue)) if label: subfields.append(('o', label)) if page: subfields.append(('p', page)) subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page))) if title: subfields.append(('t', title)) if volume: subfields.append(('v', volume)) if year: subfields.append(('y', year)) if subfields: record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields) folder_name = join('/', *(f_path.split('/')[0:-1])) pdf_name = f_path.split('/')[-1].rstrip('.xml.scoap') + '.pdf' pdf_path = join(folder_name, 'BodyRef/PDF', pdf_name) print pdf_path if exists(pdf_path): record_add_field(rec, 'FFT', subfields=[('a', pdf_path), ('n', 'main'), ('f', '.pdf;pdfa')]) else: # Don't know why it doesn't work???????????? # register_exception(alert_admin=True) if logger: logger.error("Record %s doesn't contain PDF file." % (doi,)) record_add_field(rec, 'FFT', subfields=[('a', self.get_body_ref(xml)), ('n', 'main')]) record_add_field(rec, '980', subfields=[('a', collection), ('b', publisher)]) return record_xml_output(rec)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build a MARCXML record from an Oxford/NLM article XML.

    Overrides the journal to "PTEP" and attaches both the regular and the
    archival (hires PDF/A) full texts when present on disk.

    :param f_path: absolute path of the article XML (``*.xml``).
    :param publisher: value for 520__9 and 980__b subfields.
    :param collection: value for the 980__a subfield.
    :param logger: optional logger; used for info/warning reporting.
    :return: the record serialized via ``record_xml_output``.
    """
    xml = super(NLMParser, self).get_article(f_path)
    rec = create_record()
    title = super(NLMParser, self).get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(rec, '260', subfields=[
        ('c', super(NLMParser, self).get_publication_date(xml, logger))
    ])
    journal, issn, volume, issue, first_page, last_page, year, doi = super(
        NLMParser, self).get_publication_information(xml)
    journal = "PTEP"  # Let's override the journal information
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
    page_count = super(NLMParser, self).get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    arxiv = self.get_arxiv_id(xml)
    if arxiv:
        record_add_field(rec, '037', subfields=[('9', 'arXiv'), ('a', format_arxiv_id(arxiv))])
    # First author goes to 100, the rest to 700; fall back to the collaboration
    # name when no surname is available.
    authors = super(NLMParser, self).get_authors(xml)
    first_author = True
    for author in authors:
        if author.get('surname'):
            subfields = [('a', '%s, %s' % (author.get('surname'),
                                           author.get('given_name') or author.get('initials', '')))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = super(NLMParser, self).get_abstract(xml)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract), ('9', publisher)])
    record_add_field(rec, '540', subfields=[
        ('a', 'CC-BY-3.0'),
        ('u', 'http://creativecommons.org/licenses/by/3.0/')
    ])
    copyright = super(NLMParser, self).get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])
    keywords = super(NLMParser, self).get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1', subfields=[('a', keyword), ('9', 'PACS')])
    ## Oxford is giving us bad keywords. Better ignore them.
    #if keywords['other']:
        #for keyword in keywords['other']:
            #record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        # No page range: fall back to the electronic location identifier.
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    # Drop empty / placeholder ('-') values before writing 773.
    subfields = filter(lambda x: x[1] and x[1] != '-',
                       [('p', journal), ('v', volume), ('n', issue), ('c', pages), ('y', year)])
    record_add_field(rec, '773', subfields=subfields)
    # get_references populates self.references as a side effect.
    self.get_references(xml)
    for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
        subfields = []
        if doi:
            subfields.append(('a', doi))
        for author in authors:
            subfields.append(('h', author))
        if issue:
            subfields.append(('n', issue))
        if label:
            subfields.append(('o', label))
        if year:
            subfields.append(('y', year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if title and volume and year and page:
            subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s' % (title, volume, year, page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
    f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')
    if exists(f_path_pdf):
        record_add_field(rec, 'FFT', subfields=[('a', f_path_pdf), ('n', 'main')])
    else:
        # raise-and-catch so register_exception records a traceback.
        try:
            raise MissingFFTError
        except:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF." % (doi,))
        # BUGFIX: logger defaults to None; the unguarded logger.warning()
        # raised AttributeError exactly when a PDF was missing.
        if logger:
            logger.warning("Record %s doesn't contain PDF file." % (doi,))
    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT', subfields=[('a', f_path_pdfa), ('n', 'main'), ('f', '.pdf;pdfa')])
    else:
        try:
            raise MissingFFTError
        except:
            register_exception(alert_admin=True,
                               prefix="Oxford paper: %s is missing PDF/A." % (doi,))
        # BUGFIX: same missing None-guard as above.
        if logger:
            logger.warning("Record %s doesn't contain PDF/A file." % (doi,))
    # Always attach the source XML itself.
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def check_records(records, empty=False): fields = ['100', '700'] #filepath = "/opt/invenio/var/data/files/g0/" #filepath2 = "/opt/invenio/var/data/files/g1/" filepath = '/opt/invenio/var/data/files/' filepaths = os.listdir(filepath) for record in records: first_author = True if is_elsevier(record): doc_ids = get_doc_ids(int(record.record_id)) for doc_id in doc_ids: # try: # latest_file = get_latest_file(filepath + str(doc_id) + '/') # except: # latest_file = get_latest_file(filepath2 + str(doc_id) + '/') latest_file = None for folder in filepaths: try: latest_file = get_latest_file(filepath + '/' + folder + '/' + str(doc_id) + '/') if latest_file: break except: print "No folder with name %s in %s directory" % ( doc_id, folder) try: xml = parse(latest_file) except: record.warn("Problem parssing XML file. Aborting") break authors = get_authors(xml) delete_fields(record, fields) for author in authors: field = '100' if first_author else '700' first_author = False subfields = [] author_name = (author['surname'], author.get('given_name') or author.get('initials')) author_name = ('a', '%s, %s' % author_name) subfields.append(author_name) if 'orcid' in author: subfields.append(('j', author['orcid'])) if 'affiliation' in author: for aff in author["affiliation"]: subfields.append(('v', aff)) add_nations_field(subfields) if author.get('email'): subfields.append(('m', author['email'])) record.add_field(field + '__', value='', subfields=subfields)
def get_record(self, f_path, publisher=None, collection=None, logger=None): # path = abspath(join(f_path, pardir)) xml = self.get_article(f_path) rec = create_record() title = self.get_title(xml) if title: record_add_field(rec, "245", subfields=[("a", title)]) publication_date = self.get_publication_date(xml) if publication_date: record_add_field(rec, "260", subfields=[("c", publication_date)]) journal, issn, volume, issue, first_page, pages, year, doi = self.get_publication_information(xml) if doi: record_add_field(rec, "024", ind1="7", subfields=[("a", doi), ("2", "DOI")]) arxiv_id = self.get_arxiv_id(xml) if arxiv_id: record_add_field(rec, "037", subfields=[("a", arxiv_id), ("9", "arXiv")]) if logger: logger.info("Creating record: %s %s" % (f_path, doi)) authors = self.get_authors(xml) first_author = True for author in authors: subfields = [("a", "%s, %s" % (author["surname"], author.get("given_name") or author.get("initials")))] if "orcid" in author: subfields.append(("j", author["orcid"])) if "affiliation" in author: for aff in author["affiliation"]: subfields.append(("v", aff)) if self.extract_nations: add_nations_field(subfields) if first_author: record_add_field(rec, "100", subfields=subfields) first_author = False else: record_add_field(rec, "700", subfields=subfields) abstract = self.get_abstract(xml) if abstract: record_add_field(rec, "520", subfields=[("a", abstract)]) record_add_field( rec, "540", subfields=[("a", "CC-BY-4.0"), ("u", "http://creativecommons.org/licenses/by/4.0/")] ) copyright = self.get_copyright(xml) if copyright: record_add_field(rec, "542", subfields=[("f", copyright)]) keywords = self.get_keywords(xml) if keywords: for keyword in keywords: record_add_field(rec, "653", ind1="1", subfields=[("a", keyword), ("9", "author")]) record_add_field(rec, "300", subfields=[("a", pages)]) subfields = filter( lambda x: x[1] and x[1] != "-", [("p", journal), ("v", volume), ("c", first_page), ("y", year)] ) record_add_field(rec, "773", 
subfields=subfields) references = self.get_references(xml) for label, authors, doi, issue, page, title, volume, year in references: subfields = [] if doi: subfields.append(("a", doi)) for author in authors: subfields.append(("h", author)) if issue: subfields.append(("n", issue)) if label: subfields.append(("o", label)) if page: subfields.append(("p", page)) subfields.append(("s", "%s %s (%s) %s" % (title, volume, year, page))) if title: subfields.append(("t", title)) if volume: subfields.append(("v", volume)) if year: subfields.append(("y", year)) if subfields: record_add_field(rec, "999", ind1="C", ind2="5", subfields=subfields) folder_name = join("/", *(f_path.split("/")[0:-1])) pdf_name = f_path.split("/")[-1].rstrip(".xml.scoap") + ".pdf" pdf_path = join(folder_name, "BodyRef/PDF", pdf_name) print pdf_path if exists(pdf_path): record_add_field(rec, "FFT", subfields=[("a", pdf_path), ("n", "main"), ("f", ".pdf;pdfa")]) else: # Don't know why it doesn't work???????????? # register_exception(alert_admin=True) if logger: logger.error("Record %s doesn't contain PDF file." % (doi,)) record_add_field(rec, "FFT", subfields=[("a", self.get_body_ref(xml)), ("n", "main")]) record_add_field(rec, "980", subfields=[("a", collection), ("b", publisher)]) return record_xml_output(rec)