def _add_references(self, rec):
    """Add one 999 C5 field to ``rec`` for every reference of the article.

    Each tuple yielded by ``self._get_references()`` is turned into a list
    of MARC subfields. When a raw reference string is available it is run
    through ``extract_references_from_string_xml`` and every subfield of the
    first returned datafield is copied over, followed by a ``9:refextract``
    marker.

    :param rec: record structure the fields are appended to.
    """
    for (label, ref_type, text_ref, ext_link, authors,
         year, source, volume, page) in self._get_references():
        subfields = []
        if label:
            subfields.append(("o", label))
        if text_ref:
            # Let refextract parse the raw citation and copy what it found.
            parsed = parseString(extract_references_from_string_xml(text_ref))
            datafield = parsed.getElementsByTagName("datafield")[0]
            subfields.extend(
                (node.getAttribute("code"), node.firstChild.data)
                for node in datafield.getElementsByTagName("subfield")
            )
            subfields.append(("9", "refextract"))
        if ref_type:
            subfields.append(("d", ref_type))
        if text_ref:
            subfields.append(("m", text_ref))
        if ext_link:
            subfields.append(("u", ext_link))
        subfields.extend(("h", author) for author in authors)
        if year:
            subfields.append(("y", year))
        if source:
            # "source[,volume][,page]" -- missing parts are simply left out.
            pubnote = [part for part in (source, volume, page) if part]
            subfields.append(("s", ",".join(pubnote)))
        record_add_field(rec, "999", ind1="C", ind2="5", subfields=subfields)
def test_record_add_field_fallback(self):
    """Check the XML output for a subfield value wrapped in angle brackets."""
    record = create_record()
    record_add_field(record, "035", subfields=[("a", "<arXiv:1234.1242>")])
    expected = (
        u'<record><datafield ind1="" ind2="" tag="035">'
        u'<subfield code="a">'
        u'<arXiv:1234.1242></subfield></datafield></record>'
    )
    actual = record_xml_output(record, pretty=False)
    self.assertEqual(actual, expected)
def test_record_add_field_fallback(self):
    """Check the XML output for a subfield value wrapped in angle brackets."""
    record = create_record()
    record_add_field(record, "035", subfields=[("a", "<arXiv:1234.1242>")])
    expected = (
        u'<record><datafield ind1="" ind2="" tag="035">'
        u'<subfield code="a">'
        u'<arXiv:1234.1242></subfield></datafield></record>'
    )
    actual = record_xml_output(record, pretty=False)
    self.assertEqual(actual, expected)
def test_record_add_field_with_special_content(self):
    """Check the XML output for a subfield value containing a lone ``<``."""
    record = create_record()
    record_add_field(record, "035", subfields=[("a", "4.0<as 123")])
    expected = (
        u'<record><datafield ind1="" ind2="" tag="035">'
        u'<subfield code="a">'
        u'4.0<as 123</subfield></datafield></record>'
    )
    actual = record_xml_output(record, pretty=False)
    self.assertEqual(actual, expected)
def get_pdfa_record(self, path=None):
    """Build the MARCXML used to attach a PDF (PDF/A if present) to a record.

    The DOI is read from the article found under ``path`` and searched for
    among non-deleted records. A match is referenced through its ``001``
    control field; otherwise the DOI is written to ``024`` and an error is
    logged. The PDF in ``path`` is attached as ``FFT``; for an existing
    record every latest file whose format is not ``.pdf;pdfa`` is attached
    as well.

    :param path: directory holding the article and its PDF files.
    :returns: the record serialised by ``record_xml_output``.
    """
    from invenio.search_engine import perform_request_search

    xml_doc = self.get_article(path)
    rec = create_record()
    # Nine values come back; only the last one (the DOI) is used here.
    (_unused, _unused, _unused, _unused, _unused, _unused, _unused,
     _unused, doi) = self.get_publication_information(xml_doc)

    query = '0247_a:"%s" AND NOT 980:"DELETED"' % (doi,)
    recid = perform_request_search(p=query)
    if not recid:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
        message = ('Adding PDF/A. No paper with this DOI: '
                   '%s. Trying to add it anyway.') % (doi,)
        self.logger.error(message)
    else:
        record_add_field(rec, '001', controlfield_value=recid[0])

    try:
        pdfa_path = join(path, 'main_a-2b.pdf')
        plain_path = join(path, 'main.pdf')
        if exists(pdfa_path):
            record_add_field(rec, 'FFT',
                             subfields=[('a', pdfa_path),
                                        ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
            self.logger.debug('Adding PDF/A to record: %s' % (doi,))
        elif exists(plain_path):
            record_add_field(rec, 'FFT', subfields=[('a', plain_path)])
            message = 'No PDF/A in VTEX package for record: ' + doi
            self.logger.debug(message)
        else:
            message = "Record %s doesn't contain PDF file." % (doi,)
            raise MissingFFTError(message)
    except MissingFFTError:
        # Raised just above so that register_exception has a live exception.
        message = "Elsevier paper: %s is missing PDF." % (doi,)
        register_exception(alert_admin=True, prefix=message)
        self.logger.warning(message)

    # Copy the other formats of an existing record to the bibupload file.
    if recid:
        from invenio.bibdocfile import BibRecDocs
        for bibfile in BibRecDocs(recid[0]).list_latest_files():
            if bibfile.get_format() == '.pdf;pdfa':
                continue
            record_add_field(rec, 'FFT',
                             subfields=[('a', bibfile.get_full_path()),
                                        ('n', bibfile.get_name()),
                                        ('f', bibfile.get_format())])
    return record_xml_output(rec)
def _add_authors(self, rec):
    """Add the article authors to ``rec``: first as ``100``, others as ``700``.

    Every entry of ``self._get_authors()`` is indexed as
    ``(name, affiliations, emails)``; affiliations become ``v`` subfields
    and e-mail addresses become ``m`` subfields.

    :param rec: record structure the fields are appended to.
    """
    for position, author in enumerate(self._get_authors()):
        subfields = [('a', author[0])]
        if author[1]:
            subfields.extend(('v', aff) for aff in author[1])
        if author[2]:
            subfields.extend(('m', email) for email in author[2])
        tag = '100' if position == 0 else '700'
        record_add_field(rec, tag, subfields=subfields)
def _attach_fulltext(self, rec, doi):
    """Link and, when Invenio is available, download the article PDFs.

    The DOI is resolved through dx.doi.org and the landing page is scanned
    for anchors whose ``href`` ends in ``pdf``. Each one is added to ``rec``
    as an ``856`` link. If the Invenio download helpers can be imported, the
    PDF is also downloaded and attached as an ``FFT`` field.

    :param rec: record structure the fields are appended to.
    :param doi: DOI of the article; also used to pick the page layout.
    """
    # Local import: only this function needs it.
    from os.path import isdir

    url = "http://dx.doi.org/" + doi
    page = requests.get(url)
    # URL after the DOI redirect; relative PDF links are resolved against it.
    url = page.url
    page = page.text
    parsed_uri = urlparse(url)
    domain = "{uri.scheme}://{uri.netloc}".format(uri=parsed_uri)
    page = BeautifulSoup(page)
    try:
        if "epjconf" in doi:
            div = page.body.find("div", attrs={"id": "header"})
        else:
            div = page.body.find("div", attrs={"class": "module_background files"})
        links = div.findAll("a")
    except AttributeError:
        # No body or no matching container: nothing to attach.
        return
    for pdf in links:
        # Anchors without an href are skipped instead of raising KeyError.
        href = pdf.get("href") or ""
        if not href.endswith("pdf"):
            continue
        link_to_pdf = domain + href
        record_add_field(rec, "856", ind1="4",
                         subfields=[("u", link_to_pdf),
                                    ("y", "EDP Sciences server")])
        try:
            from invenio.filedownloadutils import download_url, InvenioFileDownloadError
            from invenio.config import CFG_EDPSCIENCE_OUT_FOLDER
        except ImportError:
            # Running without Invenio: keep the link, skip the download.
            continue
        try:
            out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, "fulltexts")
            try:
                # makedirs raises when the folder already exists, which used
                # to send every download after the first run to a temp file.
                if not isdir(out_folder):
                    makedirs(out_folder)
                filename = join(out_folder, link_to_pdf.split("/")[-1])
            except (IOError, OSError):
                # Problem creating folder; download without a target name.
                filename = None
            filename = download_url(
                link_to_pdf,
                content_type="pdf",
                download_to_file=filename,
                retry_count=5,
                timeout=60.0,
            )
            record_add_field(rec, "FFT",
                             subfields=[("a", filename),
                                        ("t", "INSPIRE-PUBLIC"),
                                        ("d", "Fulltext")])
        except InvenioFileDownloadError as e:
            print(e)
def _attach_fulltext(self, rec, doi):
    """Link and download the article PDFs found on the publisher page.

    The DOI is resolved through dx.doi.org and the landing page is scanned
    for anchors whose ``href`` ends in ``pdf``. Each one is added to ``rec``
    as an ``856`` link, downloaded with ``download_file`` and attached as an
    ``FFT`` field.

    :param rec: record structure the fields are appended to.
    :param doi: DOI of the article; also used to pick the page layout.
    """
    # Local import: only this function needs it.
    from os.path import isdir

    url = 'http://dx.doi.org/' + doi
    page = requests.get(url)
    # URL after the DOI redirect; relative PDF links are resolved against it.
    url = page.url
    page = page.text
    parsed_uri = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    page = BeautifulSoup(page)
    try:
        if 'epjconf' in doi:
            div = page.body.find('div', attrs={'id': 'header'})
        else:
            div = page.body.find('div', attrs={
                'class': 'module_background files'
            })
        links = div.findAll('a')
    except AttributeError:
        # No body or no matching container: nothing to attach.
        return
    for pdf in links:
        # Anchors without an href are skipped instead of raising KeyError.
        href = pdf.get('href') or ''
        if not href.endswith('pdf'):
            continue
        link_to_pdf = domain + href
        record_add_field(rec, '856', ind1='4',
                         subfields=[('u', link_to_pdf),
                                    ('y', 'EDP Sciences server')])
        out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, "fulltexts")
        try:
            # makedirs raises when the folder already exists, which used to
            # leave filename as None for every download after the first run.
            if not isdir(out_folder):
                makedirs(out_folder)
            filename = join(out_folder, link_to_pdf.split('/')[-1])
        except (IOError, OSError):
            # Problem creating folder; download without a target name.
            filename = None
        filename = download_file(from_url=link_to_pdf,
                                 to_filename=filename,
                                 retry_count=5)
        record_add_field(rec, 'FFT',
                         subfields=[('a', filename),
                                    ('t', 'INSPIRE-PUBLIC'),
                                    ('d', 'Fulltext')])
def test_record_add_field(self):
    """Check that an abstract with embedded MathML is output unchanged."""
    abstract = (
        u'In this paper we continue the study of Q -operators in'
        u' the six-vertex model and its higher spin generalizations.'
        u' In [1] we derived a new expression for the higher spin R'
        u' -matrix associated with the affine quantum algebra '
        u'<math xmlns="http://www.w3.org/1998/Math/MathML" altimg="si1.gif">'
        u'<msub><mrow><mi>U</mi></mrow><mrow><mi>q</mi></mrow></msub>'
        u'<mo stretchy="false">(</mo><mover accent="true"><mrow><mrow>'
        u'<mi mathvariant="italic">sl</mi></mrow><mo stretchy="false">'
        u'(</mo><mn>2</mn><mo stretchy="false">)</mo></mrow><mrow><mo>'
        u'^</mo></mrow></mover><mo stretchy="false">)</mo></math>'
        u' . Taking a special limit in this R -matrix we obtained new'
        u' formulas for the Q -operators acting in the tensor product'
        u' of representation spaces with arbitrary complex spin.'
    )
    record = create_record()
    record_add_field(record, '520', subfields=[('a', abstract)])
    opening = (u'<record><datafield ind1="" ind2="" tag="520">'
               u'<subfield code="a">')
    closing = u'</subfield></datafield></record>'
    expected = opening + abstract + closing
    self.assertEqual(record_xml_output(record, pretty=False), expected)
def _add_authors(self, rec):
    """Add the article authors to ``rec``: first as ``100``, others as ``700``.

    Every author entry is indexed as ``(name, affiliation keys, note keys)``.
    Affiliation keys are looked up in ``self._get_affiliations()`` and
    written as ``v`` subfields; note keys are looked up in
    ``self._get_author_emails()`` and every non-empty e-mail found there is
    written as an ``m`` subfield.

    :param rec: record structure the fields are appended to.
    """
    authors = self._get_authors()
    affiliations = self._get_affiliations()
    author_emails = self._get_author_emails()
    for position, author in enumerate(authors):
        subfields = [('a', author[0])]
        if author[1]:
            subfields.extend(('v', affiliations[aff]) for aff in author[1])
        if author[2]:
            for note in author[2]:
                subfields.extend(
                    ('m', email) for email in author_emails[note] if email
                )
        tag = '100' if position == 0 else '700'
        record_add_field(rec, tag, subfields=subfields)
def test_record_add_field(self):
    """Check that an abstract with embedded MathML is output unchanged."""
    abstract = (
        u'In this paper we continue the study of Q -operators in'
        u' the six-vertex model and its higher spin generalizations.'
        u' In [1] we derived a new expression for the higher spin R'
        u' -matrix associated with the affine quantum algebra '
        u'<math xmlns="http://www.w3.org/1998/Math/MathML" altimg="si1.gif">'
        u'<msub><mrow><mi>U</mi></mrow><mrow><mi>q</mi></mrow></msub>'
        u'<mo stretchy="false">(</mo><mover accent="true"><mrow><mrow>'
        u'<mi mathvariant="italic">sl</mi></mrow><mo stretchy="false">'
        u'(</mo><mn>2</mn><mo stretchy="false">)</mo></mrow><mrow><mo>'
        u'^</mo></mrow></mover><mo stretchy="false">)</mo></math>'
        u' . Taking a special limit in this R -matrix we obtained new'
        u' formulas for the Q -operators acting in the tensor product'
        u' of representation spaces with arbitrary complex spin.'
    )
    record = create_record()
    record_add_field(record, '520', subfields=[('a', abstract)])
    opening = (u'<record><datafield ind1="" ind2="" tag="520">'
               u'<subfield code="a">')
    closing = u'</subfield></datafield></record>'
    expected = opening + abstract + closing
    self.assertEqual(record_xml_output(record, pretty=False), expected)
def convert_record(record, response_date, request):
    """Convert one harvested OAI record element into MARCXML.

    A provenance ``035`` field is built from the OAI header, then every
    ``datafield`` of the harvested record is copied over. A record whose
    header status is ``deleted`` yields either nothing (it was never known
    locally) or a stub carrying only the ``980`` deletion marker.

    :param record: DOM element of the OAI record.
    :param response_date: value stored in subfield ``h`` of the ``035`` field.
    :param request: value stored in subfield ``u`` of the ``035`` field.
    :returns: ``(marcxml or None, flag)``; the flag is True when no record
        with this OAI identifier was found locally.
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')

    rec = create_record()
    provenance = [('a', oai_identifier),
                  ('u', request),
                  ('9', 'Hindawi'),
                  ('d', datestamp),
                  ('h', response_date),
                  ('m', 'marc21'),
                  ('t', 'false')]
    record_add_field(rec, tag="035", subfields=provenance)

    new = not find_records_from_extoaiid(oai_identifier, 'Hindawi')

    if status == 'deleted':
        if new:
            # Deleting a record we never had: nothing to upload.
            return None, True
        record_add_field(rec, tag="980",
                         subfields=[('a', 'SCOAP3'),
                                    ('b', 'Hindawi'),
                                    ('c', 'DELETED')])
        return record_xml_output(rec), False

    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        # Empty indicators are stored as a single blank.
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = [
            (node.getAttribute("code").encode('utf-8'), xml_to_text(node))
            for node in datafield.getElementsByTagName("subfield")
        ]
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2,
                         subfields=subfields)
    return record_xml_output(rec), new
def _add_references(self, rec, ref_extract_callback=None):
    """Add one 999 C5 field to ``rec`` for every reference of the article.

    When ``ref_extract_callback`` is given, the raw reference text is passed
    to it and every subfield of the first returned datafield is copied,
    followed by a ``9:refextract`` marker if anything was extracted.
    Without a callback the raw text is stored as subfield ``m``.

    :param rec: record structure the fields are appended to.
    :param ref_extract_callback: optional callable taking the raw reference
        string and returning MARCXML as a string.
    """
    for (label, ref_type, text_ref, ext_link, authors,
         year, source, volume, page) in self._get_references():
        subfields = []
        if label:
            subfields.append(('o', label))
        if text_ref and ref_extract_callback:
            parsed = parseString(ref_extract_callback(text_ref))
            datafield = parsed.getElementsByTagName("datafield")[0]
            extracted = datafield.getElementsByTagName("subfield")
            subfields.extend(
                (node.getAttribute("code"), node.firstChild.data)
                for node in extracted
            )
            if extracted:
                subfields.append(('9', 'refextract'))
        elif text_ref:
            # No extractor available: keep the raw citation text.
            subfields.append(('m', text_ref))
        if ref_type:
            subfields.append(('d', ref_type))
        if ext_link:
            subfields.append(('u', ext_link))
        subfields.extend(('h', author) for author in authors)
        if year:
            subfields.append(('y', year))
        if source:
            # "source[,volume][,page]" -- missing parts are simply left out.
            pubnote = [part for part in (source, volume, page) if part]
            subfields.append(('s', ",".join(pubnote)))
        record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
def get_record_rich(self, filename):
    """Build the MARCXML of one file from the xaml_rich directory.

    The file is parsed into ``self.document``; bibliographic data, authors
    with affiliations, references and the publication note are read from it
    and added to a fresh record.

    :param filename: the name of the file to parse.
    :type filename: string

    :returns: a string with the MARCXML version of the file, or an empty
        string when an ``ArticleID`` element is not of type ``Article`` or
        when serialising the record raises ``UnicodeDecodeError``.
    """
    self.document = parse(filename)
    rec = create_record()
    articles = self.document.getElementsByTagName("ArticleID")
    # NOTE(review): all fields are added inside this loop, so a document
    # with several ArticleID elements would add them once per element --
    # presumably there is exactly one; confirm.
    for article in articles:
        article_type = article.getAttribute("Type")
        if not article_type == "Article":
            return ""
        doi = get_value_in_tag(self.document, "DOI")
        # Prefer the acceptance date; fall back to the online date below.
        date = ""
        for tag in self.document.getElementsByTagName("Accepted"):
            year = get_value_in_tag(tag, "Year")
            month = get_value_in_tag(tag, "Month").zfill(2)
            day = get_value_in_tag(tag, "Day").zfill(2)
            date = "%s-%s-%s" % (year, month, day)
        if not date:
            for tag in self.document.getElementsByTagName("OnlineDate"):
                year = get_value_in_tag(tag, "Year")
                month = get_value_in_tag(tag, "Month").zfill(2)
                day = get_value_in_tag(tag, "Day").zfill(2)
                date = "%s-%s-%s" % (year, month, day)
        first_page = get_value_in_tag(article, "FirstPage")
        last_page = get_value_in_tag(article, "LastPage")
        subjects = article.getElementsByTagName("Keyword")
        subjects = map(xml_to_text, subjects)
        subject = ", ".join(subjects)
        copyright_statement = get_value_in_tag(article, "Copyright")
        journal = get_value_in_tag(self.document, "JournalTitle")
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        issues = self.document.getElementsByTagName("IssueID")
        for issue in issues:
            volume += get_value_in_tag(issue, "Volume")
            # Overwrites the year taken from the date elements above.
            # NOTE(review): if neither a date element nor an IssueID is
            # present, ``year`` is never bound and ``if year:`` further down
            # would raise NameError -- confirm the inputs guarantee one.
            year = get_value_in_tag(issue, "Year")
        title = get_value_in_tag(self.document, "Title")
        authors = self.document.getElementsByTagName("Author")
        affiliations = self.document.getElementsByTagName("Affiliation")

        def affiliation_pair(a):
            # (affiliation ID, free-text affiliation) for the lookup dict.
            return a.getAttribute("ID"), get_value_in_tag(a, "UnstructuredAffiliation")

        affiliations = map(affiliation_pair, affiliations)
        affiliations = dict(affiliations)

        def author_pair(a):
            # ("Surname, First [Middle]", affiliation text or "").
            surname = get_value_in_tag(a, "LastName")
            first_name = get_value_in_tag(a, "FirstName")
            middle_name = get_value_in_tag(a, "MiddleName")
            if middle_name:
                name = "%s, %s %s" % (surname, first_name, middle_name)
            else:
                name = "%s, %s" % (surname, first_name)
            try:
                affid = a.getElementsByTagName("AffiliationID")[0].getAttribute("Label")
                affiliation = affiliations[affid]
            except IndexError:
                # The author has no AffiliationID element.
                affiliation = ""
            except KeyError:
                # The label does not match any known affiliation ID.
                affiliation = ""
            return name, affiliation

        # NOTE(review): ``authors`` is iterated once below; under Python 3
        # ``map`` is lazy, which still works for a single pass.
        authors = map(author_pair, authors)
        abstract = get_value_in_tag(self.document, "Abstract")
        references = self.document.getElementsByTagName("Bibliomixed")
        for reference in references:
            subfields = []
            label = reference.getAttribute("N")
            if label:
                subfields.append(("o", label))
            bibliosets = reference.getElementsByTagName("Biblioset")
            for tag in bibliosets:
                ref_year = get_value_in_tag(tag, "Date")
                ref_journal = get_value_in_tag(tag, "JournalShortTitle")
                ref_journal, ref_volume = fix_journal_name(ref_journal, self.journal_mappings)
                ref_volume += get_value_in_tag(tag, "Volume")
                ref_page = get_value_in_tag(tag, "ArtPageNums")
                if ref_year:
                    subfields.append(("y", ref_year))
                if ref_journal and ref_volume and ref_page:
                    subfields.append(("s", "%s,%s,%s" % (ref_journal, ref_volume, ref_page)))
                # Drop the structured part so that only the remaining text
                # of the reference is passed to refextract below.
                reference.removeChild(tag)
            text_ref = xml_to_text(reference)
            ref_xml = extract_references_from_string_xml(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(("9", "refextract"))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                # The extracted "m" subfield is skipped when the reference
                # had structured Biblioset data.
                if code == "m" and bibliosets:
                    continue
                else:
                    subfields.append((code, data))
            if subfields:
                record_add_field(rec, "999", ind1="C", ind2="5", subfields=subfields)
        if title:
            record_add_field(rec, "245", subfields=[("a", title)])
        if date:
            record_add_field(rec, "260", subfields=[("c", date), ("t", "published")])
        if doi:
            record_add_field(rec, "024", ind1="7", subfields=[("a", doi), ("2", "DOI")])
        if abstract:
            record_add_field(rec, "520", subfields=[("a", abstract), ("9", "EDPSciences")])
        # First author goes to 100, all others to 700.
        first_author = True
        for author in authors:
            if first_author:
                subfields = [("a", author[0])]
                if author[1]:
                    subfields.append(("v", author[1]))
                record_add_field(rec, "100", subfields=subfields)
                first_author = False
            else:
                subfields = [("a", author[0])]
                if author[1]:
                    subfields.append(("v", author[1]))
                record_add_field(rec, "700", subfields=subfields)
        # Publication note (773).
        subfields = []
        if journal and volume and first_page:
            subfields.append(("s", "%s,%s,%s" % (journal, volume, first_page)))
        if first_page and last_page:
            try:
                # NOTE(review): last - first gives 9 for pages 1-10; an
                # inclusive page count would need +1 -- confirm intent.
                nuber_of_pages = int(last_page) - int(first_page)
                record_add_field(rec, "300", subfields=[("a", str(nuber_of_pages))])
            except ValueError:
                # Non-numeric page labels: no page count field.
                pass
            subfields.append(("c", "%s-%s" % (first_page, last_page)))
        if year:
            subfields.append(("y", year))
        record_add_field(rec, "773", subfields=subfields)
        record_add_field(rec, "980", subfields=[("a", "HEP")])
        if copyright_statement:
            record_add_field(rec, "542", subfields=[("f", copyright_statement)])
        if subject:
            record_add_field(rec, "650", ind1="1", ind2="7", subfields=[("2", "EDPSciences"), ("a", subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML record for the article stored at ``f_path``.

    Title, date, DOI, authors, page count, abstract, licence, copyright,
    keywords, publication note and references are read from the article XML
    and added to a new record. The matching PDF under ``BodyRef/PDF`` is
    attached as ``FFT`` when it can be opened, and the XML file itself is
    always attached.

    :param f_path: path of the article XML file (name ends in ``_nlm.xml``).
    :param publisher: stored in ``520__9`` and, when truthy, in ``980__b``.
    :param collection: stored in ``980__a`` when truthy.
    :param logger: optional logger; nothing is logged when it is ``None``.
    :returns: the record serialised by ``record_xml_output``.
    """
    xml = self.get_article(f_path)
    rec = create_record()
    title = self.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(rec, '260',
                     subfields=[('c', self.get_publication_date(xml, logger))])
    journal, issn, volume, issue, first_page, last_page, year, doi = \
        self.get_publication_information(xml)
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    # First author goes to 100, all others to 700.
    authors = self.get_authors(xml)
    first_author = True
    for author in authors:
        if author.get('surname'):
            subfields = [('a', '%s, %s' % (
                author.get('surname'),
                author.get('given_name') or author.get('initials', '')))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)

    page_count = self.get_page_count(xml)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    abstract = self.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])
    record_add_field(rec, '540',
                     subfields=[('a', 'CC-BY-3.0'),
                                ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright = self.get_copyright(xml, logger)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])

    keywords = self.get_keywords(xml)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    if keywords['other']:
        for keyword in keywords['other']:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])

    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        # No page numbers: fall back to the electronic location id.
        article_meta = xml.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    record_add_field(rec, '773',
                     subfields=[('p', journal), ('v', volume), ('n', issue),
                                ('c', pages), ('y', year)])

    self.get_references(xml)
    # Loop variables carry a ref_ prefix so that they do not overwrite the
    # article-level ``doi`` that is logged further down.
    for (label, ref_authors, ref_doi, ref_issue, ref_page, page_last,
         ref_title, ref_volume, ref_year, ext_link,
         plain_text) in self.references:
        subfields = []
        if ref_doi:
            subfields.append(('a', ref_doi))
        for ref_author in ref_authors:
            subfields.append(('h', ref_author))
        if ref_issue:
            subfields.append(('n', ref_issue))
        if label:
            subfields.append(('o', label))
        if ref_year:
            subfields.append(('y', ref_year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if ref_title and ref_volume and ref_year and ref_page:
            subfields.append(('s', '%s %s (%s) %s' % (
                ref_title, ref_volume, ref_year, ref_page)))
        elif not plain_text:
            subfields.append(('m', ('%s %s %s %s' % (
                ref_title, ref_volume, ref_year, ref_page))))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    pdf_path = join(dirname(f_path), 'BodyRef', 'PDF',
                    basename(f_path)[:-len('_nlm.xml')] + '.pdf')
    try:
        # Opening the file is the readability check; the handle is closed
        # again right away.
        with open(pdf_path):
            pass
    except (IOError, OSError):
        register_exception(alert_admin=True)
        if logger:
            logger.error("No PDF for paper: %s" % (doi,))
    else:
        record_add_field(rec, 'FFT',
                         subfields=[('a', pdf_path),
                                    ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    # The XML source itself is always attached.
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])

    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def _get_record(self, link):
    """Scrape one chapter page and return it as a MARCXML ``record`` node.

    The page behind the first anchor of ``link`` is fetched relative to
    ``self.base_url`` and its ``div#content`` is stored in ``self.content``.
    Bibliographic values are read from it, a record is built and serialised,
    and the references extracted with refextract are appended to the
    re-parsed DOM.

    :param link: element whose first ``a`` child carries the page ``href``.
    :returns: the first child of the parsed record document.
    """
    link = link.find('a')['href']
    url = urlparse.urljoin(self.base_url, link)
    page = urllib2.urlopen(url)
    page = BeautifulSoup(page)
    self.content = page.body.find('div', attrs={'id': 'content'})
    publication_title = self.content.find('div', {'id': 'publication-title'})
    if publication_title:
        publication_title = publication_title.find('a').text
    else:
        publication_title = ''
    series_title = self._find('a', {'id': 'series-title'})
    # Abbreviated series name used in the 773 field below.
    if series_title == 'NATO Science Series':
        series_title = 'NATO Sci.Ser.'
    title = self._find('h1', {'id': 'title'})
    if not title:
        title = self._find('h1', {'class': 'ChapterTitle'})
    # Keep only the digits of the volume.
    # NOTE(review): re.sub would raise TypeError if self._find can return
    # None -- its return value for a missing element is not visible here.
    volume = self._find('span', {'id': 'book-volume'})
    if volume:
        volume = re.sub(r'\D', '', volume)
    else:
        volume = self._find('span', {'id': 'volume-range'})
        volume = re.sub(r'\D', '', volume)
    issue = self._find('a', {'id': 'issue-range'})
    if issue:
        # Second whitespace-separated token of the issue text.
        issue = issue.split()[1]
    # The year is looked up in three places, keeping digits only.
    year = self._find('span', {'id': 'copyright-year'})
    if not year:
        year = self._find(
            'dd', {'id': 'abstract-about-book-chapter-copyright-year'})
    year = re.sub(r'\D', '', year)
    if not year:
        year = self._find('dd', {'id': 'abstract-about-cover-date'})
        # First four digits of the cover date.
        year = re.sub(r'\D', '', year)[:4]
    abstract = self._find('div', {'class': 'abstract-content formatted'})
    page_range = self._find('span', {'id': 'page-range'})
    if not page_range:
        page_range = self._find(
            'dd', {'id': 'abstract-about-book-chapter-page-ranges'})
    if page_range:
        page_range = page_range.replace('pp', '').strip()
    #publisher = self._find('dd', {'id': 'abstract-about-publisher'})
    copyright_holder = self._find(
        'dd', {'id': 'abstract-about-book-copyright-holder'})
    #issn = self._find('dd', {'id': 'abstract-about-book-series-print-issn'})
    doi = self._find('dd', {'class': 'doi'})
    #subtitle = self._find('dd', {'id': 'abstract-about-book-series-subtitle'})
    #online_isbn = self._find('dd', {'id': 'abstract-about-book-online-isbn'})
    #print_isbn = self._find('dd', {'id': 'abstract-about-book-print-isbn'})
    # NOTE(review): editors and their affiliations are collected but not
    # used anywhere in the visible part of this function.
    editors = []
    editors_affiliations = []
    for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
        editors.append(editor.find('a').text)
        try:
            editors_affiliations.append(editor.find('sup')['title'])
        except KeyError:
            # The <sup> element has no title attribute.
            editors_affiliations.append('')
        except TypeError:
            # No <sup> element at all (find returned None).
            editors_affiliations.append('')
    authors = []
    authors_affiliations = []
    summary = self.content.find('div', attrs={'class': 'summary'})
    for author in summary.findAll('li', attrs={'itemprop': 'author'}):
        author_name = author.find('a').text
        # Reorder "First Middle Last" into "Last, First Middle".
        author_names = []
        author_names.append(author_name.split()[-1] + ",")
        author_names += author_name.split()[:-1]
        author_name = " ".join(author_names)
        author_name = collapse_initials(author_name)
        authors.append(author_name)
        try:
            authors_affiliations.append(author.find('sup')['title'])
        except KeyError:
            # The <sup> element has no title attribute.
            authors_affiliations.append('')
        except TypeError:
            # No <sup> element at all (find returned None).
            authors_affiliations.append('')
    try:
        attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
        fulltext = self.content.find('a', attrs=attrs)
        fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
    except TypeError:
        # No download link on the page.
        fulltext = ''
    # Create the MARC record.
    rec = create_record()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
    # First author goes to 100, all others to 700.
    first_author = True
    for i in range(len(authors)):
        subfields = [('a', '%s' % (authors[i]))]
        if authors_affiliations[i]:
            subfields.append(('v', authors_affiliations[i]))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract), ('9', 'Springer')])
    if copyright_holder:
        record_add_field(rec, '542', subfields=[('f', copyright_holder), ('g', year)])
    # Fall back to the publication title when the page names no series.
    if not series_title:
        series_title = publication_title
    subfields = []
    if series_title:
        subfields.append(('p', series_title))
    if volume:
        subfields.append(('v', volume))
    if issue:
        subfields.append(('n', issue))
    if page_range:
        subfields.append(('c', page_range))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'BookChapter')])
    if fulltext:
        record_add_field(rec, 'FFT', subfields=[('a', fulltext), ('t', 'Springer'), ('d', 'Fulltext')])
    # Serialise the record and re-parse it as a DOM so that reference nodes
    # can be appended to it below.
    recordString = record_xml_output(rec)
    # Remove whitespace other than spaces.
    recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
    # Remove runs of two or more consecutive spaces.
    recordString = re.sub(r' {2,}', '', recordString)
    record = parseString(recordString)
    references = []
    ref_fields = []
    references_container = self.content.find(
        'div', attrs={'id': 'abstract-references'})
    if references_container:
        references = references_container.findAll('li')
        for reference in references:
            try:
                from invenio.refextract_api import (
                    extract_references_from_string_xml)
                ref = xml_to_text(parseString(reference.decode()))
                # Remove the space between "hep-th/" and the identifier.
                ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
                ref = extract_references_from_string_xml(ref)
                ref = parseString(ref)
                for field in ref.childNodes:
                    for subfield in field.getElementsByTagName('subfield'):
                        if subfield.getAttribute('code') == 'm':
                            # Clean the free-text part of the reference.
                            text = subfield.firstChild.data
                            text = re.sub(r'\[?arXiv:', '', text)
                            text = text.replace('CrossRef', '')
                            if text.startswith(': '):
                                text = text[2:]
                            if text:
                                subfield.firstChild.data = text
                            else:
                                # Nothing left after cleaning: drop it.
                                parentNode = subfield.parentNode
                                parentNode.removeChild(subfield)
                    ref_fields.append(field.firstChild)
            except ImportError:
                # NOTE(review): this adds the raw reference to ``rec``, but
                # ``rec`` was already serialised into ``record`` above, so
                # the field does not appear to reach the returned node --
                # confirm.
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=[('m', reference.decode())])
    for field in ref_fields:
        record.firstChild.appendChild(field)
    return record.firstChild
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML record for the article stored at ``f_path``.

    Title, date, DOI, arXiv id, authors, abstract, licence, copyright,
    keywords, page count, publication note and references are read from the
    article XML and added to a new record. The matching PDF under
    ``BodyRef/PDF`` is attached as ``FFT`` when it exists, and the body
    reference returned by ``self.get_body_ref`` is always attached.

    :param f_path: path of the article XML file (``*.xml.scoap``).
    :param publisher: stored in ``980__b``.
    :param collection: stored in ``980__a``.
    :param logger: optional logger; nothing is logged when it is ``None``.
    :returns: the record serialised by ``record_xml_output``.
    """
    xml = self.get_article(f_path)
    rec = create_record()
    title = self.get_title(xml)
    if title:
        record_add_field(rec, "245", subfields=[("a", title)])
    publication_date = self.get_publication_date(xml)
    if publication_date:
        record_add_field(rec, "260", subfields=[("c", publication_date)])
    journal, issn, volume, issue, first_page, pages, year, doi = \
        self.get_publication_information(xml)
    if doi:
        record_add_field(rec, "024", ind1="7",
                         subfields=[("a", doi), ("2", "DOI")])
    arxiv_id = self.get_arxiv_id(xml)
    if arxiv_id:
        record_add_field(rec, "037",
                         subfields=[("a", arxiv_id), ("9", "arXiv")])
    if logger:
        logger.info("Creating record: %s %s" % (f_path, doi))

    # First author goes to 100, all others to 700.
    authors = self.get_authors(xml)
    first_author = True
    for author in authors:
        subfields = [("a", "%s, %s" % (
            author["surname"],
            author.get("given_name") or author.get("initials")))]
        if "orcid" in author:
            subfields.append(("j", author["orcid"]))
        if "affiliation" in author:
            for aff in author["affiliation"]:
                subfields.append(("v", aff))
            if self.extract_nations:
                add_nations_field(subfields)
        if first_author:
            record_add_field(rec, "100", subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, "700", subfields=subfields)

    abstract = self.get_abstract(xml)
    if abstract:
        record_add_field(rec, "520", subfields=[("a", abstract)])
    record_add_field(
        rec, "540",
        subfields=[("a", "CC-BY-4.0"),
                   ("u", "http://creativecommons.org/licenses/by/4.0/")]
    )
    copyright = self.get_copyright(xml)
    if copyright:
        record_add_field(rec, "542", subfields=[("f", copyright)])
    keywords = self.get_keywords(xml)
    if keywords:
        for keyword in keywords:
            record_add_field(rec, "653", ind1="1",
                             subfields=[("a", keyword), ("9", "author")])
    record_add_field(rec, "300", subfields=[("a", pages)])

    # Publication note: drop empty values and the "-" placeholder. A real
    # list is built because filter() is lazy on Python 3.
    subfields = [
        (code, value)
        for code, value in [("p", journal), ("v", volume),
                            ("c", first_page), ("y", year)]
        if value and value != "-"
    ]
    record_add_field(rec, "773", subfields=subfields)

    references = self.get_references(xml)
    # Loop variables carry a ref_ prefix so that they do not overwrite the
    # article-level ``doi`` that is logged further down.
    for (label, ref_authors, ref_doi, ref_issue, ref_page,
         ref_title, ref_volume, ref_year) in references:
        subfields = []
        if ref_doi:
            subfields.append(("a", ref_doi))
        for ref_author in ref_authors:
            subfields.append(("h", ref_author))
        if ref_issue:
            subfields.append(("n", ref_issue))
        if label:
            subfields.append(("o", label))
        if ref_page:
            subfields.append(("p", ref_page))
            subfields.append(("s", "%s %s (%s) %s" % (
                ref_title, ref_volume, ref_year, ref_page)))
        if ref_title:
            subfields.append(("t", ref_title))
        if ref_volume:
            subfields.append(("v", ref_volume))
        if ref_year:
            subfields.append(("y", ref_year))
        if subfields:
            record_add_field(rec, "999", ind1="C", ind2="5",
                             subfields=subfields)

    folder_name = join("/", *(f_path.split("/")[0:-1]))
    # Remove the ".xml.scoap" suffix as a whole. str.rstrip() would treat it
    # as a set of characters and also eat trailing letters of the base name.
    xml_suffix = ".xml.scoap"
    pdf_name = f_path.split("/")[-1]
    if pdf_name.endswith(xml_suffix):
        pdf_name = pdf_name[:-len(xml_suffix)]
    pdf_name += ".pdf"
    pdf_path = join(folder_name, "BodyRef/PDF", pdf_name)
    if exists(pdf_path):
        record_add_field(rec, "FFT",
                         subfields=[("a", pdf_path),
                                    ("n", "main"),
                                    ("f", ".pdf;pdfa")])
    else:
        if logger:
            logger.error("Record %s doesn't contain PDF file." % (doi,))
    record_add_field(rec, "FFT",
                     subfields=[("a", self.get_body_ref(xml)),
                                ("n", "main")])
    record_add_field(rec, "980",
                     subfields=[("a", collection), ("b", publisher)])
    return record_xml_output(rec)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """
    Builds the MARC xml record of the article stored in ``f_path``.

    :param f_path: path of the article XML file. The PDF is looked up
                   next to it, inside the ``BodyRef/PDF`` folder.
    :type f_path: string
    :param publisher: value stored in the 980__b subfield.
    :param collection: value stored in the 980__a subfield.
    :param logger: optional logger used for progress and error messages.

    :returns: a string with the marc xml version of the file.
    """
    xml = self.get_article(f_path)
    rec = create_record()

    title = self.get_title(xml)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])

    publication_date = self.get_publication_date(xml)
    if publication_date:
        record_add_field(rec, '260', subfields=[('c', publication_date)])

    (journal, issn, volume, issue,
     first_page, pages, year, doi) = self.get_publication_information(xml)
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    arxiv_id = self.get_arxiv_id(xml)
    if arxiv_id:
        record_add_field(rec, '037',
                         subfields=[('a', arxiv_id), ('9', 'arXiv')])

    if logger:
        logger.info("Creating record: %s %s" % (f_path, doi))

    first_author = True
    for author in self.get_authors(xml):
        full_name = '%s, %s' % (author['surname'],
                                author.get('given_name') or
                                author.get('initials'))
        subfields = [('a', full_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            if self.extract_nations:
                add_nations_field(subfields)
        # Only the first author is the main entry (100); others go to 700.
        record_add_field(rec, '100' if first_author else '700',
                         subfields=subfields)
        first_author = False

    abstract = self.get_abstract(xml)
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract)])

    record_add_field(rec, '540',
                     subfields=[('a', 'CC-BY-4.0'),
                                ('u', 'http://creativecommons.org/licenses/by/4.0/')])

    copyright = self.get_copyright(xml)
    if copyright:
        record_add_field(rec, '542', subfields=[('f', copyright)])

    keywords = self.get_keywords(xml)
    if keywords:
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])

    record_add_field(rec, "300", subfields=[('a', pages)])

    # Do not write empty values or the "-" placeholder into the journal
    # field: they would end up as empty subfields in the MARC output.
    journal_info = [('p', journal), ('v', volume),
                    ('c', first_page), ('y', year)]
    record_add_field(rec, '773',
                     subfields=[pair for pair in journal_info
                                if pair[1] and pair[1] != '-'])

    for (label, authors, doi, issue, page,
         title, volume, year) in self.get_references(xml):
        subfields = []
        if doi:
            subfields.append(('a', doi))
        for author in authors:
            subfields.append(('h', author))
        if issue:
            subfields.append(('n', issue))
        if label:
            subfields.append(('o', label))
        if page:
            subfields.append(('p', page))
        # NOTE(review): the 's' subfield is written for every reference,
        # so the emptiness check below never fails - confirm intended.
        subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
        if title:
            subfields.append(('t', title))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    path_parts = f_path.split('/')
    folder_name = join('/', *(path_parts[0:-1]))
    base_name = path_parts[-1]
    # str.rstrip('.xml.scoap') strips any of those *characters* from the
    # end, which also removes trailing letters of the actual file name.
    # Cut the known extensions off as suffixes instead.
    if base_name.endswith('.xml.scoap'):
        base_name = base_name[:-len('.xml.scoap')]
    elif base_name.endswith('.xml'):
        base_name = base_name[:-len('.xml')]
    pdf_path = join(folder_name, 'BodyRef/PDF', base_name + '.pdf')
    print(pdf_path)

    if exists(pdf_path):
        record_add_field(rec, 'FFT',
                         subfields=[('a', pdf_path), ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    elif logger:
        logger.error("Record %s doesn't contain PDF file." % (doi,))

    # NOTE(review): attached whether or not the PDF was found - confirm
    # that this is not meant to be a fallback for the missing-PDF case.
    record_add_field(rec, 'FFT',
                     subfields=[('a', self.get_body_ref(xml)), ('n', 'main')])
    record_add_field(rec, '980',
                     subfields=[('a', collection), ('b', publisher)])
    return record_xml_output(rec)
def get_record(self, fileName, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_jp directory

    :param fileName: the name of the file to parse.
    :type fileName: string
    :param ref_extract_callback: callback to be used to extract
                                 unstructured references. It should
                                 return a marcxml formated string
                                 of the reference.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file. An empty
              string is returned for article types that are not
              handled and when the record cannot be serialised.
    """
    self.document = parse(fileName)
    article_type = self._get_article_type()
    if article_type not in ['research-article', 'introduction', 'letter']:
        return ''
    rec = create_record()

    title, subtitle, notes = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        # A 245 field is only written when there is a main title.
        subfields.append(('a', title))
        record_add_field(rec, '245', subfields=subfields)

    subjects = self.document.getElementsByTagName('kwd')
    subjects = map(xml_to_text, subjects)
    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            record_add_field(rec, '500', subfields=[('a', note)])
    for subject in subjects:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'), ('a', subject)])
    for keyword in self._get_keywords():
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', keyword), ('9', 'author')])

    (journal, volume, issue, year, date, doi,
     page, fpage, lpage) = self._get_publication_information()
    astronomy_journals = ['EAS Publ.Ser.', 'Astron.Astrophys.']
    if journal in astronomy_journals:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'INSPIRE'), ('a', 'Astrophysics')])
    if date:
        record_add_field(rec, '260',
                         subfields=[('c', date), ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    abstract = self._format_abstract(self._get_abstract())
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'EDPSciences')])

    # "license_text" avoids shadowing the "license" builtin.
    license_text, license_type, license_url = self._get_license()
    subfields = []
    if license_text:
        subfields.append(('a', license_text))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)
    if license_type == 'open-access':
        self._attach_fulltext(rec, doi)

    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, '300', subfields=[('a', number_of_pages)])

    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542',
                         subfields=[('d', c_holder), ('g', c_year),
                                    ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542',
                         subfields=[('f', c_statement), ('e', 'Article')])

    subfields = []
    if journal:
        subfields.append(('p', journal))
    if issue:
        subfields.append(('n', issue))
    if volume:
        subfields.append(('v', volume))
    if fpage and lpage:
        subfields.append(('c', '%s-%s' % (fpage, lpage)))
    elif page:
        subfields.append(('c', page))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])

    # When several <conference> tags exist the last one wins.
    conference = ''
    for tag in self.document.getElementsByTagName('conference'):
        conference = xml_to_text(tag)
    if conference:
        record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
        record_add_field(rec, '500', subfields=[('a', conference)])

    self._add_references(rec, ref_extract_callback)
    self._add_authors(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        # %-formatting instead of "+": a missing DOI must not turn the
        # error report itself into a TypeError.
        message = "Found a bad char in the file for the article %s" % (doi,)
        sys.stderr.write(message)
        return ""
def get_record(self, xml_file):
    """
    Reads a xml file in JATS format and returns a xml string in marc format

    :param xml_file: the name of the file to parse.
    :type xml_file: string

    :returns: a string with the marc xml version of the file, or an
              empty string when the record cannot be serialised.
    :raises ApsPackageXMLError: when the document has a value in a
                                ``meta`` tag.
    """
    self.document = parse(xml_file)
    if get_value_in_tag(self.document, "meta"):
        raise ApsPackageXMLError("The XML format of %s is not correct"
                                 % (xml_file,))
    page_count = self._get_page_count()
    rec = create_record()
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])

    for pacscode in self._get_pacscodes():
        record_add_field(rec, '084',
                         subfields=[('2', 'PACS'), ('a', pacscode)])

    subject = self._get_subject()
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'APS'), ('a', subject)])

    keywords = self._get_keywords()
    if keywords:
        # All keywords are stored together in a single 653 field.
        record_add_field(rec, '653', ind1='1',
                         subfields=[('a', ', '.join(keywords)),
                                    ('9', 'author')])

    title, subtitle, _ = self._get_title()
    subfields = []
    if subtitle:
        subfields.append(('b', subtitle))
    if title:
        # A 245 field is only written when there is a main title.
        subfields.append(('a', title))
        record_add_field(rec, '245', subfields=subfields)

    (journal, volume, issue, year, start_date, doi,
     article_id, _, _) = self._get_publication_information()
    if start_date:
        record_add_field(rec, '260',
                         subfields=[('c', start_date), ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])

    abstract = self._get_abstract()
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'APS')])

    # "license_text" avoids shadowing the "license" builtin.
    license_text, license_type, license_url = self._get_license()
    subfields = []
    if license_text:
        subfields.append(('a', license_text))
    if license_url:
        subfields.append(('u', license_url))
    if subfields:
        record_add_field(rec, '540', subfields=subfields)

    # The copyright information is fetched once (it used to be requested
    # twice in a row with identical arguments).
    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, '542',
                         subfields=[('d', c_holder), ('g', c_year),
                                    ('e', 'Article')])
    elif c_statement:
        record_add_field(rec, '542',
                         subfields=[('f', c_statement), ('e', 'Article')])

    record_add_field(rec, '773',
                     subfields=[('p', journal), ('v', volume),
                                ('n', issue), ('y', year),
                                ('c', article_id)])
    for collection in ('HEP', 'Citeable', 'Published'):
        record_add_field(rec, '980', subfields=[('a', collection)])

    self._add_authors(rec)
    self._add_references(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        # %-formatting instead of "+": a missing DOI must not turn the
        # error report itself into a TypeError.
        sys.stderr.write("Found a bad char in the file for the article %s"
                         % (doi,))
        return ""
def main(args):
    """
    Harvests the fulltext of the PoS records found in an OAI dump.

    For every ``record`` element of the input file the record is
    converted, looked up in the system, its PDF is searched on the PoS
    page and downloaded, and the record is uploaded to the FTP server.
    The records are then written to ``.insert.xml``, ``.append.xml`` and
    ``.errors.xml`` files and a summary is sent by email.

    :param args: command line arguments; exactly one is expected, the
                 name of the input file.
    :type args: list

    :raises Exception: when the number of arguments is not one.
    """
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        # The third ':'-separated part of the identifier is read as
        # "<conference>/<contribution>".
        location = pos.get_identifier().split(':')[2].split('/')
        conference = location[0]
        contribution = location[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query, ))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if not url.endswith('.pdf'):
                continue
            found = True
            if results:
                # The record already exists: only the fulltext fields
                # are appended to it.
                rec = create_record()
            filename = join(out_folder, identifier + ".pdf")
            record_add_field(rec, '856', ind1='4',
                             subfields=[('u', url), ('y', 'PoS server')])
            record_add_field(rec, 'FFT',
                             subfields=[('a', filename), ('t', 'PoS'),
                                        ('d', 'Fulltext')])
            try:
                print('Downloading ' + url)
                download_url(url, "pdf", filename, 5, 60.0)
                if results:
                    recid = results[0]
                    record_add_field(rec, '001', controlfield_value=recid)
                    append_records.append(rec)
                else:
                    insert_records.append(rec)
            except InvenioFileDownloadError:
                print("Download of %s failed" % (url, ))
            # Only the first PDF link of the page is used.
            break
        if not found:
            error_records.append(rec)

        # upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution, )
        with open(tempfile_path, 'w') as xml_file:
            xml_file.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server"
                          % tempfile_path)
        except Exception:
            # Best effort: a failed upload must not stop the harvest.
            write_message("Failed to upload %s to FTP server"
                          % tempfile_path)
        finally:
            remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename, )
    append_filename = "%s.append.xml" % (input_filename, )
    errors_filename = "%s.errors.xml" % (input_filename, )

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        # The file used to be copied onto itself, which never put it in
        # out_folder although it is reported as created there.
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(
        error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
        (total_records,
         len(insert_records),
         len(append_records),
         len(error_records),
         "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl, )
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL, ))
def main(args):
    """
    Harvests the fulltext of the PoS records found in an OAI dump.

    Each ``record`` element of the input file is converted, searched in
    the system, completed with the PDF found on its PoS page and
    uploaded to the FTP server. The outcome is written to
    ``.insert.xml``, ``.append.xml`` and ``.errors.xml`` files and
    summarised in an email.

    :param args: command line arguments; exactly one is expected, the
                 name of the input file.
    :type args: list

    :raises Exception: when the number of arguments is not one.
    """
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        # The third ':'-separated part of the identifier is read as
        # "<conference>/<contribution>".
        conf_and_contrib = pos.get_identifier().split(':')[2].split('/')
        conference = conf_and_contrib[0]
        contribution = conf_and_contrib[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    # Existing record: start from an empty one so that
                    # only the fulltext fields get appended.
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4', subfields=[
                    ('u', url),
                    ('y', 'PoS server')
                ])
                record_add_field(rec, 'FFT', subfields=[('a', filename),
                                                        ('t', 'PoS'),
                                                        ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001',
                                         controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                # Stop at the first PDF link of the page.
                break
        if not found:
            error_records.append(rec)

        # upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution,)
        with open(tempfile_path, 'w') as marcxml_file:
            marcxml_file.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server"
                          % tempfile_path)
        except Exception:
            # Best effort: report the failure and go on with the harvest.
            write_message("Failed to upload %s to FTP server"
                          % tempfile_path)
        finally:
            # The temporary file is removed whatever the upload did.
            remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    # Every non-empty result file is copied to the work folder. The
    # errors file used to be copied onto itself instead, so it never
    # reached out_folder although it was reported as created there.
    for result_filename, result_records in (
            (insert_filename, insert_records),
            (append_filename, append_records),
            (errors_filename, error_records)):
        if write_record_to_file(result_filename, result_records):
            copy(result_filename, out_folder)
            created_files.append(join(out_folder, basename(result_filename)))

    total_records = (len(append_records) + len(insert_records) +
                     len(error_records))
    subject = ("PoS Harvest results: " +
               datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def get_record_rich(self, filename, ref_extract_callback=None):
    """
    Gets the Marc xml of the files in xaml_rich directory

    :param filename: the name of the file to parse.
    :type filename: string
    :param ref_extract_callback: optional callback used to extract
                                 unstructured references. It is called
                                 with the reference text and must return
                                 a marcxml string; without it the text
                                 is stored as it is.
    :type ref_extract_callback: callable

    :returns: a string with the marc xml version of the file. An empty
              string is returned when an ``ArticleID`` is not of type
              ``Article`` and when the record cannot be serialised.
    """
    self.document = parse(filename)
    rec = create_record()
    articles = self.document.getElementsByTagName('ArticleID')
    for article in articles:
        article_type = article.getAttribute('Type')
        if not article_type == 'Article':
            return ''
    doi = get_value_in_tag(self.document, 'DOI')

    # Defined up front so that a document without date and issue tags
    # cannot leave the name unbound further down.
    year = ''
    date = ''
    for tag in self.document.getElementsByTagName('Accepted'):
        year = get_value_in_tag(tag, 'Year')
        month = get_value_in_tag(tag, 'Month').zfill(2)
        day = get_value_in_tag(tag, 'Day').zfill(2)
        date = "%s-%s-%s" % (year, month, day)
    if not date:
        # No acceptance date: fall back to the online date.
        for tag in self.document.getElementsByTagName('OnlineDate'):
            year = get_value_in_tag(tag, 'Year')
            month = get_value_in_tag(tag, 'Month').zfill(2)
            day = get_value_in_tag(tag, 'Day').zfill(2)
            date = "%s-%s-%s" % (year, month, day)

    # NOTE(review): "article" is the last ArticleID element of the loop
    # above; a document without any ArticleID fails here - confirm that
    # this cannot happen.
    first_page = get_value_in_tag(article, 'FirstPage')
    last_page = get_value_in_tag(article, 'LastPage')
    subjects = article.getElementsByTagName('Keyword')
    subject = ', '.join(map(xml_to_text, subjects))
    copyright_statement = get_value_in_tag(article, 'Copyright')

    journal = get_value_in_tag(self.document, 'JournalTitle')
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    for issue in self.document.getElementsByTagName('IssueID'):
        volume += get_value_in_tag(issue, 'Volume')
        year = get_value_in_tag(issue, 'Year')
    title = get_value_in_tag(self.document, 'Title')

    def affiliation_pair(a):
        return a.getAttribute('ID'), get_value_in_tag(
            a, 'UnstructuredAffiliation'
        )

    affiliations = dict(map(
        affiliation_pair,
        self.document.getElementsByTagName('Affiliation')
    ))

    def author_pair(a):
        surname = get_value_in_tag(a, 'LastName')
        first_name = get_value_in_tag(a, 'FirstName')
        middle_name = get_value_in_tag(a, 'MiddleName')
        if middle_name:
            name = '%s, %s %s' % (surname, first_name, middle_name)
        else:
            name = '%s, %s' % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                'AffiliationID'
            )[0].getAttribute('Label')
            affiliation = affiliations[affid]
        except (IndexError, KeyError):
            # No affiliation tag, or a label that is not declared.
            affiliation = ''
        return name, affiliation

    authors = map(author_pair, self.document.getElementsByTagName('Author'))
    abstract = get_value_in_tag(self.document, 'Abstract')

    for reference in self.document.getElementsByTagName('Bibliomixed'):
        subfields = []
        label = reference.getAttribute('N')
        if label:
            subfields.append(('o', label))
        bibliosets = reference.getElementsByTagName('Biblioset')
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, 'Date')
            ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings
            )
            ref_volume += get_value_in_tag(tag, 'Volume')
            ref_page = get_value_in_tag(tag, 'ArtPageNums')
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(('s', '%s,%s,%s' % (ref_journal,
                                                     ref_volume,
                                                     ref_page)))
            # The structured part is removed so that only the free text
            # is left for the extraction below.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        if ref_extract_callback:
            ref_xml = ref_extract_callback(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(('9', 'refextract'))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                # The extracted misc text is dropped when the reference
                # had structured data.
                if code == 'm' and bibliosets:
                    continue
                subfields.append((code, data))
        else:
            subfields.append(('m', text_ref))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if date:
        record_add_field(rec, '260',
                         subfields=[('c', date), ('t', 'published')])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'EDPSciences')])

    # The first author goes to 100, every following one to 700.
    for position, (name, affiliation) in enumerate(authors):
        subfields = [('a', name)]
        if affiliation:
            subfields.append(('v', affiliation))
        record_add_field(rec, '100' if position == 0 else '700',
                         subfields=subfields)

    subfields = []
    if journal and volume and first_page:
        subfields.append(('s', "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # Both ends are included: pages 5-5 are one page, not zero.
            number_of_pages = int(last_page) - int(first_page) + 1
            record_add_field(rec, '300',
                             subfields=[('a', str(number_of_pages))])
        except ValueError:
            # Non numeric page labels: no page count can be given.
            pass
        subfields.append(('c', '%s-%s' % (first_page, last_page)))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    if copyright_statement:
        record_add_field(rec, '542', subfields=[('f', copyright_statement)])
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('2', 'EDPSciences'), ('a', subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        # %-formatting instead of "+": a missing DOI must not turn the
        # error report itself into a TypeError.
        message = "Found a bad char in the file for the article %s" % (doi,)
        sys.stderr.write(message)
        return ""
def get_record(self, fileName):
    """
    Gets the Marc xml of the files in xaml_jp directory

    :param fileName: the name of the file to parse.
    :type fileName: string

    :returns: a string with the marc xml version of the file. An empty
              string is returned for article types that are not
              handled and when the record cannot be serialised.
    """
    self.document = parse(fileName)
    article_type = self._get_article_type()
    if article_type not in ["research-article", "introduction", "letter"]:
        return ""
    rec = create_record()

    title, subtitle, notes = self._get_title()
    title_subfields = []
    if subtitle:
        title_subfields.append(("b", subtitle))
    if title:
        # A 245 field is only written when there is a main title.
        title_subfields.append(("a", title))
        record_add_field(rec, "245", subfields=title_subfields)

    subjects = map(xml_to_text, self.document.getElementsByTagName("kwd"))
    for note_id in notes:
        note = self._get_note(note_id)
        if note:
            record_add_field(rec, "500", subfields=[("a", note)])
    for subject in subjects:
        record_add_field(rec, "650", ind1="1", ind2="7",
                         subfields=[("2", "EDPSciences"), ("a", subject)])
    for keyword in self._get_keywords():
        record_add_field(rec, "653", ind1="1",
                         subfields=[("a", keyword), ("9", "author")])

    (journal, volume, issue, year, date, doi,
     page, fpage, lpage) = self._get_publication_information()
    if journal in ["EAS Publ.Ser.", "Astron.Astrophys."]:
        # Astronomy journals get the INSPIRE subject category as well.
        record_add_field(rec, "650", ind1="1", ind2="7",
                         subfields=[("2", "INSPIRE"), ("a", "Astrophysics")])
    if date:
        record_add_field(rec, "260",
                         subfields=[("c", date), ("t", "published")])
    if doi:
        record_add_field(rec, "024", ind1="7",
                         subfields=[("a", doi), ("2", "DOI")])

    abstract = self._format_abstract(self._get_abstract())
    if abstract:
        record_add_field(rec, "520",
                         subfields=[("a", abstract), ("9", "EDPSciences")])

    # "license_text" avoids shadowing the "license" builtin.
    license_text, license_type, license_url = self._get_license()
    license_subfields = []
    if license_text:
        license_subfields.append(("a", license_text))
    if license_url:
        license_subfields.append(("u", license_url))
    if license_subfields:
        record_add_field(rec, "540", subfields=license_subfields)
    if license_type == "open-access":
        self._attach_fulltext(rec, doi)

    number_of_pages = self._get_page_count()
    if number_of_pages:
        record_add_field(rec, "300", subfields=[("a", number_of_pages)])

    c_holder, c_year, c_statement = self._get_copyright()
    if c_holder and c_year:
        record_add_field(rec, "542",
                         subfields=[("d", c_holder), ("g", c_year),
                                    ("e", "Article")])
    elif c_statement:
        record_add_field(rec, "542",
                         subfields=[("f", c_statement), ("e", "Article")])

    # A page range is preferred over a single page.
    if fpage and lpage:
        page_info = "%s-%s" % (fpage, lpage)
    else:
        page_info = page
    journal_subfields = [
        (code, value)
        for code, value in [("p", journal), ("n", issue), ("v", volume),
                            ("c", page_info), ("y", year)]
        if value
    ]
    record_add_field(rec, "773", subfields=journal_subfields)
    record_add_field(rec, "980", subfields=[("a", "HEP")])

    # When several <conference> tags exist the last one wins.
    conference = ""
    for tag in self.document.getElementsByTagName("conference"):
        conference = xml_to_text(tag)
    if conference:
        record_add_field(rec, "980", subfields=[("a", "ConferencePaper")])
        record_add_field(rec, "500", subfields=[("a", conference)])

    self._add_references(rec)
    self._add_authors(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        # %-formatting instead of "+": a missing DOI must not turn the
        # error report itself into a TypeError.
        message = "Found a bad char in the file for the article %s" % (doi,)
        sys.stderr.write(message)
        return ""
def get_record(self, record):
    """
    Reads a dom xml element in oaidc format and returns the bibrecord object

    :param record: the dom element to convert; it is kept in
                   ``self.document`` for the ``_get_*`` helpers.

    :returns: the bibrecord structure (not a marc xml string).
    """
    self.document = record
    rec = create_record()

    language = self._get_language()
    if language and language != 'en':
        record_add_field(rec, '041', subfields=[('a', language)])

    # 260 holds whichever of publisher and date is known.
    publisher = self._get_publisher()
    date = self._get_date()
    imprint = []
    if publisher:
        imprint.append(('b', publisher))
    if date:
        imprint.append(('c', date))
    if imprint:
        record_add_field(rec, '260', subfields=imprint)

    title = self._get_title()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])

    record_copyright = self._get_copyright()
    if record_copyright:
        record_add_field(rec, '540', subfields=[('a', record_copyright)])

    subject = self._get_subject()
    if subject:
        record_add_field(rec, '650', ind1='1', ind2='7',
                         subfields=[('a', subject), ('2', 'PoS')])

    # The first author goes to 100, every following one to 700.
    author_tag = '100'
    for author in self._get_authors():
        record_add_field(rec, author_tag, subfields=[('a', author)])
        author_tag = '700'

    # The third ':'-separated part of the identifier is read as
    # "<conference>/<contribution>".
    location = self.get_identifier().split(':')[2].split('/')
    conference = location[0]
    contribution = location[1]
    journal_subfields = [('p', 'PoS'),
                         ('v', conference.replace(' ', '')),
                         ('c', contribution)]
    if date:
        # Without a date there is no year: slicing a missing value would
        # fail or leave an empty subfield behind.
        journal_subfields.append(('y', date[:4]))
    record_add_field(rec, '773', subfields=journal_subfields)

    record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    return rec
def _add_references(self, rec):
    """
    Adds the reference to the record

    One 999C5 field is added for every reference returned by
    ``self._get_reference`` for each ``ref`` tag of the document. When
    no structured information is available the text of the reference
    is stored instead.

    :param rec: the record the fields are added to.
    """
    for ref in self.document.getElementsByTagName('ref'):
        for ref_type, doi, authors, collaboration, journal, volume, page, \
                year, label, arxiv, publisher, institution, \
                unstructured_text, external_link, report_no, editors \
                in self._get_reference(ref):
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            for editor in editors:
                subfields.append(('e', editor))
            if year:
                subfields.append(('y', year))
            if unstructured_text:
                if page:
                    subfields.append(('m', unstructured_text + ', ' + page))
                else:
                    subfields.append(('m', unstructured_text))
            if collaboration:
                subfields.append(('c', collaboration))
            if institution:
                subfields.append(('m', institution))
            if publisher:
                subfields.append(('p', publisher))
            if arxiv:
                subfields.append(('r', arxiv))
            if report_no:
                subfields.append(('r', report_no))
            if external_link:
                subfields.append(('u', external_link))
            if label:
                subfields.append(('o', label))
            if ref_type == 'book':
                if journal:
                    subfields.append(('t', journal))
                if volume:
                    subfields.append(('m', volume))
                elif page and not unstructured_text:
                    subfields.append(('m', page))
            elif journal and volume and page:
                # The journal is required too: without it the value
                # would be a malformed ",volume,page".
                subfields.append(('s', journal + "," + volume + "," + page))
            elif journal:
                subfields.append(('t', journal))
            if ref_type:
                subfields.append(('d', ref_type))

            if not subfields:
                # misc-type references: store the text of the reference,
                # taken from the first source that provides one.
                citations = ref.getElementsByTagName('mixed-citation')
                words = xml_to_text(citations[0]).split() if citations else []
                if words:
                    # The leading token is the label of the reference
                    # and is left out of the text.
                    text = " ".join(words[1:])
                else:
                    # references without (or with an empty)
                    # 'mixed-citation' tag: use the note, or else the
                    # whole reference.
                    notes = ref.getElementsByTagName('note')
                    text = xml_to_text(notes[0] if notes else ref)
                subfields.append(('s', text))
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)
def _add_references(self, xml_doc, rec):
    """
    Adds the references of ``xml_doc`` to the record as 999C5 fields.

    With ``self.CONSYN`` set, references that have a text but no
    authors are parsed with ``extract_references_from_string_xml``; the
    other ones are built from their structured values. Without it the
    shorter tuples returned by ``self.get_references`` are used.

    :param xml_doc: the parsed article the references are read from.
    :param rec: the record the fields are added to.
    """
    def normalize_dashes(value):
        # Unusual dash characters are turned into a plain hyphen before
        # the value is transliterated to ASCII.
        for dash in (u'\u05BE', u'\u1806', u'\u2E3A', u'\u2E3B'):
            value = value.replace(dash, '-')
        return unidecode(value).replace('--', '-')

    def clean_label(value):
        # Removes brackets, dots and closing parentheses: "[12]." -> "12"
        return re.sub(r"[\[\].)]", "", value)

    if self.CONSYN:
        for label, authors, doi, issue, page, title, volume, year, \
                textref, ext_link, isjournal, comment, journal, publisher, \
                editors, book_title in self.get_references(xml_doc):
            subfields = []
            if textref and not authors:
                # Unstructured reference: let refextract parse the text.
                textref = textref.replace('"', "'")
                ref_xml = extract_references_from_string_xml(textref)
                dom = xml.dom.minidom.parseString(ref_xml)
                fields = dom.getElementsByTagName("datafield")[0]
                fields = fields.getElementsByTagName("subfield")
                if fields:
                    subfields.append(('9', 'refextract'))
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 's':
                        # "journal,volume[,page]": the journal name is
                        # normalised, anything else is kept untouched.
                        parts = data.split(',')
                        if len(parts) < 2:
                            subfields.append(('s', data))
                        else:
                            name, vol = fix_journal_name(
                                parts[0], self.journal_mappings)
                            pubnote = name + "," + vol + parts[1]
                            if len(parts) > 2:
                                pubnote += "," + parts[2]
                            subfields.append(('s', pubnote))
                    elif code == 'r':
                        subfields.append(('r', normalize_dashes(data)))
                    else:
                        subfields.append((code, data))
                if label:
                    subfields.append(('o', clean_label(label)))
            else:
                if doi:
                    subfields.append(('a', doi))
                for author in authors:
                    subfields.append(('h', author))
                if ext_link:
                    subfields.append(('r', normalize_dashes(ext_link)))
                if title:
                    subfields.append(('t', title))
                elif textref:
                    subfields.append(('m', textref))
                if publisher:
                    subfields.append(('p', publisher))
                if volume:
                    subfields.append(('v', volume))
                if year:
                    subfields.append(('y', year))
                if comment:
                    subfields.append(('m', comment))
                for editor in editors:
                    subfields.append(('e', editor))
                if book_title:
                    subfields.append(('q', book_title))
                if label:
                    subfields.append(('o', clean_label(label)))
                if journal:
                    journal, vol = fix_journal_name(journal,
                                                    self.journal_mappings)
                    # A missing volume must not break the concatenation.
                    volume = vol + (volume or '')
                    if volume and page:
                        journal = journal + "," + volume + "," + page
                    elif volume:
                        journal = journal + "," + volume
                    subfields.append(('s', journal))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)
    else:
        for label, authors, doi, issue, page, title, volume, year, \
                textref, ext_link in self.get_references(xml_doc):
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if page:
                subfields.append(('p', page))
            if ext_link:
                subfields.append(('r', ext_link))
            if title and volume and year and page:
                subfields.append(
                    ('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif textref:
                subfields.append(('m', textref))
            if title:
                subfields.append(('t', title))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)
def get_record(self, f_path, publisher=None, collection=None, logger=None):
    """Build the MARCXML record of the NLM article stored at ``f_path``.

    Metadata is read through the ``NLMParser`` base-class getters; the
    journal name is always overridden with "PTEP".  The PDF and the
    high-resolution PDF/A expected next to the XML file are attached when
    they exist; otherwise an exception is registered for the admins.

    :param f_path: path of the article XML file (expected to end in
        ``.xml``; the PDF paths are derived from it).
    :param publisher: optional publisher name, stored with the abstract
        and in the 980 field.
    :param collection: optional collection name stored in the 980 field.
    :param logger: optional logger; nothing is logged when it is None.
    :return: whatever ``record_xml_output`` returns for the built record.
    """
    base = super(NLMParser, self)

    def report_missing(what):
        # ``register_exception`` is called from inside an ``except`` block
        # so that it has a current exception to report.
        try:
            raise MissingFFTError
        except MissingFFTError:
            register_exception(
                alert_admin=True,
                prefix="Oxford paper: %s is missing %s." % (doi, what))
        if logger:
            logger.warning("Record %s doesn't contain %s file."
                           % (doi, what))

    xml_doc = base.get_article(f_path)
    rec = create_record()
    title = base.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    record_add_field(
        rec, '260',
        subfields=[('c', base.get_publication_date(xml_doc, logger))])
    (journal, issn, volume, issue, first_page, last_page, year,
     doi) = base.get_publication_information(xml_doc)
    journal = "PTEP"  # Let's override the journal information
    if logger:
        logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    page_count = base.get_page_count(xml_doc)
    if page_count:
        record_add_field(rec, '300', subfields=[('a', page_count)])
    arxiv = self.get_arxiv_id(xml_doc)
    if arxiv:
        record_add_field(rec, '037',
                         subfields=[('9', 'arXiv'),
                                    ('a', format_arxiv_id(arxiv))])

    # The first author goes to 100, every following one to 700.
    first_author = True
    for author in base.get_authors(xml_doc):
        if author.get('surname'):
            given = author.get('given_name') or author.get('initials', '')
            subfields = [('a', '%s, %s' % (author.get('surname'), given))]
        else:
            subfields = [('a', '%s' % (author.get('name', '')))]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            # NOTE(review): nesting rebuilt from collapsed source; this
            # check is assumed to sit inside the affiliation branch.
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)

    abstract = base.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', publisher)])
    record_add_field(
        rec, '540',
        subfields=[('a', 'CC-BY-3.0'),
                   ('u', 'http://creativecommons.org/licenses/by/3.0/')])
    copyright_statement = base.get_copyright(xml_doc, logger)
    if copyright_statement:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_statement)])
    keywords = base.get_keywords(xml_doc)
    if keywords['pacs']:
        for keyword in keywords['pacs']:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
    # Oxford is giving us bad keywords. Better ignore them: the 'other'
    # keywords are deliberately not exported.

    if first_page or last_page:
        pages = '%s-%s' % (first_page, last_page)
    else:
        article_meta = xml_doc.getElementsByTagName('article-meta')[0]
        pages = get_value_in_tag(article_meta, "elocation-id")
    candidates = [('p', journal), ('v', volume), ('n', issue),
                  ('c', pages), ('y', year)]
    # Drop empty values and a lone '-' left by an empty page range.
    subfields = [pair for pair in candidates
                 if pair[1] and pair[1] != '-']
    record_add_field(rec, '773', subfields=subfields)

    self.get_references(xml_doc)
    # Reference values get their own names so that the article-level
    # ``doi`` used in the alerts below is not overwritten.
    for (label, ref_authors, ref_doi, ref_issue, page, page_last,
         ref_title, ref_volume, ref_year, ext_link,
         plain_text) in self.references:
        subfields = []
        if ref_doi:
            subfields.append(('a', ref_doi))
        for author in ref_authors:
            subfields.append(('h', author))
        if ref_issue:
            subfields.append(('n', ref_issue))
        if label:
            subfields.append(('o', label))
        if ref_year:
            subfields.append(('y', ref_year))
        if ext_link:
            subfields.append(('r', ext_link))
        # should we be strict about it?
        if ref_title and ref_volume and ref_year and page:
            subfields.append(
                ('s', '%s %s (%s) %s'
                 % (ref_title, ref_volume, ref_year, page)))
        elif not plain_text:
            subfields.append(
                ('m', '%s %s %s %s'
                 % (ref_title, ref_volume, ref_year, page)))
        if plain_text:
            subfields.append(('m', plain_text))
        if subfields:
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=subfields)

    f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
    f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                       basename(f_path)[:-len('.xml')] + '-hires.pdf')
    if exists(f_path_pdf):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdf), ('n', 'main')])
    else:
        report_missing('PDF')
    if exists(f_path_pdfa):
        record_add_field(rec, 'FFT',
                         subfields=[('a', f_path_pdfa), ('n', 'main'),
                                    ('f', '.pdf;pdfa')])
    else:
        report_missing('PDF/A')
    # The XML file itself is always attached.
    record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])

    extra_subfields = []
    if collection:
        extra_subfields.append(('a', collection))
    if publisher:
        extra_subfields.append(('b', publisher))
    record_add_field(rec, '980', subfields=extra_subfields)
    return record_xml_output(rec)
def get_record(self, path=None, no_pdf=False, test=False):
    """Build the MARCXML record of one Elsevier article.

    The common metadata (title, dates, DOI, license, authors, abstract,
    copyright) is always exported.  The remaining fields depend on
    ``self.CONSYN``: in CONSYN mode collaborations, subjects, keywords,
    publication info and the full text are added; otherwise a CC-BY-3.0
    license, keywords, publication info and (unless ``no_pdf``) the
    delivered files are added.  References are appended at the end
    through ``self._add_references``.

    :param path: location of the article, passed to ``get_article`` and
        used to build the attached file paths.
    :param no_pdf: when true, the non-CONSYN branch attaches no files.
    :param test: when true, the CONSYN branch attaches no full text.
    :return: the output of ``record_xml_output``, or an empty string when
        it raises ``UnicodeDecodeError``.
    """
    xml_doc = self.get_article(path)
    rec = create_record()
    title = self.get_title(xml_doc)
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    (journal, dummy, volume, issue, first_page, last_page, year,
     start_date, doi) = self.get_publication_information(xml_doc, path)
    if not journal:
        journal = self.get_article_journal(xml_doc)
    if start_date:
        record_add_field(rec, '260',
                         subfields=[('c', start_date), ('t', 'published')])
    else:
        # No publication date available: fall back to today's date.
        record_add_field(
            rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    license_name, license_url = self.get_license(xml_doc)
    if license_name and license_url:
        record_add_field(rec, '540',
                         subfields=[('a', license_name),
                                    ('u', license_url)])
    elif license_url:
        record_add_field(rec, '540', subfields=[('u', license_url)])
    self.logger.info("Creating record: %s %s" % (path, doi))

    # The first author goes to 100, every following one to 700.
    first_author = True
    for author in self.get_authors(xml_doc):
        author_name = (author['surname'],
                       author.get('given_name') or author.get('initials'))
        subfields = [('a', '%s, %s' % author_name)]
        if 'orcid' in author:
            subfields.append(('j', author['orcid']))
        if 'affiliation' in author:
            for aff in author["affiliation"]:
                subfields.append(('v', aff))
            # NOTE(review): nesting rebuilt from collapsed source; this
            # check is assumed to sit inside the affiliation branch.
            if self.extract_nations:
                add_nations_field(subfields)
        if author.get('email'):
            subfields.append(('m', author['email']))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)

    abstract = self.get_abstract(xml_doc)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'Elsevier')])
    record_copyright = self.get_copyright(xml_doc)
    if record_copyright:
        record_add_field(rec, '542', subfields=[('f', record_copyright)])
    keywords = self.get_keywords(xml_doc)

    if self.CONSYN:
        for tag in xml_doc.getElementsByTagName('ce:collaboration'):
            collaboration = get_value_in_tag(tag, 'ce:text')
            if collaboration:
                record_add_field(rec, '710',
                                 subfields=[('g', collaboration)])
        topics = []
        for subject in xml_doc.getElementsByTagName('dct:subject'):
            for listitem in subject.getElementsByTagName('rdf:li'):
                topics.append(xml_to_text(listitem))
        if topics:
            record_add_field(rec, '650', ind1='1', ind2='7',
                             subfields=[('2', 'Elsevier'),
                                        ('a', ', '.join(topics))])
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', keyword),
                                            ('9', 'author')])
        journal, dummy = fix_journal_name(journal.strip(),
                                          self.journal_mappings)
        subfields = []
        doctype = self.get_doctype(xml_doc)
        try:
            page_count = int(last_page) - int(first_page)
            record_add_field(rec, '300',
                             subfields=[('a', str(page_count))])
        except (ValueError, TypeError):
            # Missing or non-numeric page numbers: no page count.
            pass
        if doctype == 'err':
            subfields.append(('m', 'Erratum'))
        elif doctype == 'add':
            subfields.append(('m', 'Addendum'))
        elif doctype == 'pub':
            subfields.append(('m', 'Publisher Note'))
        elif doctype == 'rev':
            record_add_field(rec, '980', subfields=[('a', 'Review')])
        if journal:
            subfields.append(('p', journal))
        if first_page and last_page:
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        elif first_page:
            subfields.append(('c', first_page))
        if volume:
            subfields.append(('v', volume))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        if not test:
            if license_name:
                # Licensed article: link to the publisher page and
                # attach the full text publicly.
                url = 'http://www.sciencedirect.com/science/article/pii/'\
                      + path.split('/')[-1][:-4]
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url),
                                            ('y', 'Elsevier server')])
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
            else:
                record_add_field(rec, 'FFT',
                                 subfields=[('a', path),
                                            ('t', 'Elsevier'),
                                            ('o', 'HIDDEN')])
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'Citeable')])
        record_add_field(rec, '980', subfields=[('a', 'Published')])
    else:
        licence = 'http://creativecommons.org/licenses/by/3.0/'
        record_add_field(rec, '540',
                         subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
        if keywords:
            for keyword in keywords:
                record_add_field(
                    rec, '653', ind1='1',
                    subfields=[('a', keyword), ('9', 'author')])
        pages = ''
        if first_page and last_page:
            pages = '{0}-{1}'.format(first_page, last_page)
        elif first_page:
            pages = first_page
        candidates = [('p', journal), ('v', volume), ('n', issue),
                      ('c', pages), ('y', year)]
        # Drop empty values and a lone '-'.
        subfields = [pair for pair in candidates
                     if pair[1] and pair[1] != '-']
        record_add_field(rec, '773', subfields=subfields)
        if not no_pdf:
            # Look for an earlier, non-deleted record with the same DOI.
            query = '0247_a:"%s" AND NOT 980:DELETED' % (doi,)
            prev_version = perform_request_search(p=query)
            old_pdf = False
            if prev_version:
                prev_rec = BibRecDocs(prev_version[0])
                try:
                    bibdoc = prev_rec.get_bibdoc('main')
                    docfile = bibdoc.get_file(".pdf;pdfa",
                                              exact_docformat=True)
                    pdf_path = docfile.fullpath
                    old_pdf = True
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.info(
                        'Leaving previously delivered PDF/A for: %s'
                        % (doi,))
                except Exception:
                    # Best effort: the earlier record may have no
                    # reusable PDF/A; carry on with the delivered files.
                    self.logger.debug(
                        'No previously delivered PDF/A for: %s' % (doi,))
            try:
                if exists(join(path, 'main_a-2b.pdf')):
                    pdf_path = join(path, 'main_a-2b.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path),
                                                ('n', 'main'),
                                                ('f', '.pdf;pdfa')])
                    self.logger.debug('Adding PDF/A to record: %s'
                                      % (doi,))
                elif exists(join(path, 'main.pdf')):
                    pdf_path = join(path, 'main.pdf')
                    record_add_field(rec, 'FFT',
                                     subfields=[('a', pdf_path)])
                elif not old_pdf:
                    message = ("Record %s doesn't contain PDF file."
                               % (doi,))
                    self.logger.warning(message)
                    raise MissingFFTError(message)
            except MissingFFTError:
                message = "Elsevier paper: %s is missing PDF." % (doi,)
                register_exception(alert_admin=True, prefix=message)
            # NOTE(review): nesting rebuilt from collapsed source; the
            # four statements below are assumed to belong to the
            # ``if not no_pdf`` branch - confirm against the original.
            version = self.get_elsevier_version(find_package_name(path))
            record_add_field(rec, '583', subfields=[('l', version)])
            xml_path = join(path, 'main.xml')
            record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
            record_add_field(rec, '980',
                             subfields=[('a', 'SCOAP3'),
                                        ('b', 'Elsevier')])
    self._add_references(xml_doc, rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = ("Found a bad char in the file for the article %s"
                   % (doi,))
        sys.stderr.write(message)
        return ""
def _get_record(self, link):
    """Scrape one publication page and return it as a MARCXML DOM node.

    The page behind the first anchor of ``link`` is downloaded, its
    ``content`` block is stored in ``self.content`` and the metadata found
    there (title, DOI, authors, abstract, copyright, publication info,
    full-text link) is turned into a record.  Each listed reference is run
    through ``extract_references_from_string_xml`` and appended to the
    record.

    :param link: parsed HTML element containing an ``<a href=...>`` that
        points to the publication page.
    :return: the first child of the parsed record document, with the
        reference fields appended.
    """
    href = link.find('a')['href']
    url = urlparse.urljoin(self.base_url, href)
    response = urllib2.urlopen(url)
    try:
        page = BeautifulSoup(response)
    finally:
        # Release the HTTP connection as soon as the page is parsed.
        response.close()
    self.content = page.body.find('div', attrs={'id': 'content'})

    publication_title = self.content.find('div',
                                          {'id': 'publication-title'})
    if publication_title:
        publication_title = publication_title.find('a').text
    else:
        publication_title = ''
    series_title = self._find('a', {'id': 'series-title'})
    if series_title == 'NATO Science Series':
        series_title = 'NATO Sci.Ser.'
    title = self._find('h1', {'id': 'title'})
    volume = self._find('span', {'id': 'book-volume'})
    if volume:
        volume = re.sub(r'\D', '', volume)
    else:
        volume = self._find('span', {'id': 'volume-range'})
        volume = re.sub(r'\D', '', volume)
    issue = self._find('a', {'id': 'issue-range'})
    if issue:
        # Keep the second whitespace-separated token of the issue text.
        issue = issue.split()[1]
    year = self._find('span', {'id': 'copyright-year'})
    year = re.sub(r'\D', '', year)
    if not year:
        # Fall back to the first four digits of the cover date.
        year = self._find('dd', {'id': 'abstract-about-cover-date'})
        year = re.sub(r'\D', '', year)[:4]
    abstract = self._find('div', {'class': 'abstract-content formatted'})
    page_range = self._find('span', {'id': 'page-range'})
    if page_range:
        page_range = page_range.replace('pp', '').strip()
    copyright_holder = self._find(
        'dd', {'id': 'abstract-about-book-copyright-holder'})
    doi = self._find('dd', {'class': 'doi'})

    authors = []
    authors_affiliations = []
    summary = self.content.find('div', attrs={'class': 'summary'})
    for author in summary.findAll('li', attrs={'itemprop': 'author'}):
        # "Given Names Surname" -> "Surname, Given Names"
        name_parts = author.find('a').text.split()
        author_name = " ".join([name_parts[-1] + ","] + name_parts[:-1])
        authors.append(collapse_initials(author_name))
        try:
            authors_affiliations.append(author.find('sup')['title'])
        except (KeyError, TypeError):
            # No <sup> element, or no title attribute on it.
            authors_affiliations.append('')
    try:
        attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
        fulltext = self.content.find('a', attrs=attrs)
        fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
    except TypeError:
        # No download link on the page.
        fulltext = ''

    # create Marc record
    rec = create_record()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    # The first author goes to 100, every following one to 700.
    first_author = True
    for author_name, affiliation in zip(authors, authors_affiliations):
        subfields = [('a', '%s' % (author_name,))]
        if affiliation:
            subfields.append(('v', affiliation))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'Springer')])
    if copyright_holder:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_holder), ('g', year)])
    if not series_title:
        series_title = publication_title
    subfields = []
    if series_title:
        subfields.append(('p', series_title))
    if volume:
        subfields.append(('v', volume))
    if issue:
        subfields.append(('n', issue))
    if page_range:
        subfields.append(('c', page_range))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'BookChapter')])
    if fulltext:
        record_add_field(rec, 'FFT',
                         subfields=[('a', fulltext), ('t', 'Springer'),
                                    ('d', 'Fulltext')])

    recordString = record_xml_output(rec)
    # removes whitespace except spaces
    recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
    # removes two or more consecutive spaces
    recordString = re.sub(r' {2,}', '', recordString)
    record = parseString(recordString)

    references = []
    ref_fields = []
    references_container = self.content.find(
        'div', attrs={'id': 'abstract-references'})
    if references_container:
        references = references_container.findAll('li')
    for reference in references:
        ref = xml_to_text(parseString(reference.decode()))
        # removes the space between hep-th/ and the identifier
        ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
        ref = extract_references_from_string_xml(ref)
        ref = parseString(ref)
        for field in ref.childNodes:
            for subfield in field.getElementsByTagName('subfield'):
                if subfield.getAttribute('code') == 'm':
                    # Clean the free-text part of the reference.
                    text = subfield.firstChild.data
                    text = re.sub(r'\[?arXiv:', '', text)
                    text = text.replace('CrossRef', '')
                    if text.startswith(': '):
                        text = text[2:]
                    if text:
                        subfield.firstChild.data = text
                    else:
                        # Nothing left: drop the now-empty subfield.
                        subfield.parentNode.removeChild(subfield)
            ref_fields.append(field.firstChild)
    for field in ref_fields:
        record.firstChild.appendChild(field)
    return record.firstChild
def _add_references(self, xml_doc, rec, refextract_callback=None):
    """Add one 999 C5 field to ``rec`` for every reference of the article.

    The references come from ``self.get_references(xml_doc)`` as 16-item
    tuples.  A reference that only has raw text (no authors) is parsed
    with ``refextract_callback`` when one is given; without a callback the
    raw text is stored as is.  All other references are mapped field by
    field.  A field is only added when at least one subfield was built.

    :param xml_doc: parsed article document handed to ``get_references``.
    :param rec: record structure that receives the new fields (modified
        in place through ``record_add_field``).
    :param refextract_callback: optional callable taking the raw reference
        text and returning MARCXML text; its first ``datafield`` is used.
    """
    def clean_label(text):
        # Strip brackets, dots and closing parentheses from the label.
        return re.sub(r"[\[\].)]", "", text)

    for (label, authors, doi, issue, page, title, volume, year,
         textref, ext_link, isjournal, comment, journal, publisher,
         editors, book_title) in self.get_references(xml_doc):
        subfields = []
        if textref and not authors:
            textref = textref.replace('\"', '\'')
            if refextract_callback:
                ref_xml = refextract_callback(textref)
                dom = xml.dom.minidom.parseString(ref_xml)
                datafields = dom.getElementsByTagName("datafield")
                if datafields:
                    fields = datafields[0].getElementsByTagName("subfield")
                else:
                    # The callback produced no datafield at all.
                    fields = []
                for field in fields:
                    if field.firstChild is None:
                        # Empty <subfield/>: there is no text to copy.
                        continue
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 'r':
                        data = fix_dashes(data)
                    subfields.append((code, data))
                if fields:
                    subfields.append(('9', 'refextract'))
            else:
                # No parser available: keep the raw reference text.
                subfields.append(('m', textref))
            if label:
                subfields.append(('o', clean_label(label)))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)
        else:
            # Structured reference: map each known piece to a subfield.
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if ext_link:
                subfields.append(('r', fix_dashes(ext_link)))
            if title:
                subfields.append(('t', title))
            elif textref:
                subfields.append(('m', textref))
            if publisher:
                subfields.append(('p', publisher))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            if comment:
                subfields.append(('m', comment))
            for editor in editors:
                subfields.append(('e', editor))
            if book_title:
                subfields.append(('q', book_title))
            if label:
                subfields.append(('o', clean_label(label)))
            if journal:
                journal, vol = fix_journal_name(journal,
                                                self.journal_mappings)
                # ``volume`` may be missing; treat it as empty text.
                volume = vol + (volume or '')
                if volume and page:
                    pubnote = journal + "," + volume + "," + page
                elif volume:
                    pubnote = journal + "," + volume
                else:
                    pubnote = journal
                subfields.append(('s', pubnote))
            # The raw text may already have been stored above in place of
            # a missing title; do not emit it a second time.
            if textref and ('m', textref) not in subfields:
                subfields.append(('m', textref))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)