def _add_references(self, rec):
    """Append the parsed references to *rec* as MARC ``999C5`` fields.

    For each reference tuple yielded by ``self._get_references()`` one
    999C5 field is added.  When a raw text reference is present it is
    additionally run through refextract and the resulting subfields are
    merged in, tagged with a ``("9", "refextract")`` provenance marker.

    :param rec: the record (BibRecord structure) to add fields to
    """
    for label, ref_type, text_ref, ext_link, authors, year, source, \
            volume, page in self._get_references():
        subfields = []
        if label:
            subfields.append(("o", label))
        if text_ref:
            ref_xml = extract_references_from_string_xml(text_ref)
            dom = parseString(ref_xml)
            datafields = dom.getElementsByTagName("datafield")
            # refextract may produce no datafield at all for a short or
            # unparsable reference string; guard against IndexError
            # instead of crashing, and fall back to the raw subfields.
            if datafields:
                for field in datafields[0].getElementsByTagName("subfield"):
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    subfields.append((code, data))
                subfields.append(("9", "refextract"))
        if ref_type:
            subfields.append(("d", ref_type))
        if text_ref:
            subfields.append(("m", text_ref))
        if ext_link:
            subfields.append(("u", ext_link))
        for author in authors:
            subfields.append(("h", author))
        if year:
            subfields.append(("y", year))
        # Build the "s" (journal,volume,page) subfield from whichever
        # combination of source/volume/page is available.
        if source and volume and page:
            subfields.append(("s", source + "," + volume + "," + page))
        elif source and volume:
            subfields.append(("s", source + "," + volume))
        elif source and page:
            subfields.append(("s", source + "," + page))
        elif source:
            subfields.append(("s", source))
        record_add_field(rec, "999", ind1="C", ind2="5",
                         subfields=subfields)
def extract_references_txt(self, req, form):
    """Extract references from plain text submitted via the form."""
    check_login(req)
    if 'txt' not in form:
        return 'No text specified'
    return extract_references_from_string_xml(form['txt'].value)
def extract_references_txt(self, req, form):
    """Run reference extraction on a posted plain-text field."""
    check_login(req)
    try:
        txt = form['txt'].value
    except KeyError:
        return 'No text specified'
    return extract_references_from_string_xml(txt)
def extract_one(config, pdf_path):
    """Extract references from one file.

    :param config: task options; when ``config.treat_as_reference_section``
        is true, *pdf_path* is treated as a UTF-8 text file that already
        contains only the reference section.
    :param pdf_path: path to the input document
    :return: reference XML produced by refextract
    """
    if config.treat_as_reference_section:
        # Use a context manager so the file handle is closed even if
        # reading or decoding fails (the original leaked the handle).
        with open(pdf_path) as docfile:
            docbody = docfile.read().decode('utf-8')
        out = extract_references_from_string_xml(docbody)
    else:
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        out = extract_references_from_file_xml(pdf_path)
    return out
def extract_one(config, pdf_path):
    """Extract references from one file.

    :param config: task options; when ``config.treat_as_reference_section``
        is true, *pdf_path* is treated as a UTF-8 text file that already
        contains only the reference section.
    :param pdf_path: path to the input document
    :return: reference XML produced by refextract
    """
    if config.treat_as_reference_section:
        # Context manager guarantees the file handle is closed
        # (the original `open(...).read()` leaked it).
        with open(pdf_path) as docfile:
            docbody = docfile.read().decode('utf-8')
        out = extract_references_from_string_xml(docbody)
    else:
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        out = extract_references_from_file_xml(pdf_path)
    return out
def extract_references(self, req, form):
    """References extraction page.

    This page can be used by authors to test their PDFs against our
    references extraction process.
    """
    user_info = collect_user_info(req)

    references_xml = None
    # Dispatch on the POST parameters, in order of precedence:
    # uploaded PDF, arXiv id, URL, raw text.
    if 'pdf' in form and form['pdf'].value:
        references_xml = extract_from_pdf_string(form['pdf'].value)
    elif 'arxiv' in form and form['arxiv'].value:
        arxiv_url = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(arxiv_url)
    elif 'url' in form and form['url'].value:
        references_xml = extract_references_from_url_xml(form['url'].value)
    elif 'txt' in form and form['txt'].value:
        references_xml = extract_references_from_string_xml(form['txt'].value)

    if references_xml:
        out = """ <style type="text/css"> #referenceinp_link { display: none; } </style> """
        out += format_record(0, 'hdref',
                             xml_record=references_xml.encode('utf-8'),
                             user_info=user_info)
    else:
        # Nothing uploaded yet: display the upload form.
        out = self.extract_references_template()

    # Render the page (including header, footer).
    return page(title='References Extractor',
                body=out,
                uid=user_info['uid'],
                req=req)
def extract_references(self, req, form):
    """References extraction page.

    Authors can use this page to test their PDFs against our
    references extraction process.
    """
    user_info = collect_user_info(req)

    # Inspect the POST parameters; the first non-empty one wins.
    if 'pdf' in form and form['pdf'].value:
        pdf_data = form['pdf'].value
        references_xml = extract_from_pdf_string(pdf_data)
    elif 'arxiv' in form and form['arxiv'].value:
        pdf_url = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(pdf_url)
    elif 'url' in form and form['url'].value:
        references_xml = extract_references_from_url_xml(form['url'].value)
    elif 'txt' in form and form['txt'].value:
        references_xml = extract_references_from_string_xml(form['txt'].value)
    else:
        references_xml = None

    if not references_xml:
        # Nothing uploaded yet: show the form allowing us to do so.
        out = self.extract_references_template()
    else:
        out = """ <style type="text/css"> #referenceinp_link { display: none; } </style> """
        out += format_record(0, 'hdref',
                             xml_record=references_xml.encode('utf-8'),
                             user_info=user_info)

    # Render the full page (header and footer included).
    return page(title='References Extractor', body=out,
                uid=user_info['uid'], req=req)
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record.

    The record itself is not updated; the MARC XML of the document
    with updated references is returned (or ``None`` when no
    references were found).

    :param recid: the id of the record
    :param uid: user id owning the cache entry
    :param txt: references in text mode, if supplied
    :param url: fulltext URL to extract references from, if supplied
    """
    # Parse references from whichever source was provided.
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml.encode('utf-8'))

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = \
        get_cache_file_contents(recid, uid)

    references_to_add = record_get_field_instances(
        references[0], tag='999', ind1='C', ind2='5')
    refextract_status = record_get_field_instances(
        references[0], tag='999', ind1='C', ind2='6')

    out_xml = None
    if references_to_add:
        # Swap out any existing 999 fields for the freshly extracted ones.
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        # Serialise the record with its updated references.
        out_xml = record_xml_output(record)

    return out_xml
def extract(self, req, form):
    """References extraction page.

    This page can be used by authors to test their PDFs against our
    references extraction process.
    """
    user_info = collect_user_info(req)

    references_xml = None
    # Handle the POST parameters: pdf upload, arXiv id, URL or raw text.
    if 'pdf' in form and form['pdf'].value:
        references_xml = extract_from_pdf_string(form['pdf'].value.strip())
    elif 'arxiv' in form and form['arxiv'].value:
        arxiv_url = make_arxiv_url(arxiv_id=form['arxiv'].value.strip())
        references_xml = extract_references_from_url_xml(arxiv_url)
    elif 'url' in form and form['url'].value:
        target_url = form['url'].value.strip()
        try:
            references_xml = extract_references_from_url_xml(target_url)
        except (FullTextNotAvailable, ConnectionError, HTTPError, Timeout):
            # Document could not be fetched; fall through to the form.
            references_xml = None
    elif 'txt' in form and form['txt'].value:
        references_xml = extract_references_from_string_xml(
            form['txt'].value.decode('utf-8', 'ignore'))

    if references_xml:
        references_html = format_record(0, 'hdref',
                                        xml_record=references_xml,
                                        user_info=user_info)
        out = docextract_templates.tmpl_web_result(references_html)
    else:
        # Nothing uploaded yet: display the upload form.
        out = docextract_templates.tmpl_web_form()

    # Render the page (including header, footer).
    return page(title='References Extractor', body=out,
                uid=user_info['uid'], req=req)
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record.

    The record itself is not updated; the MARC XML of the document
    with updated references is returned, or ``None`` when extraction
    yielded no references.

    :param recid: the id of the record
    :param uid: user id owning the cache entry
    :param txt: references in text mode, if supplied
    :param url: fulltext URL to extract references from, if supplied
    """
    # Choose the extraction source: explicit text, then URL, then the
    # record's own fulltext.
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml)

    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = \
        get_cache_contents(recid, uid)

    references_to_add = record_get_field_instances(
        references[0], tag='999', ind1='C', ind2='5')
    refextract_status = record_get_field_instances(
        references[0], tag='999', ind1='C', ind2='6')

    out_xml = None
    if references_to_add:
        # Replace the existing 999 fields with the new references plus
        # the refextract status field.
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        out_xml = record_xml_output(record)

    return out_xml
def extract(self, req, form):
    """References extraction page.

    Authors can use this page to test their PDFs against our
    references extraction process.
    """
    user_info = collect_user_info(req)

    # Dispatch on the POST parameters; first non-empty one wins.
    if 'pdf' in form and form['pdf'].value:
        references_xml = extract_from_pdf_string(form['pdf'].value)
    elif 'arxiv' in form and form['arxiv'].value:
        pdf_url = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(pdf_url)
    elif 'url' in form and form['url'].value:
        references_xml = extract_references_from_url_xml(form['url'].value)
    elif 'txt' in form and form['txt'].value:
        decoded_txt = form['txt'].value.decode('utf-8', 'ignore')
        references_xml = extract_references_from_string_xml(decoded_txt)
    else:
        references_xml = None

    if not references_xml:
        # Nothing uploaded yet: show the upload form.
        out = docextract_templates.tmpl_web_form()
    else:
        references_html = format_record(0, 'hdref',
                                        xml_record=references_xml,
                                        user_info=user_info)
        out = docextract_templates.tmpl_web_result(references_html)

    # Render the page (header and footer included).
    return page(title='References Extractor', body=out,
                uid=user_info['uid'], req=req)
def extract(self, req, form):
    """Refrences extraction page This page can be used for authors to
    test their pdfs against our refrences extraction process

    Besides reference extraction, this variant also runs plot
    extraction (pdfplotextractor / LaTeX tarball merging) and renders
    the extracted figures below the references.
    """
    user_info = collect_user_info(req)
    plots = None
    list_image_names = []
    list_caption = []
    # Web-visible directory where extracted plot images are copied.
    plots_dir = os.path.join(CFG_PREFIX, "var/www/img/plots/")
    # unique folder name
    # Handle the 3 POST parameters
    if 'pdf' in form and form['pdf'].value:
        pdf = form['pdf'].value
        references_xml = extract_from_pdf_string(pdf)
        # Persist the uploaded PDF to a temp file so the external
        # plot-extractor binary can read it.
        pdf_string = form['pdf'].file.read()
        pdf = safe_mkstemp('extract.pdf')
        f = open(pdf, 'w')
        f.write(pdf_string)
        f.close()
        plots = 'File pdf: ' + str(pdf) + '<br />'
        # NOTE(review): shell command built by string concatenation —
        # paths containing spaces/metacharacters would break this.
        (exit_code, output_buffer, stderr_output_buffer) = \
            run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' ' + pdf)
        plotextracted_pdf_path = pdf + ".extracted/extracted.json"
        code, figures, extracted = merging_articles(None,
                                                    plotextracted_pdf_path)
        id_fulltext = ""
        marc_path = create_MARCXML(figures, id_fulltext, code, extracted,
                                   write_file=True)
        plots += marc_path + '<br />'
        f = open(marc_path, 'r')
        record_xml = f.read()
        f.close()
        #plots_dir = "/opt/invenio/var/www/img/plots/"
        # Recreate the plots directory from scratch for this request.
        if os.path.exists(plots_dir):
            shutil.rmtree(plots_dir)
        os.mkdir(plots_dir)
        re_list = REGEXP_RECORD.findall(record_xml)
        for r in re_list:
            re_subfield = REGEXP_SUBFIELD_A.findall(r)
            for index, image_path in enumerate(re_subfield):
                # Only the first subfield of each record is an image path.
                if index == 0:
                    run_shell_command('cp ' + image_path + ' ' + plots_dir)
    elif 'arxiv' in form and form['arxiv'].value:
        plots = ""
        url_pdf = make_arxiv_url(arxiv_id=form['arxiv'].value)
        references_xml = extract_references_from_url_xml(url_pdf)
        # Also fetch the LaTeX source tarball for plot extraction.
        url_tarball = make_arxiv_tar_url(arxiv_id=form['arxiv'].value)
        plotextracted_xml_path, plotextracted_pdf_path = \
            extract_plots_from_latex_and_pdf(url_tarball, url_pdf)
        plots += 'TAR: ' + plotextracted_xml_path + '<br />'
        plots += 'PDF: ' + plotextracted_pdf_path + '<br />'
        # NOTE(review): the following triple-quoted string is dead
        # commented-out code kept verbatim from the original.
        ''' code, figures, extracted = merging_latex_pdf(plotextracted_xml_path, None, "", ) id_fulltext = "" marc_path = create_MARCXML(figures, id_fulltext, code, extracted, write_file=True) '''
        dest_dir = os.path.join(CFG_TMPDIR, 'textmining')
        try:
            os.mkdir(dest_dir)
        except OSError:
            # Directory already exists — that is fine.
            pass
        code, message, figures, marc_path = merging_latex_pdf(
            plotextracted_xml_path, "", "", dest_dir)
        plots += 'OUTPUT: ' + marc_path + '<br />'
        f = open(marc_path, 'r')
        record_xml = f.read()
        f.close()
        # Recreate the plots directory from scratch for this request.
        if os.path.exists(plots_dir):
            shutil.rmtree(plots_dir)
        os.mkdir(plots_dir)
        re_list = REGEXP_RECORD.findall(record_xml)
        for r in re_list:
            re_subfield = REGEXP_SUBFIELD_A.findall(r)
            re_subfield_caption = REGEXP_SUBFIELD_D.findall(r)
            for index, image_path in enumerate(re_subfield):
                if index == 0:
                    run_shell_command('cp ' + image_path + ' ' + plots_dir)
                # Record name/caption pairs for rendering below.
                list_image_names.append(os.path.split(image_path)[1])
                list_caption.append(re_subfield_caption[index])
    elif 'url' in form and form['url'].value:
        url = form['url'].value
        references_xml = extract_references_from_url_xml(url)
        plots = "ME3"
    elif 'txt' in form and form['txt'].value:
        txt = form['txt'].value
        references_xml = extract_references_from_string_xml(txt)
    else:
        references_xml = None
    # If we have not uploaded anything yet
    # Display the form that allows us to do so
    if not references_xml:
        out = self.extract_references_template()
    else:
        out = """ <style type="text/css"> #referenceinp_link { display: none; } /*img.plot { width: 250px; height: 250px; }*/ </style> """
        out += format_record(0, 'hdref',
                             xml_record=references_xml.encode('utf-8'),
                             user_info=user_info)
        if plots:
            out += "<h2>Plots</h2>"
            out += plots
            # Render each copied image with its caption (when known).
            dirList = os.listdir(plots_dir)
            for i, fname in enumerate(dirList):
                out += '<h3>Figure ' + str(i+1) + '</h3> <p><img src="/img/plots/' + fname + '" class="plot"></p>'
                index = list_image_names.index(fname)
                out += '<p>' + list_caption[index] + '</p>'
    # Render the page (including header, footer)
    return page(title='Document Extractor',
                body=out,
                uid=user_info['uid'],
                req=req)
def _add_references(self, xml_doc, rec):
    """Add the references found in *xml_doc* to *rec* as MARC 999C5 fields.

    Two code paths exist: the CONSYN feed path (reference tuples carry
    journal/publisher/editor metadata) and the legacy path with a
    shorter tuple layout.
    """
    if self.CONSYN:
        for label, authors, doi, issue, page, title, volume, year,\
                textref, ext_link, isjournal, comment, journal, publisher,\
                editors, book_title in self.get_references(xml_doc):
            subfields = []
            if textref and not authors:
                # No structured authors: run the raw reference text
                # through refextract and keep its subfields.
                textref = textref.replace('\"', '\'')
                ref_xml = extract_references_from_string_xml(textref)
                dom = xml.dom.minidom.parseString(ref_xml)
                fields = dom.getElementsByTagName("datafield")[0]
                fields = fields.getElementsByTagName("subfield")
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 's':
                        # Normalise the journal part of the
                        # "journal,volume,page" subfield via the
                        # mapping table; fall back to the raw data
                        # when it does not split into enough parts.
                        try:
                            journal = data.split(',')[0]
                            journal, vol = fix_journal_name(
                                journal, self.journal_mappings)
                            vol += data.split(',')[1]
                            try:
                                page = data.split(',')[2]
                                journal = journal + "," + vol + "," + page
                                subfields.append(('s', journal))
                            except IndexError:
                                # No page component present.
                                journal = journal + "," + vol
                                subfields.append(('s', journal))
                        except IndexError:
                            # Not in "journal,volume[,page]" form.
                            subfields.append(('s', data))
                    else:
                        subfields.append((code, data))
                if label:
                    # Strip brackets, dots and closing parens from the label.
                    label = re.sub("[\[\].)]", "", label)
                    subfields.append(('o', label))
                if subfields:
                    record_add_field(rec, '999', ind1='C', ind2='5',
                                     subfields=subfields)
            else:
                # Structured reference metadata is available; map each
                # element to its MARC subfield code.
                if doi:
                    subfields.append(('a', doi))
                for author in authors:
                    subfields.append(('h', author))
                if issue:
                    subfields.append(('n', issue))
                if ext_link:
                    subfields.append(('r', ext_link))
                if title:
                    subfields.append(('t', title))
                elif textref:
                    subfields.append(('m', textref))
                if publisher:
                    subfields.append(('p', publisher))
                if volume:
                    subfields.append(('v', volume))
                if year:
                    subfields.append(('y', year))
                if comment:
                    subfields.append(('m', comment))
                for editor in editors:
                    subfields.append(('e', editor))
                if book_title:
                    subfields.append(('q', book_title))
                if label:
                    label = re.sub("[\[\].)]", "", label)
                    subfields.append(('o', label))
                if journal:
                    # Normalise the journal name and merge the mapped
                    # volume prefix with the reference's own volume.
                    journal, vol = fix_journal_name(journal,
                                                    self.journal_mappings)
                    volume = vol + volume
                    if volume and page:
                        journal = journal + "," + volume + "," + page
                        subfields.append(('s', journal))
                    elif volume:
                        journal = journal + "," + volume
                        subfields.append(('s', journal))
                    else:
                        subfields.append(('s', journal))
                if subfields:
                    record_add_field(rec, '999', ind1='C', ind2='5',
                                     subfields=subfields)
    else:
        # Legacy (non-CONSYN) tuple layout.
        for label, authors, doi, issue, page, title, volume, year,\
                textref, ext_link in self.get_references(xml_doc):
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if page:
                subfields.append(('p', page))
            if ext_link:
                subfields.append(('r', ext_link))
            if title and volume and year and page:
                subfields.append(
                    ('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif textref:
                subfields.append(('m', textref))
            if title:
                subfields.append(('t', title))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            if subfields:
                record_add_field(
                    rec, '999', ind1='C', ind2='5', subfields=subfields)
def _get_record(self, link):
    """Scrape one Springer chapter page and build its MARC record.

    :param link: a parsed HTML node containing the ``<a>`` element
        whose ``href`` points (relative to ``self.base_url``) at the
        chapter's abstract page.
    :return: the root ``record`` DOM element of the generated MARC XML,
        with refextract-processed 999 fields appended.

    Side effect: sets ``self.content`` to the page's main content div
    (``self._find`` presumably searches within it — TODO confirm).
    """
    link = link.find('a')['href']
    url = urlparse.urljoin(self.base_url, link)
    page = urllib2.urlopen(url)
    page = BeautifulSoup(page)
    self.content = page.body.find('div', attrs={'id': 'content'})
    publication_title = self.content.find('div',
                                          {'id': 'publication-title'})
    if publication_title:
        publication_title = publication_title.find('a').text
    else:
        publication_title = ''
    series_title = self._find('a', {'id': 'series-title'})
    # Normalise this one series name to its INSPIRE abbreviation.
    if series_title == 'NATO Science Series':
        series_title = 'NATO Sci.Ser.'
    title = self._find('h1', {'id': 'title'})
    # Volume: keep digits only; fall back to the volume-range element.
    volume = self._find('span', {'id': 'book-volume'})
    if volume:
        volume = re.sub(r'\D', '', volume)
    else:
        volume = self._find('span', {'id': 'volume-range'})
        volume = re.sub(r'\D', '', volume)
    issue = self._find('a', {'id': 'issue-range'})
    if issue:
        issue = issue.split()[1]
    # Year: digits from the copyright year, else first 4 digits of the
    # cover date.
    year = self._find('span', {'id': 'copyright-year'})
    year = re.sub(r'\D', '', year)
    if not year:
        year = self._find('dd', {'id': 'abstract-about-cover-date'})
        year = re.sub(r'\D', '', year)[:4]
    abstract = self._find('div', {'class': 'abstract-content formatted'})
    page_range = self._find('span', {'id': 'page-range'})
    if page_range:
        page_range = page_range.replace('pp', '').strip()
    # NOTE(review): publisher/issn/subtitle/isbn values are collected
    # below but never added to the record.
    publisher = self._find('dd', {'id': 'abstract-about-publisher'})
    copyright_holder = self._find(
        'dd', {'id': 'abstract-about-book-copyright-holder'})
    issn = self._find('dd',
                      {'id': 'abstract-about-book-series-print-issn'})
    doi = self._find('dd', {'class': 'doi'})
    subtitle = self._find('dd',
                          {'id': 'abstract-about-book-series-subtitle'})
    online_isbn = self._find('dd',
                             {'id': 'abstract-about-book-online-isbn'})
    print_isbn = self._find('dd',
                            {'id': 'abstract-about-book-print-isbn'})
    # Collect editors and their affiliations (empty string when the
    # <sup> marker or its title attribute is missing).
    editors = []
    editors_affiliations = []
    for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
        editors.append(editor.find('a').text)
        try:
            editors_affiliations.append(editor.find('sup')['title'])
        except KeyError:
            editors_affiliations.append('')
        except TypeError:
            editors_affiliations.append('')
    # Collect authors, reordered to "Surname, Given Names" with
    # collapsed initials, plus their affiliations.
    authors = []
    authors_affiliations = []
    summary = self.content.find('div', attrs={'class': 'summary'})
    for author in summary.findAll('li', attrs={'itemprop': 'author'}):
        author_name = author.find('a').text
        author_names = []
        author_names.append(author_name.split()[-1] + ",")
        author_names += author_name.split()[:-1]
        author_name = " ".join(author_names)
        author_name = collapse_initials(author_name)
        authors.append(author_name)
        try:
            authors_affiliations.append(author.find('sup')['title'])
        except KeyError:
            authors_affiliations.append('')
        except TypeError:
            authors_affiliations.append('')
    # Fulltext PDF link (TypeError when the anchor is absent).
    try:
        attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
        fulltext = self.content.find('a', attrs=attrs)
        fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
    except TypeError:
        fulltext = ''
    #create marc record
    rec = {}
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    # First author goes to 100, the rest to 700.
    first_author = True
    for i in range(len(authors)):
        subfields = [('a', '%s' % (authors[i]))]
        if authors_affiliations[i]:
            subfields.append(('v', authors_affiliations[i]))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'Springer')])
    if copyright_holder:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_holder), ('g', year)])
    if not series_title:
        series_title = publication_title
    # 773: host item (series, volume, issue, pages, year).
    subfields = []
    if series_title:
        subfields.append(('p', series_title))
    if volume:
        subfields.append(('v', volume))
    if issue:
        subfields.append(('n', issue))
    if page_range:
        subfields.append(('c', page_range))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'BookChapter')])
    if fulltext:
        record_add_field(rec, 'FFT', subfields=[('a', fulltext),
                                                ('t', 'Springer'),
                                                ('d', 'Fulltext')])
    recordString = record_xml_output(rec)
    #removes whitespaces except spaces
    recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
    #removes two or more consecutive spaces
    recordString = re.sub(r' {2,}', '', recordString)
    record = parseString(recordString)
    # Run each scraped reference through refextract and append the
    # resulting 999 datafields to the record DOM.
    references = []
    ref_fields = []
    references_container = self.content.find(
        'div', attrs={'id': 'abstract-references'})
    if references_container:
        references = references_container.findAll('li')
    for reference in references:
        ref = xml_to_text(parseString(reference.decode()))
        #removes the space between hep-th/ and the identifier
        ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
        ref = extract_references_from_string_xml(ref)
        ref = parseString(ref)
        for field in ref.childNodes:
            # Clean the "m" (misc text) subfield: drop arXiv:/CrossRef
            # noise; remove the subfield entirely if nothing remains.
            for subfield in field.getElementsByTagName('subfield'):
                if subfield.getAttribute('code') == 'm':
                    text = subfield.firstChild.data
                    text = re.sub(r'\[?arXiv:', '', text)
                    text = text.replace('CrossRef', '')
                    if text.startswith(': '):
                        text = text[2:]
                    if text:
                        subfield.firstChild.data = text
                    else:
                        parentNode = subfield.parentNode
                        parentNode.removeChild(subfield)
            ref_fields.append(field.firstChild)
    for field in ref_fields:
        record.firstChild.appendChild(field)
    return record.firstChild
def _get_record(self, link):
    """Scrape one Springer chapter page and build its MARC record.

    Variant with extra fallbacks (ChapterTitle, chapter copyright year,
    chapter page ranges) and a graceful degradation to a plain 999
    "m" subfield when the refextract API is not importable.

    :param link: parsed HTML node whose ``<a href>`` points (relative
        to ``self.base_url``) at the chapter's abstract page.
    :return: the root ``record`` DOM element of the generated MARC XML.

    Side effect: sets ``self.content`` to the page's main content div
    (``self._find`` presumably searches within it — TODO confirm).
    """
    link = link.find('a')['href']
    url = urlparse.urljoin(self.base_url, link)
    page = urllib2.urlopen(url)
    page = BeautifulSoup(page)
    self.content = page.body.find('div', attrs={'id': 'content'})
    publication_title = self.content.find('div',
                                          {'id': 'publication-title'})
    if publication_title:
        publication_title = publication_title.find('a').text
    else:
        publication_title = ''
    series_title = self._find('a', {'id': 'series-title'})
    # Normalise this one series name to its INSPIRE abbreviation.
    if series_title == 'NATO Science Series':
        series_title = 'NATO Sci.Ser.'
    title = self._find('h1', {'id': 'title'})
    if not title:
        title = self._find('h1', {'class': 'ChapterTitle'})
    # Volume: keep digits only; fall back to the volume-range element.
    volume = self._find('span', {'id': 'book-volume'})
    if volume:
        volume = re.sub(r'\D', '', volume)
    else:
        volume = self._find('span', {'id': 'volume-range'})
        volume = re.sub(r'\D', '', volume)
    issue = self._find('a', {'id': 'issue-range'})
    if issue:
        issue = issue.split()[1]
    # Year: copyright year, then chapter copyright year, then the
    # first 4 digits of the cover date.
    year = self._find('span', {'id': 'copyright-year'})
    if not year:
        year = self._find(
            'dd', {'id': 'abstract-about-book-chapter-copyright-year'})
    year = re.sub(r'\D', '', year)
    if not year:
        year = self._find('dd', {'id': 'abstract-about-cover-date'})
        year = re.sub(r'\D', '', year)[:4]
    abstract = self._find('div', {'class': 'abstract-content formatted'})
    page_range = self._find('span', {'id': 'page-range'})
    if not page_range:
        page_range = self._find(
            'dd', {'id': 'abstract-about-book-chapter-page-ranges'})
    if page_range:
        page_range = page_range.replace('pp', '').strip()
    #publisher = self._find('dd', {'id': 'abstract-about-publisher'})
    copyright_holder = self._find(
        'dd', {'id': 'abstract-about-book-copyright-holder'})
    #issn = self._find('dd', {'id': 'abstract-about-book-series-print-issn'})
    doi = self._find('dd', {'class': 'doi'})
    #subtitle = self._find('dd', {'id': 'abstract-about-book-series-subtitle'})
    #online_isbn = self._find('dd', {'id': 'abstract-about-book-online-isbn'})
    #print_isbn = self._find('dd', {'id': 'abstract-about-book-print-isbn'})
    # Collect editors and their affiliations (empty string when the
    # <sup> marker or its title attribute is missing).
    editors = []
    editors_affiliations = []
    for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
        editors.append(editor.find('a').text)
        try:
            editors_affiliations.append(editor.find('sup')['title'])
        except KeyError:
            editors_affiliations.append('')
        except TypeError:
            editors_affiliations.append('')
    # Collect authors, reordered to "Surname, Given Names" with
    # collapsed initials, plus their affiliations.
    authors = []
    authors_affiliations = []
    summary = self.content.find('div', attrs={'class': 'summary'})
    for author in summary.findAll('li', attrs={'itemprop': 'author'}):
        author_name = author.find('a').text
        author_names = []
        author_names.append(author_name.split()[-1] + ",")
        author_names += author_name.split()[:-1]
        author_name = " ".join(author_names)
        author_name = collapse_initials(author_name)
        authors.append(author_name)
        try:
            authors_affiliations.append(author.find('sup')['title'])
        except KeyError:
            authors_affiliations.append('')
        except TypeError:
            authors_affiliations.append('')
    # Fulltext PDF link (TypeError when the anchor is absent).
    try:
        attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
        fulltext = self.content.find('a', attrs=attrs)
        fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
    except TypeError:
        fulltext = ''
    #create Marc record
    rec = create_record()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    # First author goes to 100, the rest to 700.
    first_author = True
    for i in range(len(authors)):
        subfields = [('a', '%s' % (authors[i]))]
        if authors_affiliations[i]:
            subfields.append(('v', authors_affiliations[i]))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'Springer')])
    if copyright_holder:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_holder), ('g', year)])
    if not series_title:
        series_title = publication_title
    # 773: host item (series, volume, issue, pages, year).
    subfields = []
    if series_title:
        subfields.append(('p', series_title))
    if volume:
        subfields.append(('v', volume))
    if issue:
        subfields.append(('n', issue))
    if page_range:
        subfields.append(('c', page_range))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'BookChapter')])
    if fulltext:
        record_add_field(rec, 'FFT', subfields=[('a', fulltext),
                                                ('t', 'Springer'),
                                                ('d', 'Fulltext')])
    recordString = record_xml_output(rec)
    #removes whitespace except spaces
    recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
    #removes two or more consecutive spaces
    recordString = re.sub(r' {2,}', '', recordString)
    record = parseString(recordString)
    # Run each scraped reference through refextract and append the
    # resulting 999 datafields; when refextract cannot be imported,
    # store the raw reference text as a 999C5 "m" subfield instead.
    references = []
    ref_fields = []
    references_container = self.content.find(
        'div', attrs={'id': 'abstract-references'})
    if references_container:
        references = references_container.findAll('li')
    for reference in references:
        try:
            from invenio.refextract_api import (
                extract_references_from_string_xml)
            ref = xml_to_text(parseString(reference.decode()))
            #removes the space between hep-th/ and the identifier
            ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
            ref = extract_references_from_string_xml(ref)
            ref = parseString(ref)
            for field in ref.childNodes:
                # Clean the "m" (misc text) subfield: drop arXiv:/
                # CrossRef noise; remove it entirely if empty after.
                for subfield in field.getElementsByTagName('subfield'):
                    if subfield.getAttribute('code') == 'm':
                        text = subfield.firstChild.data
                        text = re.sub(r'\[?arXiv:', '', text)
                        text = text.replace('CrossRef', '')
                        if text.startswith(': '):
                            text = text[2:]
                        if text:
                            subfield.firstChild.data = text
                        else:
                            parentNode = subfield.parentNode
                            parentNode.removeChild(subfield)
                ref_fields.append(field.firstChild)
        except ImportError:
            record_add_field(rec, '999', ind1='C', ind2='5',
                            subfields=[('m', reference.decode())])
    for field in ref_fields:
        record.firstChild.appendChild(field)
    return record.firstChild
def get_record_rich(self, filename):
    """ Gets the Marc xml of the files in xaml_rich directory

    :param fileName: the name of the file to parse.
    :type fileName: string

    :returns: a string with the marc xml version of the file.
    """
    self.document = parse(filename)
    rec = create_record()
    articles = self.document.getElementsByTagName("ArticleID")
    for article in articles:
        article_type = article.getAttribute("Type")
        # Anything that is not a regular article is skipped entirely.
        if not article_type == "Article":
            return ""
        doi = get_value_in_tag(self.document, "DOI")
        # Publication date: prefer the "Accepted" date, fall back to
        # the "OnlineDate".
        date = ""
        for tag in self.document.getElementsByTagName("Accepted"):
            year = get_value_in_tag(tag, "Year")
            month = get_value_in_tag(tag, "Month").zfill(2)
            day = get_value_in_tag(tag, "Day").zfill(2)
            date = "%s-%s-%s" % (year, month, day)
        if not date:
            for tag in self.document.getElementsByTagName("OnlineDate"):
                year = get_value_in_tag(tag, "Year")
                month = get_value_in_tag(tag, "Month").zfill(2)
                day = get_value_in_tag(tag, "Day").zfill(2)
                date = "%s-%s-%s" % (year, month, day)
        first_page = get_value_in_tag(article, "FirstPage")
        last_page = get_value_in_tag(article, "LastPage")
        subjects = article.getElementsByTagName("Keyword")
        subjects = map(xml_to_text, subjects)
        subject = ", ".join(subjects)
        copyright_statement = get_value_in_tag(article, "Copyright")
    # Journal name is normalised via the mapping table; the mapped
    # volume prefix is then extended with the issue's volume number.
    journal = get_value_in_tag(self.document, "JournalTitle")
    journal, volume = fix_journal_name(journal, self.journal_mappings)
    issues = self.document.getElementsByTagName("IssueID")
    for issue in issues:
        volume += get_value_in_tag(issue, "Volume")
        year = get_value_in_tag(issue, "Year")
    title = get_value_in_tag(self.document, "Title")
    authors = self.document.getElementsByTagName("Author")
    affiliations = self.document.getElementsByTagName("Affiliation")

    # Maps an Affiliation node to an (ID, text) pair.
    def affiliation_pair(a):
        return a.getAttribute("ID"), get_value_in_tag(
            a, "UnstructuredAffiliation")

    affiliations = map(affiliation_pair, affiliations)
    affiliations = dict(affiliations)

    # Maps an Author node to a ("Surname, Given [Middle]", affiliation)
    # pair; missing AffiliationID or unknown label yields "".
    def author_pair(a):
        surname = get_value_in_tag(a, "LastName")
        first_name = get_value_in_tag(a, "FirstName")
        middle_name = get_value_in_tag(a, "MiddleName")
        if middle_name:
            name = "%s, %s %s" % (surname, first_name, middle_name)
        else:
            name = "%s, %s" % (surname, first_name)
        try:
            affid = a.getElementsByTagName(
                "AffiliationID")[0].getAttribute("Label")
            affiliation = affiliations[affid]
        except IndexError:
            affiliation = ""
        except KeyError:
            affiliation = ""
        return name, affiliation

    authors = map(author_pair, authors)
    abstract = get_value_in_tag(self.document, "Abstract")
    # References: structured Biblioset parts become y/s subfields; the
    # remaining free text is run through refextract.
    references = self.document.getElementsByTagName("Bibliomixed")
    for reference in references:
        subfields = []
        label = reference.getAttribute("N")
        if label:
            subfields.append(("o", label))
        bibliosets = reference.getElementsByTagName("Biblioset")
        for tag in bibliosets:
            ref_year = get_value_in_tag(tag, "Date")
            ref_journal = get_value_in_tag(tag, "JournalShortTitle")
            ref_journal, ref_volume = fix_journal_name(
                ref_journal, self.journal_mappings)
            ref_volume += get_value_in_tag(tag, "Volume")
            ref_page = get_value_in_tag(tag, "ArtPageNums")
            if ref_year:
                subfields.append(("y", ref_year))
            if ref_journal and ref_volume and ref_page:
                subfields.append(("s", "%s,%s,%s" % (ref_journal,
                                                     ref_volume,
                                                     ref_page)))
            # Remove the structured part so only free text remains.
            reference.removeChild(tag)
        text_ref = xml_to_text(reference)
        ref_xml = extract_references_from_string_xml(text_ref)
        dom = parseString(ref_xml)
        fields = dom.getElementsByTagName("datafield")[0]
        fields = fields.getElementsByTagName("subfield")
        if fields:
            subfields.append(("9", "refextract"))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                # Skip the raw-text subfield when structured data
                # already covered it.
                if code == "m" and bibliosets:
                    continue
                else:
                    subfields.append((code, data))
        if subfields:
            record_add_field(rec, "999", ind1="C", ind2="5",
                             subfields=subfields)
    if title:
        record_add_field(rec, "245", subfields=[("a", title)])
    if date:
        record_add_field(rec, "260", subfields=[("c", date),
                                                ("t", "published")])
    if doi:
        record_add_field(rec, "024", ind1="7", subfields=[("a", doi),
                                                          ("2", "DOI")])
    if abstract:
        record_add_field(rec, "520", subfields=[("a", abstract),
                                                ("9", "EDPSciences")])
    # First author goes to 100, the rest to 700.
    first_author = True
    for author in authors:
        if first_author:
            subfields = [("a", author[0])]
            if author[1]:
                subfields.append(("v", author[1]))
            record_add_field(rec, "100", subfields=subfields)
            first_author = False
        else:
            subfields = [("a", author[0])]
            if author[1]:
                subfields.append(("v", author[1]))
            record_add_field(rec, "700", subfields=subfields)
    subfields = []
    if journal and volume and first_page:
        subfields.append(("s", "%s,%s,%s" % (journal, volume, first_page)))
    if first_page and last_page:
        try:
            # NOTE(review): computed as last - first (not +1) and the
            # variable is misspelled; kept verbatim — confirm intent.
            nuber_of_pages = int(last_page) - int(first_page)
            record_add_field(rec, "300",
                             subfields=[("a", str(nuber_of_pages))])
        except ValueError:
            # Non-numeric page numbers: skip the page count field.
            pass
        subfields.append(("c", "%s-%s" % (first_page, last_page)))
    if year:
        subfields.append(("y", year))
    record_add_field(rec, "773", subfields=subfields)
    record_add_field(rec, "980", subfields=[("a", "HEP")])
    if copyright_statement:
        record_add_field(rec, "542",
                         subfields=[("f", copyright_statement)])
    if subject:
        record_add_field(rec, "650", ind1="1", ind2="7",
                         subfields=[("2", "EDPSciences"), ("a", subject)])
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        message = "Found a bad char in the file for the article " + doi
        sys.stderr.write(message)
        return ""