def test_collapse_initials(self):
    """Check that collapse_initials produces the expected compact form."""
    # (input, expected) pairs, asserted in order; the first failure aborts.
    expectations = (
        ("T. A. Adams", "T.A. Adams"),
        ("T.-A. Adams", "T.A. Adams"),
        ("T. A. Adams", "T.A. Adams"),
        ("T. A.", "T.A."),
        ("T. A. V. Adams", "T.A.V. Adams"),
    )
    for raw_name, collapsed in expectations:
        self.assertEqual(collapse_initials(raw_name), collapsed)
def _get_authors(self):
    """Return ``(name, affiliation)`` tuples for the document's authors.

    Every ``<aff>`` element is indexed by its ``id`` attribute, with the
    first whitespace-separated token of its text dropped (presumably the
    affiliation label -- confirm against sample data).  Each ``<contrib>``
    whose ``contrib-type`` is ``author`` is paired with the affiliation
    referenced by its last ``<xref ref-type="aff">``; when that ``rid``
    lists several ids only the first one is used.  An author whose ``rid``
    matches no ``<aff>`` gets an empty affiliation string.
    """
    aff_by_id = {}
    for aff_node in self.document.getElementsByTagName('aff'):
        words = xml_to_text(aff_node).split()
        aff_by_id[aff_node.getAttribute('id')] = ' '.join(words[1:])

    result = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        ref_id = ''
        for xref in contrib.getElementsByTagName('xref'):
            if xref.getAttribute('ref-type') != 'aff':
                continue
            ref_id = xref.getAttribute('rid')
            ids = ref_id.split()
            if len(ids) > 1:
                # Several ids in one rid: keep only the first.
                ref_id = ids[0]
        given_names = collapse_initials(get_value_in_tag(contrib, 'given-names'))
        surname = get_value_in_tag(contrib, 'surname')
        full_name = "%s, %s" % (surname, given_names)
        result.append((full_name, aff_by_id.get(ref_id, '')))
    return result
def _get_references(self):
    """Yield one tuple per ``<ref>`` element of the document.

    Each yielded tuple is
    ``(label, ref_type, text_ref, ext_link, authors, year, source, volume,
    page)`` where:

    * ``label`` is the ``id`` attribute with every non-digit removed;
    * ``ref_type`` is the ``publication-type`` of the last
      ``<mixed-citation>`` child, or ``''`` when the reference has none;
    * ``text_ref`` / ``ext_link`` are only filled for thesis, conf-proc,
      other, web and book citations and default to ``''``;
    * ``authors`` is a list of ``"surname, given-names"`` strings built
      from the ``<string-name>`` children;
    * for journal citations ``source`` is passed through
      ``fix_journal_name`` and any volume prefix it returns is prepended
      to ``volume``.
    """
    for ref in self.document.getElementsByTagName('ref'):
        label = sub(r'\D', '', ref.getAttribute('id'))
        # Reset for every reference.  Without this a <ref> lacking a
        # <mixed-citation> either raised UnboundLocalError (first reference)
        # or silently reused the type of the previous reference.
        ref_type = ''
        text_ref = ''
        ext_link = ''
        for mixed in ref.getElementsByTagName('mixed-citation'):
            ref_type = mixed.getAttribute('publication-type')
            if ref_type in ('thesis', 'conf-proc'):
                text_ref = get_value_in_tag(ref, 'mixed-citation')
            elif ref_type in ('other', 'web'):
                text_ref = get_value_in_tag(ref, 'mixed-citation')
                ext_link = get_value_in_tag(mixed, 'ext-link')
            elif ref_type == 'book':
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName('string-name'):
            surname = get_value_in_tag(auth, 'surname')
            given_names = get_value_in_tag(auth, 'given-names')
            given_names = collapse_initials(given_names)
            authors.append('%s, %s' % (surname, given_names))
        year = get_value_in_tag(ref, 'year')
        source = get_value_in_tag(ref, 'source')
        volume = get_value_in_tag(ref, 'volume')
        page = get_value_in_tag(ref, 'fpage')
        if ref_type == 'journal':
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                # fix_journal_name may return a volume prefix to prepend.
                volume = vol + volume
        yield (label, ref_type, text_ref, ext_link, authors, year,
               source, volume, page)
def _get_authors(self):
    """Return ``(name, affiliation_ids, corresp_ids)`` for each author.

    ``name`` is ``"surname, given-names"`` with the given names run through
    ``collapse_initials``.  The two id lists are filled from the ``rid``
    attributes of the contributor's ``<xref>`` children.
    """
    collected = []
    for contrib in self.document.getElementsByTagName('contrib'):
        # Springer wraps collaborations in an additional "contrib" element.
        # Skipping any "contrib" that itself contains "contrib" children
        # avoids a fake author carrying every affiliation.
        if contrib.getElementsByTagName('contrib'):
            continue
        if contrib.getElementsByTagName('collab'):
            continue
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        surname = get_value_in_tag(contrib, 'surname')
        given_names = collapse_initials(
            get_value_in_tag(contrib, 'given-names'))
        full_name = '%s, %s' % (surname, given_names)
        aff_ids = []
        note_ids = []
        for xref in contrib.getElementsByTagName('xref'):
            kind = xref.getAttribute('ref-type')
            if kind == 'aff':
                # Ids starting with "a" count as affiliations, ids starting
                # with "n" as correspondence notes; anything else is ignored.
                for rid in xref.getAttribute('rid').split():
                    lowered = rid.lower()
                    if lowered.startswith('a'):
                        aff_ids.append(rid)
                    elif lowered.startswith('n'):
                        note_ids.append(rid)
            elif kind in ('corresp', 'author-notes'):
                note_ids.extend(xref.getAttribute('rid').split())
        collected.append((full_name, aff_ids, note_ids))
    return collected
def _get_authors(self):
    """Return ``(name, affiliation_ids, corresp_ids)`` for each author.

    ``name`` is ``"surname, given-names"`` with the given names run through
    ``collapse_initials``; both id lists come from the ``rid`` attributes
    of the contributor's ``<xref>`` children.
    """
    found = []
    for node in self.document.getElementsByTagName('contrib'):
        # Springer puts collaborations in an extra "contrib" element; a
        # "contrib" holding nested "contrib" elements is skipped so that no
        # fake author with all affiliations is produced.
        if node.getElementsByTagName('contrib'):
            continue
        if node.getAttribute('contrib-type') != 'author':
            continue
        surname = get_value_in_tag(node, 'surname')
        given_names = get_value_in_tag(node, 'given-names')
        given_names = collapse_initials(given_names)
        affiliation_ids = []
        corresp_ids = []
        for xref in node.getElementsByTagName('xref'):
            ref_kind = xref.getAttribute('ref-type')
            if ref_kind == 'aff':
                for rid in xref.getAttribute('rid').split():
                    first_letter_source = rid.lower()
                    if first_letter_source.startswith('a'):
                        affiliation_ids.append(rid)
                    elif first_letter_source.startswith('n'):
                        corresp_ids.append(rid)
            elif ref_kind in ('corresp', 'author-notes'):
                corresp_ids.extend(xref.getAttribute('rid').split())
        found.append(
            ('%s, %s' % (surname, given_names), affiliation_ids, corresp_ids))
    return found
def _get_references(self):
    """Yield one tuple per ``<ref>`` element of the document.

    The tuple layout is
    ``(label, ref_type, text_ref, ext_link, authors, year, source, volume,
    page)``.  ``label`` is the ``id`` attribute stripped of non-digits,
    ``ref_type`` is the ``publication-type`` of the last
    ``<mixed-citation>`` child (``""`` when there is none), and ``authors``
    is a list of ``"surname, given-names"`` strings.  For journal citations
    the source is normalised with ``fix_journal_name`` and any volume prefix
    it returns is prepended to ``volume``.
    """
    for ref in self.document.getElementsByTagName("ref"):
        label = sub(r"\D", "", ref.getAttribute("id"))
        # Must be reset per reference: otherwise a <ref> with no
        # <mixed-citation> raises UnboundLocalError on the first reference
        # or inherits the previous reference's type on later ones.
        ref_type = ""
        text_ref = ""
        ext_link = ""
        for mixed in ref.getElementsByTagName("mixed-citation"):
            ref_type = mixed.getAttribute("publication-type")
            if ref_type in ("thesis", "conf-proc"):
                text_ref = get_value_in_tag(ref, "mixed-citation")
            elif ref_type in ("other", "web"):
                text_ref = get_value_in_tag(ref, "mixed-citation")
                ext_link = get_value_in_tag(mixed, "ext-link")
            elif ref_type == "book":
                text_ref = xml_to_text(mixed)
        authors = []
        for auth in ref.getElementsByTagName("string-name"):
            surname = get_value_in_tag(auth, "surname")
            given_names = get_value_in_tag(auth, "given-names")
            given_names = collapse_initials(given_names)
            authors.append("%s, %s" % (surname, given_names))
        year = get_value_in_tag(ref, "year")
        source = get_value_in_tag(ref, "source")
        volume = get_value_in_tag(ref, "volume")
        page = get_value_in_tag(ref, "fpage")
        if ref_type == "journal":
            source, vol = fix_journal_name(source, self.journal_mappings)
            if vol:
                # fix_journal_name may return a volume prefix to prepend.
                volume = vol + volume
        yield (
            label,
            ref_type,
            text_ref,
            ext_link,
            authors,
            year,
            source,
            volume,
            page,
        )
def _get_authors(self):
    """Return ``"lastname, givennames"`` strings for every ``dc:creator``.

    The last whitespace-separated word of the creator text is taken as the
    last name and the remaining words as given names.  Both go through
    ``fix_name_capitalization`` and the given names are then passed to
    ``collapse_initials``.
    """
    formatted = []
    for creator in self.document.getElementsByTagName('dc:creator'):
        words = xml_to_text(creator).split()
        last, given = fix_name_capitalization(words[-1], words[:-1])
        formatted.append("%s, %s" % (last, collapse_initials(given)))
    return formatted
def _get_authors(self):
    """Return ``"lastname, givennames"`` strings for every ``dc:creator``.

    Every word keeps its first character untouched and has the rest
    lower-cased.  The last word is the last name; the preceding words are
    joined with single spaces and passed through ``collapse_initials``.
    """
    formatted = []
    for creator in self.document.getElementsByTagName('dc:creator'):
        words = xml_to_text(creator).split()
        surname = words[-1]
        surname = surname[0] + surname[1:].lower()
        # Words produced by split() carry no surrounding whitespace, so the
        # joined string needs no further stripping.
        given = ' '.join(word[0] + word[1:].lower() for word in words[:-1])
        formatted.append("%s, %s" % (surname, collapse_initials(given)))
    return formatted
def _get_authors(self):
    """Return ``(name, affiliations)`` tuples from ``pex-dc:creator`` nodes.

    For each ``pex-dc:name`` inside a creator the last word becomes the
    last name and the rest the given names (run through
    ``collapse_initials``); the combined ``"last, given"`` string is passed
    to ``safe_title``.  The texts of the creator's ``pex-dc:affiliation``
    children are appended to a list shared by all names of that creator.
    """
    collected = []
    for creator in self.document.getElementsByTagName('pex-dc:creator'):
        # One list per creator, extended in place for every name found.
        creator_affs = []
        for name_node in creator.getElementsByTagName('pex-dc:name'):
            words = xml_to_text(name_node).split()
            surname = words[-1]
            given = collapse_initials(" ".join(words[:-1]))
            display = safe_title("%s, %s" % (surname, given))
            creator_affs.extend(
                xml_to_text(aff_node)
                for aff_node in creator.getElementsByTagName(
                    'pex-dc:affiliation'))
            collected.append((display, creator_affs))
    return collected
def _get_authors(self):
    """Return ``(name, affiliations)`` tuples from ``pex-dc:creator`` nodes.

    Each ``pex-dc:name`` of a creator yields one entry: the last word is
    the last name, the remaining words are the given names (passed through
    ``collapse_initials``), and the ``"last, given"`` string goes through
    ``safe_title``.  Truthy ``pex-dc:affiliation`` children of the creator
    contribute their text to a list shared by all names of that creator.
    """
    collected = []
    for creator in self.document.getElementsByTagName('pex-dc:creator'):
        # One list per creator, extended in place for every name found.
        creator_affs = []
        for name_node in creator.getElementsByTagName('pex-dc:name'):
            words = xml_to_text(name_node).split()
            surname = words[-1]
            given = collapse_initials(" ".join(words[:-1]))
            display = safe_title("%s, %s" % (surname, given))
            aff_nodes = creator.getElementsByTagName('pex-dc:affiliation')
            creator_affs.extend(
                xml_to_text(aff_node) for aff_node in aff_nodes if aff_node)
            collected.append((display, creator_affs))
    return collected
def _get_authors(self):
    """Return ``"lastname, givennames"`` strings for every ``dc:creator``.

    The last whitespace-separated word of the creator text is the last
    name; each hyphen-separated part of it keeps its first character and
    has the remainder lower-cased.  The preceding words get the same
    treatment, are joined with single spaces and passed through
    ``collapse_initials``.
    """
    def _lower_tail(fragment):
        # Slice instead of indexing so that an empty fragment (for example
        # the one after a trailing or doubled hyphen) no longer raises
        # IndexError and simply stays empty.
        return fragment[:1] + fragment[1:].lower()

    authors = []
    for tag in self.document.getElementsByTagName('dc:creator'):
        words = xml_to_text(tag).split()
        # A surname without a hyphen is a single part, so one code path
        # covers both the plain and the hyphenated case.
        lastname = '-'.join(
            _lower_tail(part) for part in words[-1].split('-'))
        givennames = ' '.join(_lower_tail(word) for word in words[:-1])
        givennames = collapse_initials(givennames)
        authors.append("%s, %s" % (lastname, givennames))
    return authors
def _get_authors(self):
    """Return ``(name, affiliations, emails)`` for each author contributor.

    The given names are first collapsed with ``collapse_initials``, then
    split into words and handed to ``fix_name_capitalization`` together
    with the surname.  ``affiliations`` and ``emails`` hold the text of the
    contributor's ``<aff>`` and ``<email>`` descendants.
    """
    def texts_of(node, tag_name):
        return [xml_to_text(child)
                for child in node.getElementsByTagName(tag_name)]

    collected = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        surname = get_value_in_tag(contrib, 'surname')
        collapsed = collapse_initials(
            get_value_in_tag(contrib, 'given-names'))
        surname, given_names = fix_name_capitalization(
            surname, collapsed.split())
        full_name = '%s, %s' % (surname, given_names)
        collected.append(
            (full_name, texts_of(contrib, 'aff'), texts_of(contrib, 'email')))
    return collected
def _get_authors(self):
    """Return ``(name, affiliations, emails, collaborations)`` per author.

    ``name`` is ``"surname, given-names"`` (given names collapsed with
    ``collapse_initials``) passed through ``safe_title``.  The three lists
    hold the text of the contributor's ``<aff>``, ``<email>`` and
    ``<collab>`` descendants respectively.
    """
    def texts_of(node, tag_name):
        return [xml_to_text(child)
                for child in node.getElementsByTagName(tag_name)]

    collected = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        surname = get_value_in_tag(contrib, 'surname')
        given_names = collapse_initials(
            get_value_in_tag(contrib, 'given-names'))
        full_name = safe_title('%s, %s' % (surname, given_names))
        collected.append((
            full_name,
            texts_of(contrib, 'aff'),
            texts_of(contrib, 'email'),
            texts_of(contrib, "collab"),
        ))
    return collected
def _get_authors(self):
    """Return ``(name, affiliation_ids, corresp_ids)`` for each author.

    ``name`` is ``"surname, given-names"`` with the given names run through
    ``collapse_initials``.  Ids are read from the ``rid`` attribute of the
    contributor's ``<xref>`` children and sorted into the two lists.
    """
    result = []
    for contrib in self.document.getElementsByTagName('contrib'):
        if contrib.getAttribute('contrib-type') != 'author':
            continue
        surname = get_value_in_tag(contrib, 'surname')
        given_names = collapse_initials(
            get_value_in_tag(contrib, 'given-names'))
        aff_ids, corresp_ids = [], []
        for xref in contrib.getElementsByTagName('xref'):
            ref_kind = xref.getAttribute('ref-type')
            if ref_kind == 'aff':
                # Within an "aff" xref, ids starting with "a" are
                # affiliations and ids starting with "n" are notes; any
                # other id is dropped.
                for rid in xref.getAttribute('rid').split():
                    lowered = rid.lower()
                    if lowered.startswith('a'):
                        aff_ids.append(rid)
                    elif lowered.startswith('n'):
                        corresp_ids.append(rid)
            elif ref_kind in ('corresp', 'author-notes'):
                corresp_ids.extend(xref.getAttribute('rid').split())
        result.append(('%s, %s' % (surname, given_names),
                       aff_ids, corresp_ids))
    return result
def test_collapse_initials(self):
    """Check collapse_initials on names with two and three initials."""
    # (input, expected) pairs, asserted in order; the first failure aborts.
    samples = (
        ("T. A. Adams", "T.A. Adams"),
        ("T. A. Adams", "T.A. Adams"),
        ("T. A. V. Adams", "T.A.V. Adams"),
    )
    for given, wanted in samples:
        self.assertEqual(collapse_initials(given), wanted)
def _get_record(self, link):
    """Download one chapter page and turn it into a MARCXML record node.

    ``link`` is a parsed HTML element containing an ``<a href>``; the href
    is resolved against ``self.base_url`` and fetched.  Metadata is scraped
    from the page's ``div#content`` (stored on ``self.content``), written
    into a record with ``record_add_field``, serialized with
    ``record_xml_output`` and re-parsed into a DOM.  Reference fields
    extracted by refextract are then appended to that DOM.

    Returns the first child of the parsed record document.

    NOTE(review): the return values of ``self._find`` are passed straight
    to ``re.sub`` in several places, so it presumably always returns a
    string (possibly empty) -- confirm against its definition.
    """
    link = link.find('a')['href']
    url = urlparse.urljoin(self.base_url, link)
    page = urllib2.urlopen(url)
    page = BeautifulSoup(page)
    self.content = page.body.find('div', attrs={'id': 'content'})

    # --- scrape the individual metadata values -------------------------
    publication_title = self.content.find('div',
                                          {'id': 'publication-title'})
    if publication_title:
        publication_title = publication_title.find('a').text
    else:
        publication_title = ''
    series_title = self._find('a', {'id': 'series-title'})
    if series_title == 'NATO Science Series':
        series_title = 'NATO Sci.Ser.'
    title = self._find('h1', {'id': 'title'})
    if not title:
        # Fall back to the chapter-title heading.
        title = self._find('h1', {'class': 'ChapterTitle'})
    # Volume: keep digits only, from book-volume or else volume-range.
    volume = self._find('span', {'id': 'book-volume'})
    if volume:
        volume = re.sub(r'\D', '', volume)
    else:
        volume = self._find('span', {'id': 'volume-range'})
        volume = re.sub(r'\D', '', volume)
    issue = self._find('a', {'id': 'issue-range'})
    if issue:
        # Second whitespace-separated token of the issue-range text.
        issue = issue.split()[1]
    # Year: copyright-year, then the book-chapter copyright year, then the
    # first four digits of the cover date.
    year = self._find('span', {'id': 'copyright-year'})
    if not year:
        year = self._find(
            'dd', {'id': 'abstract-about-book-chapter-copyright-year'})
    year = re.sub(r'\D', '', year)
    if not year:
        year = self._find('dd', {'id': 'abstract-about-cover-date'})
        year = re.sub(r'\D', '', year)[:4]
    abstract = self._find('div', {'class': 'abstract-content formatted'})
    page_range = self._find('span', {'id': 'page-range'})
    if not page_range:
        page_range = self._find(
            'dd', {'id': 'abstract-about-book-chapter-page-ranges'})
    if page_range:
        page_range = page_range.replace('pp', '').strip()
    #publisher = self._find('dd', {'id': 'abstract-about-publisher'})
    copyright_holder = self._find(
        'dd', {'id': 'abstract-about-book-copyright-holder'})
    #issn = self._find('dd', {'id': 'abstract-about-book-series-print-issn'})
    doi = self._find('dd', {'class': 'doi'})
    #subtitle = self._find('dd', {'id': 'abstract-about-book-series-subtitle'})
    #online_isbn = self._find('dd', {'id': 'abstract-about-book-online-isbn'})
    #print_isbn = self._find('dd', {'id': 'abstract-about-book-print-isbn'})

    # Editors are collected but never read again in this function.
    editors = []
    editors_affiliations = []
    for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
        editors.append(editor.find('a').text)
        try:
            editors_affiliations.append(editor.find('sup')['title'])
        except KeyError:
            # <sup> present but without a title attribute.
            editors_affiliations.append('')
        except TypeError:
            # No <sup> at all: find() returned None.
            editors_affiliations.append('')

    # Authors: "First Middle Last" is rearranged into "Last, First Middle"
    # before the initials are collapsed.  The two lists stay index-aligned.
    authors = []
    authors_affiliations = []
    summary = self.content.find('div', attrs={'class': 'summary'})
    for author in summary.findAll('li', attrs={'itemprop': 'author'}):
        author_name = author.find('a').text
        author_names = []
        author_names.append(author_name.split()[-1] + ",")
        author_names += author_name.split()[:-1]
        author_name = " ".join(author_names)
        author_name = collapse_initials(author_name)
        authors.append(author_name)
        try:
            authors_affiliations.append(author.find('sup')['title'])
        except KeyError:
            authors_affiliations.append('')
        except TypeError:
            authors_affiliations.append('')
    try:
        attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
        fulltext = self.content.find('a', attrs=attrs)
        fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
    except TypeError:
        # No download link on the page: find() returned None.
        fulltext = ''

    # --- create the MARC record -----------------------------------------
    rec = create_record()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    # The first author goes to field 100, every further author to 700.
    first_author = True
    for i in range(len(authors)):
        subfields = [('a', '%s' % (authors[i]))]
        if authors_affiliations[i]:
            subfields.append(('v', authors_affiliations[i]))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'Springer')])
    if copyright_holder:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_holder), ('g', year)])
    if not series_title:
        series_title = publication_title
    # Field 773 is always added, with whichever subfields are available.
    subfields = []
    if series_title:
        subfields.append(('p', series_title))
    if volume:
        subfields.append(('v', volume))
    if issue:
        subfields.append(('n', issue))
    if page_range:
        subfields.append(('c', page_range))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'BookChapter')])
    if fulltext:
        record_add_field(rec, 'FFT', subfields=[('a', fulltext),
                                                ('t', 'Springer'),
                                                ('d', 'Fulltext')])

    # --- serialize and re-parse so references can be added as DOM nodes --
    recordString = record_xml_output(rec)
    # removes whitespace except spaces
    recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
    # removes every run of two or more consecutive spaces
    recordString = re.sub(r' {2,}', '', recordString)
    record = parseString(recordString)

    references = []
    ref_fields = []
    references_container = self.content.find(
        'div', attrs={'id': 'abstract-references'})
    if references_container:
        references = references_container.findAll('li')
    for reference in references:
        try:
            # Imported lazily so a missing refextract is handled per
            # reference by the ImportError branch below.
            from invenio.refextract_api import (
                extract_references_from_string_xml)
            ref = xml_to_text(parseString(reference.decode()))
            # removes the space between hep-th/ and the identifier
            ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
            ref = extract_references_from_string_xml(ref)
            ref = parseString(ref)
            for field in ref.childNodes:
                for subfield in field.getElementsByTagName('subfield'):
                    if subfield.getAttribute('code') == 'm':
                        # Clean the free-text subfield; drop it entirely if
                        # nothing is left after cleaning.
                        text = subfield.firstChild.data
                        text = re.sub(r'\[?arXiv:', '', text)
                        text = text.replace('CrossRef', '')
                        if text.startswith(': '):
                            text = text[2:]
                        if text:
                            subfield.firstChild.data = text
                        else:
                            parentNode = subfield.parentNode
                            parentNode.removeChild(subfield)
                ref_fields.append(field.firstChild)
        except ImportError:
            # NOTE(review): `rec` was already serialized into `record`
            # above and is not serialized again, so this field does not
            # appear to reach the returned node -- confirm intent.
            record_add_field(rec, '999', ind1='C', ind2='5',
                             subfields=[('m', reference.decode())])
    for field in ref_fields:
        record.firstChild.appendChild(field)
    return record.firstChild
def _get_record(self, link):
    """Download one chapter page and turn it into a MARCXML record node.

    ``link`` is a parsed HTML element containing an ``<a href>``; the href
    is resolved against ``self.base_url`` and fetched.  Metadata is scraped
    from the page's ``div#content`` (stored on ``self.content``), written
    into a record with ``record_add_field``, serialized with
    ``record_xml_output`` and re-parsed into a DOM.  Reference fields
    produced by ``extract_references_from_string_xml`` are then appended
    to that DOM.

    Returns the first child of the parsed record document.

    NOTE(review): the return values of ``self._find`` are passed straight
    to ``re.sub`` in several places, so it presumably always returns a
    string (possibly empty) -- confirm against its definition.
    """
    link = link.find('a')['href']
    url = urlparse.urljoin(self.base_url, link)
    page = urllib2.urlopen(url)
    page = BeautifulSoup(page)
    self.content = page.body.find('div', attrs={'id': 'content'})

    # --- scrape the individual metadata values -------------------------
    publication_title = self.content.find('div',
                                          {'id': 'publication-title'})
    if publication_title:
        publication_title = publication_title.find('a').text
    else:
        publication_title = ''
    series_title = self._find('a', {'id': 'series-title'})
    if series_title == 'NATO Science Series':
        series_title = 'NATO Sci.Ser.'
    title = self._find('h1', {'id': 'title'})
    # Volume: keep digits only, from book-volume or else volume-range.
    volume = self._find('span', {'id': 'book-volume'})
    if volume:
        volume = re.sub(r'\D', '', volume)
    else:
        volume = self._find('span', {'id': 'volume-range'})
        volume = re.sub(r'\D', '', volume)
    issue = self._find('a', {'id': 'issue-range'})
    if issue:
        # Second whitespace-separated token of the issue-range text.
        issue = issue.split()[1]
    # Year: copyright-year, else the first four digits of the cover date.
    year = self._find('span', {'id': 'copyright-year'})
    year = re.sub(r'\D', '', year)
    if not year:
        year = self._find('dd', {'id': 'abstract-about-cover-date'})
        year = re.sub(r'\D', '', year)[:4]
    abstract = self._find('div', {'class': 'abstract-content formatted'})
    page_range = self._find('span', {'id': 'page-range'})
    if page_range:
        page_range = page_range.replace('pp', '').strip()
    # NOTE(review): publisher, issn, subtitle, online_isbn, print_isbn and
    # the editor lists below are scraped but never read again in this
    # function.
    publisher = self._find('dd', {'id': 'abstract-about-publisher'})
    copyright_holder = self._find(
        'dd', {'id': 'abstract-about-book-copyright-holder'})
    issn = self._find('dd', {'id': 'abstract-about-book-series-print-issn'})
    doi = self._find('dd', {'class': 'doi'})
    subtitle = self._find('dd',
                          {'id': 'abstract-about-book-series-subtitle'})
    online_isbn = self._find('dd',
                             {'id': 'abstract-about-book-online-isbn'})
    print_isbn = self._find('dd', {'id': 'abstract-about-book-print-isbn'})

    editors = []
    editors_affiliations = []
    for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
        editors.append(editor.find('a').text)
        try:
            editors_affiliations.append(editor.find('sup')['title'])
        except KeyError:
            # <sup> present but without a title attribute.
            editors_affiliations.append('')
        except TypeError:
            # No <sup> at all: find() returned None.
            editors_affiliations.append('')

    # Authors: "First Middle Last" is rearranged into "Last, First Middle"
    # before the initials are collapsed.  The two lists stay index-aligned.
    authors = []
    authors_affiliations = []
    summary = self.content.find('div', attrs={'class': 'summary'})
    for author in summary.findAll('li', attrs={'itemprop': 'author'}):
        author_name = author.find('a').text
        author_names = []
        author_names.append(author_name.split()[-1] + ",")
        author_names += author_name.split()[:-1]
        author_name = " ".join(author_names)
        author_name = collapse_initials(author_name)
        authors.append(author_name)
        try:
            authors_affiliations.append(author.find('sup')['title'])
        except KeyError:
            authors_affiliations.append('')
        except TypeError:
            authors_affiliations.append('')
    try:
        attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
        fulltext = self.content.find('a', attrs=attrs)
        fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
    except TypeError:
        # No download link on the page: find() returned None.
        fulltext = ''

    # --- create the MARC record -----------------------------------------
    rec = {}
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    if doi:
        record_add_field(rec, '024', ind1='7',
                         subfields=[('a', doi), ('2', 'DOI')])
    # The first author goes to field 100, every further author to 700.
    first_author = True
    for i in range(len(authors)):
        subfields = [('a', '%s' % (authors[i]))]
        if authors_affiliations[i]:
            subfields.append(('v', authors_affiliations[i]))
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    if abstract:
        record_add_field(rec, '520',
                         subfields=[('a', abstract), ('9', 'Springer')])
    if copyright_holder:
        record_add_field(rec, '542',
                         subfields=[('f', copyright_holder), ('g', year)])
    if not series_title:
        series_title = publication_title
    # Field 773 is always added, with whichever subfields are available.
    subfields = []
    if series_title:
        subfields.append(('p', series_title))
    if volume:
        subfields.append(('v', volume))
    if issue:
        subfields.append(('n', issue))
    if page_range:
        subfields.append(('c', page_range))
    if year:
        subfields.append(('y', year))
    record_add_field(rec, '773', subfields=subfields)
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'BookChapter')])
    if fulltext:
        record_add_field(rec, 'FFT', subfields=[('a', fulltext),
                                                ('t', 'Springer'),
                                                ('d', 'Fulltext')])

    # --- serialize and re-parse so references can be added as DOM nodes --
    recordString = record_xml_output(rec)
    # removes whitespace characters except spaces
    recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
    # removes every run of two or more consecutive spaces
    recordString = re.sub(r' {2,}', '', recordString)
    record = parseString(recordString)

    references = []
    ref_fields = []
    references_container = self.content.find(
        'div', attrs={'id': 'abstract-references'})
    if references_container:
        references = references_container.findAll('li')
    for reference in references:
        ref = xml_to_text(parseString(reference.decode()))
        # removes the space between hep-th/ and the identifier
        ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
        ref = extract_references_from_string_xml(ref)
        ref = parseString(ref)
        for field in ref.childNodes:
            for subfield in field.getElementsByTagName('subfield'):
                if subfield.getAttribute('code') == 'm':
                    # Clean the free-text subfield; drop it entirely if
                    # nothing is left after cleaning.
                    text = subfield.firstChild.data
                    text = re.sub(r'\[?arXiv:', '', text)
                    text = text.replace('CrossRef', '')
                    if text.startswith(': '):
                        text = text[2:]
                    if text:
                        subfield.firstChild.data = text
                    else:
                        parentNode = subfield.parentNode
                        parentNode.removeChild(subfield)
            ref_fields.append(field.firstChild)
    for field in ref_fields:
        record.firstChild.appendChild(field)
    return record.firstChild