Python collapse_initials Examples, harvestingkit.utils.collapse_initials Python Examples

Example #1

0

Show file

 def test_collapse_initials(self):
     """Check that separated initials are joined into one compact token."""
     # (input name, expected collapsed form), checked in this order.
     expectations = (
         ("T. A. Adams", "T.A. Adams"),
         ("T.-A. Adams", "T.A. Adams"),
         ("T.   A. Adams", "T.A. Adams"),
         ("T. A.", "T.A."),
         ("T. A. V. Adams", "T.A.V. Adams"),
     )
     for raw_name, collapsed in expectations:
         self.assertEqual(collapse_initials(raw_name), collapsed)

Example #2

0

Show file

File: aps_package.py Project: ksachs/harvesting-kit

 def _get_authors(self):
     """Return a list of ``(name, affiliation)`` tuples for the authors.

     Only ``contrib`` elements whose ``contrib-type`` is ``author`` are
     considered. ``name`` is formatted as ``"surname, given-names"`` with
     the given names passed through ``collapse_initials``. ``affiliation``
     is the text of the ``aff`` element referenced by the author, or an
     empty string when no matching ``aff`` id is known.
     """
     # Index the text of every <aff> element by its id; the first
     # whitespace-separated token of the text is dropped.
     aff_by_id = {}
     for aff_node in self.document.getElementsByTagName('aff'):
         words = xml_to_text(aff_node).split()
         aff_by_id[aff_node.getAttribute('id')] = ' '.join(words[1:])

     result = []
     for contrib in self.document.getElementsByTagName('contrib'):
         if contrib.getAttribute('contrib-type') != 'author':
             continue
         rid = ''
         for xref in contrib.getElementsByTagName('xref'):
             if xref.getAttribute('ref-type') == 'aff':
                 rid = xref.getAttribute('rid')
             # A rid may list several ids; only the first one is kept.
             rid_parts = rid.split()
             if len(rid_parts) > 1:
                 rid = rid_parts[0]
         given_names = collapse_initials(
             get_value_in_tag(contrib, 'given-names'))
         surname = get_value_in_tag(contrib, 'surname')
         full_name = "%s, %s" % (surname, given_names)
         result.append((full_name, aff_by_id.get(rid, '')))
     return result

Example #3

0

Show file

 def _get_references(self):
     """Yield one tuple per ``ref`` element found in the document.

     Each yielded tuple is ``(label, ref_type, text_ref, ext_link,
     authors, year, source, volume, page)`` where:

     * ``label`` is the ``id`` attribute of the ``ref`` with every
       non-digit character removed;
     * ``ref_type`` is the ``publication-type`` of the last
       ``mixed-citation`` child, or ``''`` when there is none;
     * ``text_ref`` / ``ext_link`` are filled only for some publication
       types (see the branches below) and are ``''`` otherwise;
     * ``authors`` is a list of ``"surname, given-names"`` strings built
       from the ``string-name`` elements.
     """
     for ref in self.document.getElementsByTagName('ref'):
         label = sub(r'\D', '', ref.getAttribute('id'))
         # Reset for every reference. Without this, a <ref> that has no
         # <mixed-citation> child would either raise UnboundLocalError
         # (first reference) or silently reuse the publication type of
         # the previous reference.
         ref_type = ''
         text_ref = ''
         ext_link = ''
         for mixed in ref.getElementsByTagName('mixed-citation'):
             ref_type = mixed.getAttribute('publication-type')
             if ref_type in ('thesis', 'conf-proc'):
                 text_ref = get_value_in_tag(ref, 'mixed-citation')
             elif ref_type in ('other', 'web'):
                 text_ref = get_value_in_tag(ref, 'mixed-citation')
                 ext_link = get_value_in_tag(mixed, 'ext-link')
             elif ref_type == 'book':
                 text_ref = xml_to_text(mixed)
         authors = []
         for auth in ref.getElementsByTagName('string-name'):
             surname = get_value_in_tag(auth, 'surname')
             given_names = get_value_in_tag(auth, 'given-names')
             given_names = collapse_initials(given_names)
             authors.append('%s, %s' % (surname, given_names))
         year = get_value_in_tag(ref, 'year')
         source = get_value_in_tag(ref, 'source')
         volume = get_value_in_tag(ref, 'volume')
         page = get_value_in_tag(ref, 'fpage')
         if ref_type == 'journal':
             # fix_journal_name may hand back a volume prefix that has to
             # be put in front of the volume number.
             source, vol = fix_journal_name(source, self.journal_mappings)
             if vol:
                 volume = vol + volume
         yield (label, ref_type, text_ref, ext_link, authors, year, source,
                volume, page)

Example #4

0

Show file

File: jats_package.py Project: Dziolas/harvesting-kit

 def _get_authors(self):
     """Return ``(name, affiliations, corresp)`` tuples for the authors.

     ``name`` is ``"surname, given-names"`` with the given names passed
     through ``collapse_initials``. ``affiliations`` and ``corresp`` are
     lists of ``rid`` values collected from the author's ``xref`` elements.
     """
     collected = []
     for contrib in self.document.getElementsByTagName('contrib'):
         # Springer puts collaborations in an additional "contrib" tag, so
         # to avoid a fake author carrying all affiliations we skip any
         # "contrib" that contains "contrib" or "collab" subtags.
         if (contrib.getElementsByTagName('contrib')
                 or contrib.getElementsByTagName('collab')):
             continue
         if contrib.getAttribute('contrib-type') != 'author':
             continue
         surname = get_value_in_tag(contrib, 'surname')
         given_names = collapse_initials(
             get_value_in_tag(contrib, 'given-names'))
         full_name = '%s, %s' % (surname, given_names)
         aff_ids = []
         note_ids = []
         for xref in contrib.getElementsByTagName('xref'):
             kind = xref.getAttribute('ref-type')
             if kind == 'aff':
                 # An "aff" xref can carry both affiliation ids (a...)
                 # and note ids (n...); anything else is ignored.
                 for rid in xref.getAttribute('rid').split():
                     lowered = rid.lower()
                     if lowered.startswith('a'):
                         aff_ids.append(rid)
                     elif lowered.startswith('n'):
                         note_ids.append(rid)
             elif kind in ('corresp', 'author-notes'):
                 note_ids.extend(xref.getAttribute('rid').split())
         collected.append((full_name, aff_ids, note_ids))
     return collected

Example #5

0

Show file

 def _get_authors(self):
     """Build the author list as ``(name, affiliations, corresp)`` tuples.

     Only ``contrib`` elements of type ``author`` that do not themselves
     contain a nested ``contrib`` are used. The two lists hold the ``rid``
     values read from the author's ``xref`` elements.
     """
     found = []
     for node in self.document.getElementsByTagName('contrib'):
         # Springer puts collaborations in an additional "contrib" tag, so
         # to avoid a fake author carrying all affiliations we skip any
         # "contrib" tag that has "contrib" subtags.
         nested = node.getElementsByTagName('contrib')
         if nested or node.getAttribute('contrib-type') != 'author':
             continue
         surname = get_value_in_tag(node, 'surname')
         given_names = get_value_in_tag(node, 'given-names')
         display_name = '%s, %s' % (surname, collapse_initials(given_names))
         affiliation_refs, corresp_refs = [], []
         for xref in node.getElementsByTagName('xref'):
             ref_kind = xref.getAttribute('ref-type')
             if ref_kind == 'aff':
                 for rid in xref.getAttribute('rid').split():
                     first_letter_source = rid.lower()
                     if first_letter_source.startswith('a'):
                         affiliation_refs.append(rid)
                     elif first_letter_source.startswith('n'):
                         corresp_refs.append(rid)
             elif ref_kind == 'corresp' or ref_kind == 'author-notes':
                 corresp_refs += xref.getAttribute('rid').split()
         found.append((display_name, affiliation_refs, corresp_refs))
     return found

Example #6

0

Show file

File: edpsciences_package.py Project: GiorgosPa/harvesting-kit

 def _get_references(self):
     """Yield a tuple describing each ``ref`` element of the document.

     The tuple layout is ``(label, ref_type, text_ref, ext_link, authors,
     year, source, volume, page)``. ``label`` keeps only the digits of the
     ``id`` attribute; ``ref_type`` is the ``publication-type`` of the last
     ``mixed-citation`` child (``""`` when the reference has none);
     ``authors`` is a list of ``"surname, given-names"`` strings.
     """
     for ref in self.document.getElementsByTagName("ref"):
         label = sub(r"\D", "", ref.getAttribute("id"))
         # ref_type must start fresh for each reference: it is read after
         # the loop below, and a <ref> without any <mixed-citation> would
         # otherwise raise UnboundLocalError or inherit the type of the
         # previously processed reference.
         ref_type = ""
         text_ref = ""
         ext_link = ""
         for mixed in ref.getElementsByTagName("mixed-citation"):
             ref_type = mixed.getAttribute("publication-type")
             if ref_type in ("thesis", "conf-proc"):
                 text_ref = get_value_in_tag(ref, "mixed-citation")
             elif ref_type in ("other", "web"):
                 text_ref = get_value_in_tag(ref, "mixed-citation")
                 ext_link = get_value_in_tag(mixed, "ext-link")
             elif ref_type == "book":
                 text_ref = xml_to_text(mixed)
         authors = []
         for auth in ref.getElementsByTagName("string-name"):
             surname = get_value_in_tag(auth, "surname")
             given_names = get_value_in_tag(auth, "given-names")
             given_names = collapse_initials(given_names)
             authors.append("%s, %s" % (surname, given_names))
         year = get_value_in_tag(ref, "year")
         source = get_value_in_tag(ref, "source")
         volume = get_value_in_tag(ref, "volume")
         page = get_value_in_tag(ref, "fpage")
         if ref_type == "journal":
             # A volume prefix returned by fix_journal_name is prepended
             # to the volume number.
             source, vol = fix_journal_name(source, self.journal_mappings)
             if vol:
                 volume = vol + volume
         yield label, ref_type, text_ref, ext_link, authors, year, source, volume, page

Example #7

0

Show file

File: edpsciences_package.py Project: kaplun/harvesting-kit

 def _get_references(self):
     """Generate the parsed references of the document, one per ``ref``.

     Yields ``(label, ref_type, text_ref, ext_link, authors, year, source,
     volume, page)``. ``label`` is the ``id`` attribute stripped of all
     non-digits, ``ref_type`` is taken from the ``publication-type``
     attribute of the last ``mixed-citation`` child (empty string if the
     reference has no such child) and ``authors`` is a list of
     ``"surname, given-names"`` strings.
     """
     for ref in self.document.getElementsByTagName('ref'):
         label = sub(r'\D', '', ref.getAttribute('id'))
         # Initialise ref_type together with the other per-reference
         # values. It used to be bound only inside the loop below, so a
         # reference without <mixed-citation> raised UnboundLocalError or
         # reported the type of the preceding reference.
         ref_type = ''
         text_ref = ''
         ext_link = ''
         for mixed in ref.getElementsByTagName('mixed-citation'):
             ref_type = mixed.getAttribute('publication-type')
             if ref_type in ('thesis', 'conf-proc'):
                 text_ref = get_value_in_tag(ref, 'mixed-citation')
             elif ref_type in ('other', 'web'):
                 text_ref = get_value_in_tag(ref, 'mixed-citation')
                 ext_link = get_value_in_tag(mixed, 'ext-link')
             elif ref_type == 'book':
                 text_ref = xml_to_text(mixed)
         authors = []
         for auth in ref.getElementsByTagName('string-name'):
             surname = get_value_in_tag(auth, 'surname')
             given_names = get_value_in_tag(auth, 'given-names')
             given_names = collapse_initials(given_names)
             authors.append('%s, %s' % (surname, given_names))
         year = get_value_in_tag(ref, 'year')
         source = get_value_in_tag(ref, 'source')
         volume = get_value_in_tag(ref, 'volume')
         page = get_value_in_tag(ref, 'fpage')
         if ref_type == 'journal':
             # fix_journal_name may return a volume prefix to prepend.
             source, vol = fix_journal_name(source, self.journal_mappings)
             if vol:
                 volume = vol + volume
         yield (label, ref_type, text_ref, ext_link,
                authors, year, source, volume, page)

Example #8

0

Show file

File: pos_package.py Project: GiorgosPa/harvesting-kit

 def _get_authors(self):
     """Return ``"lastname, givennames"`` strings for each ``dc:creator``.

     The last whitespace-separated token of the creator text is taken as
     the last name; the remaining tokens are the given names. Both go
     through ``fix_name_capitalization`` and the given names are then
     passed to ``collapse_initials``.
     """
     names = []
     for creator in self.document.getElementsByTagName('dc:creator'):
         tokens = xml_to_text(creator).split()
         lastname, givenames = fix_name_capitalization(
             tokens[-1], tokens[:-1])
         names.append("%s, %s" % (lastname, collapse_initials(givenames)))
     return names

Example #9

0

Show file

File: pos_package.py Project: ksachs/harvesting-kit

 def _get_authors(self):
     """Return ``"lastname, givennames"`` strings for each ``dc:creator``.

     Every name token keeps its first character and has the rest
     lower-cased. The last token is the last name; the others are joined
     with single spaces and passed through ``collapse_initials``.
     """
     formatted = []
     for creator in self.document.getElementsByTagName('dc:creator'):
         tokens = xml_to_text(creator).split()
         family = tokens[-1]
         lastname = family[0] + family[1:].lower()
         given_parts = [tok[0] + tok[1:].lower() for tok in tokens[:-1]]
         givennames = collapse_initials(' '.join(given_parts))
         formatted.append("%s, %s" % (lastname, givennames))
     return formatted

Example #10

0

Show file

File: pos_package.py Project: Dziolas/harvesting-kit

 def _get_authors(self):
     """Return ``(name, affiliations)`` tuples from ``pex-dc:creator`` tags.

     ``name`` is ``"lastname, givennames"`` (last token of the
     ``pex-dc:name`` text versus the rest, given names passed through
     ``collapse_initials``) and then run through ``safe_title``.
     """
     collected = []
     for creator in self.document.getElementsByTagName('pex-dc:creator'):
         # One list per creator: every name found inside this creator
         # shares (and appends to) the same list object.
         affiliations = []
         for name_node in creator.getElementsByTagName('pex-dc:name'):
             tokens = xml_to_text(name_node).split()
             lastname = tokens[-1]
             givenames = collapse_initials(" ".join(tokens[:-1]))
             name = safe_title("%s, %s" % (lastname, givenames))
             affiliations.extend(
                 xml_to_text(aff_node)
                 for aff_node in creator.getElementsByTagName(
                     'pex-dc:affiliation'))
             collected.append((name, affiliations))
     return collected

Example #11

0

Show file

 def _get_authors(self):
     """Collect ``(name, affiliations)`` for each ``pex-dc:name`` element.

     Names are rendered as ``"lastname, givennames"``, with the given names
     passed through ``collapse_initials`` and the whole string through
     ``safe_title``. Affiliation texts come from the truthy
     ``pex-dc:affiliation`` elements of the enclosing ``pex-dc:creator``.
     """
     result = []
     for creator in self.document.getElementsByTagName('pex-dc:creator'):
         # Shared by all names of this creator; it grows on every name.
         affiliations = []
         for name_tag in creator.getElementsByTagName('pex-dc:name'):
             words = xml_to_text(name_tag).split()
             lastname = words[-1]
             givenames = collapse_initials(" ".join(words[:-1]))
             name = safe_title("%s, %s" % (lastname, givenames))
             aff_tags = creator.getElementsByTagName('pex-dc:affiliation')
             affiliations += [
                 xml_to_text(aff_tag) for aff_tag in aff_tags if aff_tag
             ]
             result.append((name, affiliations))
     return result

Example #12

0

Show file

File: pos_package.py Project: Dziolas/scoap3_old

 def _get_authors(self):
     """Return ``"lastname, givennames"`` strings for each ``dc:creator``.

     The last whitespace-separated token of the creator text is the last
     name and the preceding tokens are the given names. Each given name,
     and each hyphen-separated part of the last name, keeps its first
     character and has the remainder lower-cased. The given names are
     joined with single spaces and passed through ``collapse_initials``.
     """
     authors = []
     for tag in self.document.getElementsByTagName('dc:creator'):
         tokens = xml_to_text(tag).split()
         # Normalise every hyphen-separated part on its own. Slicing with
         # [:1] instead of indexing [0] keeps an empty part (leading,
         # trailing or doubled hyphen, e.g. "SMITH-") from raising
         # IndexError. A name without a hyphen is simply a single part.
         lastname = '-'.join(
             part[:1] + part[1:].lower() for part in tokens[-1].split('-'))
         # Tokens produced by split() are never empty, so [0] is safe here.
         given_parts = [name[0] + name[1:].lower() for name in tokens[:-1]]
         givennames = collapse_initials(' '.join(given_parts))
         authors.append("%s, %s" % (lastname, givennames))
     return authors

Example #13

0

Show file

File: world_scientific_package.py Project: kaplun/harvesting-kit

 def _get_authors(self):
     """Return ``(name, affiliations, emails)`` for each author ``contrib``.

     The given names go through ``collapse_initials`` and are then split
     into a list before being handed, with the surname, to
     ``fix_name_capitalization``. ``affiliations`` and ``emails`` are the
     texts of the ``aff`` and ``email`` elements under the contributor.
     """
     collected = []
     for contrib in self.document.getElementsByTagName('contrib'):
         if contrib.getAttribute('contrib-type') != 'author':
             continue
         surname = get_value_in_tag(contrib, 'surname')
         given_names = collapse_initials(
             get_value_in_tag(contrib, 'given-names'))
         surname, given_names = fix_name_capitalization(
             surname, given_names.split())
         name = '%s, %s' % (surname, given_names)
         affiliations = [
             xml_to_text(aff)
             for aff in contrib.getElementsByTagName('aff')
         ]
         emails = [
             xml_to_text(email)
             for email in contrib.getElementsByTagName('email')
         ]
         collected.append((name, affiliations, emails))
     return collected

Example #14

0

Show file

 def _get_authors(self):
     """Return one tuple per author ``contrib`` element.

     Each tuple is ``(name, affiliations, emails, collaborations)``.
     ``name`` is ``"surname, given-names"`` (given names passed through
     ``collapse_initials``) run through ``safe_title``; the three lists
     hold the texts of the ``aff``, ``email`` and ``collab`` elements
     found under the contributor.
     """
     collected = []
     for contrib in self.document.getElementsByTagName('contrib'):
         if contrib.getAttribute('contrib-type') != 'author':
             continue
         surname = get_value_in_tag(contrib, 'surname')
         given_names = collapse_initials(
             get_value_in_tag(contrib, 'given-names'))
         name = safe_title('%s, %s' % (surname, given_names))
         affiliations = [
             xml_to_text(node)
             for node in contrib.getElementsByTagName('aff')
         ]
         emails = [
             xml_to_text(node)
             for node in contrib.getElementsByTagName('email')
         ]
         collaborations = [
             xml_to_text(node)
             for node in contrib.getElementsByTagName("collab")
         ]
         collected.append((name, affiliations, emails, collaborations))
     return collected

Example #15

0

Show file

File: world_scientific_package.py Project: fschwenn/harvesting-kit

 def _get_authors(self):
     """Gather ``(name, affiliations, emails, collaborations)`` per author.

     Only ``contrib`` elements with ``contrib-type`` equal to ``author``
     contribute. The name is built as ``"surname, given-names"`` after
     ``collapse_initials`` and then passed to ``safe_title``.
     """

     def texts_of(parent, tag_name):
         # Text content of every descendant element called tag_name.
         return [xml_to_text(child)
                 for child in parent.getElementsByTagName(tag_name)]

     result = []
     for contrib in self.document.getElementsByTagName('contrib'):
         if contrib.getAttribute('contrib-type') != 'author':
             continue
         surname = get_value_in_tag(contrib, 'surname')
         given_names = get_value_in_tag(contrib, 'given-names')
         given_names = collapse_initials(given_names)
         name = safe_title('%s, %s' % (surname, given_names))
         result.append((
             name,
             texts_of(contrib, 'aff'),
             texts_of(contrib, 'email'),
             texts_of(contrib, "collab"),
         ))
     return result

Example #16

0

Show file

File: jats_package.py Project: Dziolas/scoap3_old

 def _get_authors(self):
     """Return ``(name, affiliations, corresp)`` for every author ``contrib``.

     ``name`` is ``"surname, given-names"`` with the given names passed
     through ``collapse_initials``. ``affiliations`` and ``corresp`` hold
     ``rid`` values read from the contributor's ``xref`` elements.
     """
     out = []
     for contrib in self.document.getElementsByTagName('contrib'):
         if contrib.getAttribute('contrib-type') != 'author':
             continue
         surname = get_value_in_tag(contrib, 'surname')
         given_names = collapse_initials(
             get_value_in_tag(contrib, 'given-names'))
         name = '%s, %s' % (surname, given_names)
         affiliations = []
         corresp = []
         for xref in contrib.getElementsByTagName('xref'):
             ref_type = xref.getAttribute('ref-type')
             if ref_type == 'aff':
                 # Ids starting with "a" are affiliations, ids starting
                 # with "n" are notes; other ids are dropped.
                 for rid in xref.getAttribute('rid').split():
                     rid_lower = rid.lower()
                     if rid_lower.startswith('a'):
                         affiliations.append(rid)
                     elif rid_lower.startswith('n'):
                         corresp.append(rid)
             elif ref_type in ('corresp', 'author-notes'):
                 corresp.extend(xref.getAttribute('rid').split())
         out.append((name, affiliations, corresp))
     return out

Example #17

0

Show file

File: utils_tests.py Project: kaplun/harvesting-kit

 def test_collapse_initials(self):
     """Initials separated by one or more spaces are joined together."""
     cases = [
         ("T. A. Adams", "T.A. Adams"),
         ("T.   A. Adams", "T.A. Adams"),
         ("T. A. V. Adams", "T.A.V. Adams"),
     ]
     for given, wanted in cases:
         self.assertEqual(collapse_initials(given), wanted)

Example #18

0

Show file

    def _get_record(self, link):
        """Download the page behind *link* and turn it into a record node.

        The ``href`` of the first ``a`` found in *link* is resolved against
        ``self.base_url`` and fetched. Bibliographic values are scraped from
        the ``div`` with id ``content`` (stored on ``self.content``), a record
        is built with ``record_add_field``, serialised with
        ``record_xml_output`` and re-parsed; any extracted reference fields
        are appended to that parsed document.

        :param link: object supporting ``link.find('a')['href']``
            (presumably a BeautifulSoup tag -- confirm against the caller).
        :return: ``firstChild`` of the parsed record document.
        """
        link = link.find('a')['href']
        url = urlparse.urljoin(self.base_url, link)
        page = urllib2.urlopen(url)
        page = BeautifulSoup(page)
        self.content = page.body.find('div', attrs={'id': 'content'})

        publication_title = self.content.find('div',
                                              {'id': 'publication-title'})
        if publication_title:
            publication_title = publication_title.find('a').text
        else:
            publication_title = ''
        series_title = self._find('a', {'id': 'series-title'})
        if series_title == 'NATO Science Series':
            series_title = 'NATO Sci.Ser.'
        title = self._find('h1', {'id': 'title'})
        if not title:
            title = self._find('h1', {'class': 'ChapterTitle'})
        # Volume: keep only the digits, falling back to the volume range.
        volume = self._find('span', {'id': 'book-volume'})
        if volume:
            volume = re.sub(r'\D', '', volume)
        else:
            volume = self._find('span', {'id': 'volume-range'})
            volume = re.sub(r'\D', '', volume)
        issue = self._find('a', {'id': 'issue-range'})
        if issue:
            # Only the second whitespace-separated token is kept.
            issue = issue.split()[1]
        # Year: try the copyright year first, then the book-chapter
        # copyright year, and finally the first four digits of the cover date.
        year = self._find('span', {'id': 'copyright-year'})
        if not year:
            year = self._find(
                'dd', {'id': 'abstract-about-book-chapter-copyright-year'})
        year = re.sub(r'\D', '', year)
        if not year:
            year = self._find('dd', {'id': 'abstract-about-cover-date'})
            year = re.sub(r'\D', '', year)[:4]
        abstract = self._find('div', {'class': 'abstract-content formatted'})
        page_range = self._find('span', {'id': 'page-range'})
        if not page_range:
            page_range = self._find(
                'dd', {'id': 'abstract-about-book-chapter-page-ranges'})
        if page_range:
            page_range = page_range.replace('pp', '').strip()
        #publisher = self._find('dd', {'id': 'abstract-about-publisher'})
        copyright_holder = self._find(
            'dd', {'id': 'abstract-about-book-copyright-holder'})
        #issn = self._find('dd', {'id': 'abstract-about-book-series-print-issn'})
        doi = self._find('dd', {'class': 'doi'})
        #subtitle = self._find('dd', {'id': 'abstract-about-book-series-subtitle'})
        #online_isbn = self._find('dd', {'id': 'abstract-about-book-online-isbn'})
        #print_isbn = self._find('dd', {'id': 'abstract-about-book-print-isbn'})
        # NOTE(review): editors and their affiliations are collected here but
        # not used again within this method.
        editors = []
        editors_affiliations = []
        for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
            editors.append(editor.find('a').text)
            try:
                editors_affiliations.append(editor.find('sup')['title'])
            except KeyError:
                editors_affiliations.append('')
            except TypeError:
                editors_affiliations.append('')
        # authors and authors_affiliations are parallel lists: exactly one
        # affiliation entry (possibly '') is appended for every author.
        authors = []
        authors_affiliations = []
        summary = self.content.find('div', attrs={'class': 'summary'})
        for author in summary.findAll('li', attrs={'itemprop': 'author'}):
            author_name = author.find('a').text
            # Reorder "given ... last" into "last, given ..." before the
            # initials are collapsed.
            author_names = []
            author_names.append(author_name.split()[-1] + ",")
            author_names += author_name.split()[:-1]
            author_name = " ".join(author_names)
            author_name = collapse_initials(author_name)
            authors.append(author_name)
            try:
                authors_affiliations.append(author.find('sup')['title'])
            except KeyError:
                authors_affiliations.append('')
            except TypeError:
                authors_affiliations.append('')
        try:
            attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
            fulltext = self.content.find('a', attrs=attrs)
            fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
        except TypeError:
            # Presumably raised when no download link was found
            # (subscripting None) -- confirm.
            fulltext = ''

        #create Marc record
        rec = create_record()
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        # The first author goes to field 100, every following one to 700.
        first_author = True
        for i in range(len(authors)):
            subfields = [('a', '%s' % (authors[i]))]
            if authors_affiliations[i]:
                subfields.append(('v', authors_affiliations[i]))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', 'Springer')])
        if copyright_holder:
            record_add_field(rec,
                             '542',
                             subfields=[('f', copyright_holder), ('g', year)])
        if not series_title:
            series_title = publication_title

        # Field 773 is always added, even when no subfield was collected.
        subfields = []
        if series_title:
            subfields.append(('p', series_title))
        if volume:
            subfields.append(('v', volume))
        if issue:
            subfields.append(('n', issue))
        if page_range:
            subfields.append(('c', page_range))
        if year:
            subfields.append(('y', year))

        record_add_field(rec, '773', subfields=subfields)
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'BookChapter')])

        if fulltext:
            record_add_field(rec,
                             'FFT',
                             subfields=[('a', fulltext), ('t', 'Springer'),
                                        ('d', 'Fulltext')])

        recordString = record_xml_output(rec)
        #removes whitespace except spaces
        recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
        #removes two or more consecutive spaces
        recordString = re.sub(r' {2,}', '', recordString)
        record = parseString(recordString)

        references = []
        ref_fields = []
        references_container = self.content.find(
            'div', attrs={'id': 'abstract-references'})
        if references_container:
            references = references_container.findAll('li')
            for reference in references:
                try:
                    # Imported lazily so that a missing invenio install only
                    # triggers the ImportError fallback below.
                    from invenio.refextract_api import (
                        extract_references_from_string_xml)
                    ref = xml_to_text(parseString(reference.decode()))
                    #removes the space between hep-th/ and the identifier
                    ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
                    ref = extract_references_from_string_xml(ref)
                    ref = parseString(ref)
                    for field in ref.childNodes:
                        for subfield in field.getElementsByTagName('subfield'):
                            if subfield.getAttribute('code') == 'm':
                                # Strip "arXiv:" / "CrossRef" noise from the
                                # subfield text; drop the subfield entirely
                                # when nothing is left.
                                text = subfield.firstChild.data
                                text = re.sub(r'\[?arXiv:', '', text)
                                text = text.replace('CrossRef', '')
                                if text.startswith(': '):
                                    text = text[2:]
                                if text:
                                    subfield.firstChild.data = text
                                else:
                                    parentNode = subfield.parentNode
                                    parentNode.removeChild(subfield)
                        ref_fields.append(field.firstChild)
                except ImportError:
                    # NOTE(review): ``rec`` was already serialised into
                    # ``record`` above and is not serialised again, so this
                    # fallback field does not appear to reach the returned
                    # node -- confirm whether that is intended.
                    record_add_field(rec,
                                     '999',
                                     ind1='C',
                                     ind2='5',
                                     subfields=[('m', reference.decode())])
            for field in ref_fields:
                record.firstChild.appendChild(field)
        return record.firstChild

Example #19

0

Show file

File: springer_crawler.py Project: ksachs/harvesting-kit

    def _get_record(self, link):
        """Fetch the page referenced by *link* and build a record node.

        The ``href`` of the first ``a`` inside *link* is joined with
        ``self.base_url`` and downloaded. Values are scraped from the ``div``
        with id ``content`` (kept on ``self.content``), added to a record via
        ``record_add_field``, serialised with ``record_xml_output`` and parsed
        again; reference fields produced by
        ``extract_references_from_string_xml`` are appended to the result.

        :param link: object supporting ``link.find('a')['href']``
            (presumably a BeautifulSoup tag -- confirm against the caller).
        :return: ``firstChild`` of the parsed record document.
        """
        link = link.find('a')['href']
        url = urlparse.urljoin(self.base_url, link)
        page = urllib2.urlopen(url)
        page = BeautifulSoup(page)
        self.content = page.body.find('div', attrs={'id': 'content'})

        publication_title = self.content.find('div', {'id': 'publication-title'})
        if publication_title:
            publication_title = publication_title.find('a').text
        else:
            publication_title = ''
        series_title = self._find('a', {'id': 'series-title'})
        if series_title == 'NATO Science Series':
            series_title = 'NATO Sci.Ser.'
        title = self._find('h1', {'id': 'title'})
        # Volume: keep only the digits, falling back to the volume range.
        volume = self._find('span', {'id': 'book-volume'})
        if volume:
            volume = re.sub(r'\D', '', volume)
        else:
            volume = self._find('span', {'id': 'volume-range'})
            volume = re.sub(r'\D', '', volume)
        issue = self._find('a', {'id': 'issue-range'})
        if issue:
            # Only the second whitespace-separated token is kept.
            issue = issue.split()[1]
        # Year: digits of the copyright year, else the first four digits of
        # the cover date.
        year = self._find('span', {'id': 'copyright-year'})
        year = re.sub(r'\D', '', year)
        if not year:
            year = self._find('dd', {'id': 'abstract-about-cover-date'})
            year = re.sub(r'\D', '', year)[:4]
        abstract = self._find('div', {'class': 'abstract-content formatted'})
        page_range = self._find('span', {'id': 'page-range'})
        if page_range:
            page_range = page_range.replace('pp', '').strip()
        # NOTE(review): publisher, issn, subtitle, online_isbn and print_isbn
        # are looked up below but not used again within this method.
        publisher = self._find('dd', {'id': 'abstract-about-publisher'})
        copyright_holder = self._find('dd', {'id': 'abstract-about-book-copyright-holder'})
        issn = self._find('dd', {'id': 'abstract-about-book-series-print-issn'})
        doi = self._find('dd', {'class': 'doi'})
        subtitle = self._find('dd', {'id': 'abstract-about-book-series-subtitle'})
        online_isbn = self._find('dd', {'id': 'abstract-about-book-online-isbn'})
        print_isbn = self._find('dd', {'id': 'abstract-about-book-print-isbn'})
        # NOTE(review): editors and their affiliations are collected here but
        # not used again within this method.
        editors = []
        editors_affiliations = []
        for editor in self.content.findAll('li', attrs={'itemprop': 'editor'}):
            editors.append(editor.find('a').text)
            try:
                editors_affiliations.append(editor.find('sup')['title'])
            except KeyError:
                editors_affiliations.append('')
            except TypeError:
                editors_affiliations.append('')
        # authors and authors_affiliations are parallel lists: exactly one
        # affiliation entry (possibly '') is appended for every author.
        authors = []
        authors_affiliations = []
        summary = self.content.find('div', attrs={'class': 'summary'})
        for author in summary.findAll('li', attrs={'itemprop': 'author'}):
            author_name = author.find('a').text
            # Reorder "given ... last" into "last, given ..." before the
            # initials are collapsed.
            author_names = []
            author_names.append(author_name.split()[-1] + ",")
            author_names += author_name.split()[:-1]
            author_name = " ".join(author_names)
            author_name = collapse_initials(author_name)
            authors.append(author_name)
            try:
                authors_affiliations.append(author.find('sup')['title'])
            except KeyError:
                authors_affiliations.append('')
            except TypeError:
                authors_affiliations.append('')
        try:
            attrs = {'id': 'abstract-actions-download-chapter-pdf-link'}
            fulltext = self.content.find('a', attrs=attrs)
            fulltext = urlparse.urljoin(self.base_url, fulltext['href'])
        except TypeError:
            # Presumably raised when no download link was found
            # (subscripting None) -- confirm.
            fulltext = ''

        #create marc record
        rec = {}
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        # The first author goes to field 100, every following one to 700.
        first_author = True
        for i in range(len(authors)):
            subfields = [('a', '%s' % (authors[i]))]
            if authors_affiliations[i]:
                subfields.append(('v', authors_affiliations[i]))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract), ('9', 'Springer')])
        if copyright_holder:
            record_add_field(rec, '542', subfields=[('f', copyright_holder), ('g', year)])
        if not series_title:
            series_title = publication_title

        # Field 773 is always added, even when no subfield was collected.
        subfields = []
        if series_title:
            subfields.append(('p', series_title))
        if volume:
            subfields.append(('v', volume))
        if issue:
            subfields.append(('n', issue))
        if page_range:
            subfields.append(('c', page_range))
        if year:
            subfields.append(('y', year))

        record_add_field(rec, '773', subfields=subfields)
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'BookChapter')])

        if fulltext:
            record_add_field(rec, 'FFT', subfields=[('a', fulltext),
                                                    ('t', 'Springer'),
                                                    ('d', 'Fulltext')])

        recordString = record_xml_output(rec)
        #removes whitespaces except spaces
        recordString = re.sub(r'[\n\t\r\f\v]', '', recordString)
        #removes two or more consecutive spaces
        recordString = re.sub(r' {2,}', '', recordString)
        record = parseString(recordString)

        references = []
        ref_fields = []
        references_container = self.content.find('div', attrs={'id': 'abstract-references'})
        if references_container:
            references = references_container.findAll('li')
            for reference in references:
                ref = xml_to_text(parseString(reference.decode()))
                #removes the space between hep-th/ and the identifier
                ref = re.sub(r'hep-th/\s(\d*)', r'hep-th/\1', ref)
                ref = extract_references_from_string_xml(ref)
                ref = parseString(ref)
                for field in ref.childNodes:
                    for subfield in field.getElementsByTagName('subfield'):
                        if subfield.getAttribute('code') == 'm':
                            # Strip "arXiv:" / "CrossRef" noise from the
                            # subfield text; drop the subfield entirely when
                            # nothing is left.
                            text = subfield.firstChild.data
                            text = re.sub(r'\[?arXiv:', '', text)
                            text = text.replace('CrossRef', '')
                            if text.startswith(': '):
                                text = text[2:]
                            if text:
                                subfield.firstChild.data = text
                            else:
                                parentNode = subfield.parentNode
                                parentNode.removeChild(subfield)
                    ref_fields.append(field.firstChild)
            for field in ref_fields:
                record.firstChild.appendChild(field)
        return record.firstChild