Example #1
 def test_fix_journal_name(self):
     """Test journal name handling."""
     self.assertEqual(fix_journal_name("A&A", journal_mappings),
                      ('Astron.Astrophys.', ""))
     self.assertEqual(fix_journal_name("A&A B", journal_mappings),
                      ('Astron.Astrophys.', "B"))
     self.assertEqual(fix_journal_name("A&A.B", journal_mappings),
                      ('A&A.', "B"))
     self.assertEqual(fix_journal_name("A&AB.", journal_mappings),
                      ("A&AB.", ""))
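The assertions above pin down the expected contract of fix_journal_name without showing its body. Below is a minimal sketch derived only from those expectations, not the harvestingkit implementation: it assumes journal_mappings is a plain dict of known abbreviations (e.g. {'A&A': 'Astron.Astrophys.'}) and returns a (journal, volume) pair, splitting off a trailing capital letter when it follows a space or a dot.

 def fix_journal_name(journal, journal_mappings):
     """Sketch only: normalize a journal abbreviation, split a trailing volume letter."""
     if not journal:
         return '', ''
     volume = ''
     if (len(journal) > 2 and journal[-1].isupper()
             and journal[-2] in ('.', ' ')):
         volume = journal[-1]
         journal = journal[:-1]
     journal = journal.strip()
     # Unknown titles pass through unchanged, as in the last assertion above.
     return journal_mappings.get(journal, journal), volume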
Example #2
 def _get_references(self):
     """Yield the reference data found in each <ref> element."""
     for ref in self.document.getElementsByTagName('ref'):
         label = ref.getAttribute('id')
         label = sub(r'\D', '', label)
         # Default publication type in case this <ref> has no <mixed-citation>.
         ref_type = ''
         text_ref = ''
         ext_link = ''
         for mixed in ref.getElementsByTagName('mixed-citation'):
             ref_type = mixed.getAttribute('publication-type')
             if ref_type == 'thesis':
                 text_ref = get_value_in_tag(ref, 'mixed-citation')
             elif ref_type == 'conf-proc':
                 text_ref = get_value_in_tag(ref, 'mixed-citation')
             elif ref_type == 'other' or ref_type == 'web':
                 text_ref = get_value_in_tag(ref, 'mixed-citation')
                 ext_link = get_value_in_tag(mixed, 'ext-link')
             elif ref_type == 'book':
                 text_ref = xml_to_text(mixed)
         authors = []
         for auth in ref.getElementsByTagName('string-name'):
             surname = get_value_in_tag(auth, 'surname')
             given_names = get_value_in_tag(auth, 'given-names')
             given_names = collapse_initials(given_names)
             authors.append('%s, %s' % (surname, given_names))
         year = get_value_in_tag(ref, 'year')
         source = get_value_in_tag(ref, 'source')
         volume = get_value_in_tag(ref, 'volume')
         page = get_value_in_tag(ref, 'fpage')
         if ref_type == 'journal':
             source, vol = fix_journal_name(source, self.journal_mappings)
             if vol:
                 volume = vol + volume
         yield (label, ref_type, text_ref, ext_link, authors, year, source,
                volume, page)
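A hypothetical way to consume the generator above; the parser instance and the print formatting are illustrative only, since the real callers turn each tuple into MARC subfields instead.

 parser = JatsParser('article.xml')  # hypothetical object exposing _get_references
 for (label, ref_type, text_ref, ext_link,
      authors, year, source, volume, page) in parser._get_references():
     if ref_type == 'journal':
         print('[%s] %s: %s %s (%s) %s'
               % (label, '; '.join(authors), source, volume, year, page))
     else:
         print('[%s] %s %s' % (label, text_ref, ext_link))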
Example #5
 def _get_publication_information(self):
     """Return (journal, volume, issue, year, date, doi, article_id)."""
     journal = self._get_journal()
     date = self._get_date()
     doi = self._get_doi()
     journal, volume = fix_journal_name(journal, self.journal_mappings)
     article_id = get_value_in_tag(self.document, 'elocation-id')
     volume += get_value_in_tag(self.document, 'volume')
     issue = get_value_in_tag(self.document, 'issue')
     year = get_value_in_tag(self.document, 'copyright-year')
     return (journal, volume, issue, year, date, doi, article_id)
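Nearly every example on this page leans on get_value_in_tag to read the text of the first matching child element. The helper below is a simplified sketch for readability only, not the harvestingkit code, which also copes with nested markup (compare xml_to_text in Example #2).

 from xml.dom.minidom import Node, parseString

 def get_value_in_tag(document, tag):
     """Sketch: text content of the first <tag> element, or '' if absent."""
     elements = document.getElementsByTagName(tag)
     if not elements:
         return ''
     return ''.join(node.data for node in elements[0].childNodes
                    if node.nodeType == Node.TEXT_NODE).strip()

 doc = parseString('<article><volume>714</volume></article>')
 assert get_value_in_tag(doc, 'volume') == '714'
 assert get_value_in_tag(doc, 'issue') == ''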
Example #6
 def get_publication_information(self, xml_doc, path='', timeout=60):
     if self.CONSYN:
         publication = get_value_in_tag(xml_doc, "prism:publicationName")
         doi = get_value_in_tag(xml_doc, "prism:doi")
         issn = get_value_in_tag(xml_doc, "prism:issn")
         issue = get_value_in_tag(xml_doc, "prism:number")
         first_page = get_value_in_tag(xml_doc, "prism:startingPage")
         last_page = get_value_in_tag(xml_doc, "prism:endingPage")
         journal = publication.split(",")[0]
         journal, volume = fix_journal_name(journal, self.journal_mappings)
         try:
             vol = publication.split(",")[1].strip()
             if vol.startswith("Section"):
                 vol = vol[7:].strip()
             if vol and not volume:
                 volume = vol
         except IndexError:
             pass
         vol = get_value_in_tag(xml_doc, "prism:volume")
         if vol == "" and path != "":
             # if volume is not present try to harvest it
             try:
                 session = requests.session()
                 url = 'http://www.sciencedirect.com/science/article/pii'\
                       + path.split('/')[-1]
                 headers = {'user-agent': make_user_agent()}
                 r = session.get(url, headers=headers, timeout=timeout)
                 parsed_html = BeautifulSoup(r.text)
                 info = parsed_html.body.find('p',
                                              attrs={
                                                  'class': 'volIssue'
                                              }).text.split()
                 for s in info:
                     if unicode(s).find(u'\xe2') > 0:
                         first_page = s.rsplit(u'\xe2')[0]
                         last_page = s.rsplit(u'\x93')[1]
                 if info[1].lower() != 'online':
                     vol = info[1][:-1]
             except:
                 pass
         if vol:
             volume += vol
         start_date = self.get_publication_date(xml_doc)
         year = start_date.split("-")[0]
         doi = get_value_in_tag(xml_doc, "ce:doi")
         return (journal, issn, volume, issue, first_page, last_page, year,
                 start_date, doi)
     else:
         doi = self._get_doi(xml_doc)
         try:
             return self._dois[doi] + (doi, )
         except KeyError:
             return ('', '', '', '', '', '', '', '', doi)
Example #7
 def _get_publication_information(self):
     journal = self._get_journal()
     date = self._get_date()
     doi = self._get_doi()
     issue = get_value_in_tag(self.document, 'issue')
     journal, volume = fix_journal_name(journal, self.journal_mappings)
     volume += get_value_in_tag(self.document, 'volume')
     page = get_value_in_tag(self.document, 'elocation-id')
     fpage = get_value_in_tag(self.document, 'fpage')
     lpage = get_value_in_tag(self.document, 'lpage')
     year = date[:4]
     return (journal, volume, issue, year, date, doi, page, fpage, lpage)
Example #10
 def _add_references(self, xml_doc, rec):
     if self.CONSYN:
         for label, authors, doi, issue, page, title, volume, year,\
                 textref, ext_link, isjournal, comment, journal, publisher,\
                 editors, book_title in self.get_references(xml_doc):
             subfields = []
             if textref and not authors:
                 textref = textref.replace('\"', '\'')
                 ref_xml = extract_references_from_string_xml(textref)
                 dom = xml.dom.minidom.parseString(ref_xml)
                 fields = dom.getElementsByTagName("datafield")[0]
                 fields = fields.getElementsByTagName("subfield")
                 for field in fields:
                     data = field.firstChild.data
                     code = field.getAttribute("code")
                     if code == 's':
                         try:
                             journal = data.split(',')[0]
                             journal, vol = fix_journal_name(journal, self.journal_mappings)
                             vol += data.split(',')[1]
                             try:
                                 page = data.split(',')[2]
                                 journal = journal + "," + vol + "," + page
                                 subfields.append(('s', journal))
                             except IndexError:
                                 journal = journal + "," + vol
                                 subfields.append(('s', journal))
                         except IndexError:
                             subfields.append(('s', data))
                     else:
                         subfields.append((code, data))
                 if label:
                     label = re.sub(r"[\[\].)]", "", label)
                     subfields.append(('o', label))
                 if subfields:
                     record_add_field(rec, '999', ind1='C', ind2='5',
                                      subfields=subfields)
             else:
                 if doi:
                     subfields.append(('a', doi))
                 for author in authors:
                     subfields.append(('h', author))
                 if issue:
                     subfields.append(('n', issue))
                 if ext_link:
                     subfields.append(('r', ext_link))
                 if title:
                     subfields.append(('t', title))
                 elif textref:
                     subfields.append(('m', textref))
                 if publisher:
                     subfields.append(('p', publisher))
                 if volume:
                     subfields.append(('v', volume))
                 if year:
                     subfields.append(('y', year))
                 if comment:
                     subfields.append(('m', comment))
                 for editor in editors:
                     subfields.append(('e', editor))
                 if book_title:
                     subfields.append(('q', book_title))
                 if label:
                     label = re.sub(r"[\[\].)]", "", label)
                     subfields.append(('o', label))
                 if journal:
                     journal, vol = fix_journal_name(journal, self.journal_mappings)
                     volume = vol + volume
                     if volume and page:
                         journal = journal + "," + volume + "," + page
                         subfields.append(('s', journal))
                     elif volume:
                         journal = journal + "," + volume
                         subfields.append(('s', journal))
                     else:
                         subfields.append(('s', journal))
                 if subfields:
                     record_add_field(rec, '999', ind1='C', ind2='5',
                                      subfields=subfields)
     else:
         for label, authors, doi, issue, page, title, volume, year,\
                 textref, ext_link in self.get_references(xml_doc):
             subfields = []
             if doi:
                 subfields.append(('a', doi))
             for author in authors:
                 subfields.append(('h', author))
             if issue:
                 subfields.append(('n', issue))
             if label:
                 subfields.append(('o', label))
             if page:
                 subfields.append(('p', page))
             if ext_link:
                 subfields.append(('r', ext_link))
             if title and volume and year and page:
                 subfields.append(
                     ('s', '%s %s (%s) %s' % (title, volume, year, page)))
             elif textref:
                 subfields.append(('m', textref))
             if title:
                 subfields.append(('t', title))
             if volume:
                 subfields.append(('v', volume))
             if year:
                 subfields.append(('y', year))
             if subfields:
                 record_add_field(
                     rec, '999', ind1='C', ind2='5', subfields=subfields)
 def _add_references(self, xml_doc, rec, refextract_callback=None):
     for label, authors, doi, issue, page, title, volume, year,\
             textref, ext_link, isjournal, comment, journal, publisher,\
             editors, book_title in self.get_references(xml_doc):
         subfields = []
         if textref and not authors:
             textref = textref.replace('\"', '\'')
             if refextract_callback:
                 ref_xml = refextract_callback(textref)
                 dom = xml.dom.minidom.parseString(ref_xml)
                 fields = dom.getElementsByTagName("datafield")[0]
                 fields = fields.getElementsByTagName("subfield")
                 for field in fields:
                     data = field.firstChild.data
                     code = field.getAttribute("code")
                     if code == 'r':
                         data = fix_dashes(data)
                     subfields.append((code, data))
                 if fields:
                     subfields.append(('9', 'refextract'))
             else:
                 subfields.append(('m', textref))
             if label:
                 label = re.sub(r"[\[\].)]", "", label)
                 subfields.append(('o', label))
             if subfields:
                 record_add_field(rec, '999', ind1='C', ind2='5',
                                  subfields=subfields)
         else:
             if doi:
                 subfields.append(('a', doi))
             for author in authors:
                 subfields.append(('h', author))
             if ext_link:
                 ext_link = fix_dashes(ext_link)
                 subfields.append(('r', ext_link))
             if title:
                 subfields.append(('t', title))
             elif textref:
                 subfields.append(('m', textref))
             if publisher:
                 subfields.append(('p', publisher))
             if volume:
                 subfields.append(('v', volume))
             if year:
                 subfields.append(('y', year))
             if comment:
                 subfields.append(('m', comment))
             for editor in editors:
                 subfields.append(('e', editor))
             if book_title:
                 subfields.append(('q', book_title))
             if label:
                 label = re.sub(r"[\[\].)]", "", label)
                 subfields.append(('o', label))
             if journal:
                 journal, vol = fix_journal_name(journal,
                                                 self.journal_mappings)
                 volume = vol + volume
                 if volume and page:
                     journal = journal + "," + volume + "," + page
                     subfields.append(('s', journal))
                 elif volume:
                     journal = journal + "," + volume
                     subfields.append(('s', journal))
                 else:
                     subfields.append(('s', journal))
             if textref:
                 subfields.append(('m', textref))
             if subfields:
                 record_add_field(rec, '999', ind1='C', ind2='5',
                                  subfields=subfields)
    def get_record(self, path=None, no_pdf=False,
                   test=False, refextract_callback=None):
        """Convert a record to MARCXML format.

        :param path: path to a record.
        :type path: string
        :param test: flag to determine if it is a test call.
        :type test: bool
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formatted string
                                    of the reference.
        :type refextract_callback: callable

        :returns: marcxml formatted string.
        """
        xml_doc = self.get_article(path)
        rec = create_record()
        title = self.get_title(xml_doc)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        (journal, dummy, volume, issue, first_page, last_page, year,
         start_date, doi) = self.get_publication_information(xml_doc, path)
        if not journal:
            journal = self.get_article_journal(xml_doc)
        if start_date:
            record_add_field(rec, '260', subfields=[('c', start_date),
                                                    ('t', 'published')])
        else:
            record_add_field(
                rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        license, license_url = self.get_license(xml_doc)
        if license and license_url:
            record_add_field(rec, '540', subfields=[('a', license),
                                                    ('u', license_url)])
        elif license_url:
            record_add_field(rec, '540', subfields=[('u', license_url)])
        self.logger.info("Creating record: %s %s" % (path, doi))
        authors = self.get_authors(xml_doc)
        first_author = True
        for author in authors:
            author_name = (author['surname'], author.get(
                'given_name') or author.get('initials'))
            subfields = [('a', '%s, %s' % author_name)]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = self.get_abstract(xml_doc)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'Elsevier')])
        record_copyright = self.get_copyright(xml_doc)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = self.get_keywords(xml_doc)
        if self.CONSYN:
            for tag in xml_doc.getElementsByTagName('ce:collaboration'):
                collaboration = get_value_in_tag(tag, 'ce:text')
                if collaboration:
                    record_add_field(rec, '710',
                                     subfields=[('g', collaboration)])

            # We add subjects also as author keywords
            subjects = xml_doc.getElementsByTagName('dct:subject')
            for subject in subjects:
                for listitem in subject.getElementsByTagName('rdf:li'):
                    keyword = xml_to_text(listitem)
                    if keyword not in keywords:
                        keywords.append(keyword)
            if keywords:
                for keyword in keywords:
                    record_add_field(rec, '653', ind1='1',
                                     subfields=[('a', keyword),
                                                ('9', 'author')])
            journal, dummy = fix_journal_name(journal.strip(),
                                              self.journal_mappings)
            subfields = []
            doctype = self.get_doctype(xml_doc)
            try:
                page_count = int(last_page) - int(first_page) + 1
                record_add_field(rec, '300',
                                 subfields=[('a', str(page_count))])
            except ValueError:  # do nothing
                pass
            if doctype == 'err':
                subfields.append(('m', 'Erratum'))
            elif doctype == 'add':
                subfields.append(('m', 'Addendum'))
            elif doctype == 'pub':
                subfields.append(('m', 'Publisher Note'))
            elif doctype == 'rev':
                record_add_field(rec, '980', subfields=[('a', 'Review')])
            if journal:
                subfields.append(('p', journal))
            if first_page and last_page:
                subfields.append(('c', '%s-%s' %
                                       (first_page, last_page)))
            elif first_page:
                subfields.append(('c', first_page))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            record_add_field(rec, '773', subfields=subfields)
            if not test:
                if license:
                    url = 'http://www.sciencedirect.com/science/article/pii/'\
                          + path.split('/')[-1][:-4]
                    record_add_field(rec, '856', ind1='4',
                                     subfields=[('u', url),
                                                ('y', 'Elsevier server')])
                    record_add_field(rec, 'FFT', subfields=[('a', path),
                                                            ('t', 'INSPIRE-PUBLIC'),
                                                            ('d', 'Fulltext')])
                else:
                    record_add_field(rec, 'FFT', subfields=[('a', path),
                                                            ('t', 'Elsevier'),
                                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
        else:
            licence = 'http://creativecommons.org/licenses/by/3.0/'
            record_add_field(rec,
                             '540',
                             subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
            if keywords:
                for keyword in keywords:
                    record_add_field(
                        rec, '653', ind1='1', subfields=[('a', keyword),
                                    ('9', 'author')])

            pages = ''
            if first_page and last_page:
                pages = '{0}-{1}'.format(first_page, last_page)
            elif first_page:
                pages = first_page

            subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                                ('v', volume),
                                                                ('n', issue),
                                                                ('c', pages),
                                                                ('y', year)])

            record_add_field(rec, '773', subfields=subfields)
            if not no_pdf:
                from invenio.search_engine import perform_request_search
                query = '0247_a:"%s" AND NOT 980:DELETED"' % (doi,)
                prev_version = perform_request_search(p=query)

                old_pdf = False

                if prev_version:
                    from invenio.bibdocfile import BibRecDocs
                    prev_rec = BibRecDocs(prev_version[0])
                    try:
                        pdf_path = prev_rec.get_bibdoc('main')
                        pdf_path = pdf_path.get_file(
                            ".pdf;pdfa", exact_docformat=True)
                        pdf_path = pdf_path.fullpath
                        old_pdf = True
                        record_add_field(rec, 'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        message = ('Leaving previously delivered PDF/A for: '
                                   + doi)
                        self.logger.info(message)
                    except:
                        pass
                try:
                    if exists(join(path, 'main_a-2b.pdf')):
                        pdf_path = join(path, 'main_a-2b.pdf')
                        record_add_field(rec, 'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        self.logger.debug('Adding PDF/A to record: %s'
                                          % (doi,))
                    elif exists(join(path, 'main.pdf')):
                        pdf_path = join(path, 'main.pdf')
                        record_add_field(rec, 'FFT', subfields=[('a', pdf_path)])
                    else:
                        if not old_pdf:
                            message = "Record " + doi
                            message += " doesn't contain PDF file."
                            self.logger.warning(message)
                            raise MissingFFTError(message)
                except MissingFFTError:
                    message = "Elsevier paper: %s is missing PDF." % (doi,)
                    register_exception(alert_admin=True, prefix=message)
                version = self.get_elsevier_version(find_package_name(path))
                record_add_field(rec, '583', subfields=[('l', version)])
                xml_path = join(path, 'main.xml')
                record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
                record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                        ('b', 'Elsevier')])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Example #14
 def test_fix_journal_name(self):
     """Test journal name handling."""
     self.assertEqual(fix_journal_name("A&A", journal_mappings), ('Astron.Astrophys.', ""))
     self.assertEqual(fix_journal_name("A&A B", journal_mappings), ('Astron.Astrophys.', "B"))
     self.assertEqual(fix_journal_name("A&A.B", journal_mappings), ('Astron.Astrophys.', "B"))
     self.assertEqual(fix_journal_name("A&AB.", journal_mappings), ("A&AB.", ""))
Example #15
 def _add_references(self, xml_doc, rec, refextract_callback=None):
     for label, authors, doi, issue, page, title, volume, year,\
             textref, ext_link, isjournal, comment, journal, publisher,\
             editors, book_title in self.get_references(xml_doc):
         subfields = []
         if textref and not authors:
             textref = textref.replace('\"', '\'')
             if refextract_callback:
                 ref_xml = refextract_callback(textref)
                 dom = xml.dom.minidom.parseString(ref_xml)
                 fields = dom.getElementsByTagName("datafield")[0]
                 fields = fields.getElementsByTagName("subfield")
                 for field in fields:
                     data = field.firstChild.data
                     code = field.getAttribute("code")
                     if code == 'r':
                         data = fix_dashes(data)
                     subfields.append((code, data))
                 if fields:
                     subfields.append(('9', 'refextract'))
             else:
                 subfields.append(('m', textref))
             if label:
                 label = re.sub(r"[\[\].)]", "", label)
                 subfields.append(('o', label))
             if subfields:
                 record_add_field(rec,
                                  '999',
                                  ind1='C',
                                  ind2='5',
                                  subfields=subfields)
         else:
             if doi:
                 subfields.append(('a', doi))
             for author in authors:
                 subfields.append(('h', author))
             if ext_link:
                 ext_link = fix_dashes(ext_link)
                 subfields.append(('r', ext_link))
             if title:
                 subfields.append(('t', title))
             elif textref:
                 subfields.append(('m', textref))
             if publisher:
                 subfields.append(('p', publisher))
             if volume:
                 subfields.append(('v', volume))
             if year:
                 subfields.append(('y', year))
             if comment:
                 subfields.append(('m', comment))
             for editor in editors:
                 subfields.append(('e', editor))
             if book_title:
                 subfields.append(('q', book_title))
             if label:
                 label = re.sub(r"[\[\].)]", "", label)
                 subfields.append(('o', label))
             if journal:
                 journal, vol = fix_journal_name(journal,
                                                 self.journal_mappings)
                 volume = vol + volume
                 if volume and page:
                     journal = journal + "," + volume + "," + page
                     subfields.append(('s', journal))
                 elif volume:
                     journal = journal + "," + volume
                     subfields.append(('s', journal))
                 else:
                     subfields.append(('s', journal))
             if textref:
                 subfields.append(('m', textref))
             if subfields:
                 record_add_field(rec,
                                  '999',
                                  ind1='C',
                                  ind2='5',
                                  subfields=subfields)
Example #16
 def _get_reference(self, ref):
     """Retrieve the data for a reference."""
     label = get_value_in_tag(ref, 'label')
     label = re.sub(r'\D', '', label)
     for innerref in ref.getElementsByTagName('mixed-citation'):
         ref_type = innerref.getAttribute('publication-type')
         institution = get_value_in_tag(innerref, 'institution')
         report_no = ''
         for tag in innerref.getElementsByTagName('pub-id'):
             if tag.getAttribute('pub-id-type') == 'other':
                 if tag.hasChildNodes():
                     report_no = get_all_text(tag)
         doi = ''
         for tag in innerref.getElementsByTagName('pub-id'):
             if tag.getAttribute('pub-id-type') == 'doi':
                 doi = xml_to_text(tag)
         collaboration = get_value_in_tag(innerref, 'collab')
         authors = []
         person_groups = innerref.getElementsByTagName('person-group')
         for author_group in person_groups:
             if author_group.getAttribute('person-group-type') == 'author':
                 for author in author_group.getElementsByTagName(
                         'string-name'):
                     if author.hasChildNodes():
                         authors.append(get_all_text(author))
         editors = []
         for editor_group in person_groups:
             if editor_group.getAttribute('person-group-type') == 'editor':
                 for editor in editor_group.getElementsByTagName(
                         'string-name'):
                     if editor.hasChildNodes():
                         editors.append(get_all_text(editor))
         journal = get_value_in_tag(innerref, 'source')
         journal, volume = fix_journal_name(journal, self.journal_mappings)
         volume += get_value_in_tag(innerref, 'volume')
         if journal == 'J.High Energy Phys.' or journal == 'JHEP':
             issue = get_value_in_tag(innerref, 'issue')
             volume = volume[2:] + issue
             journal = 'JHEP'
         page = get_value_in_tag(innerref, 'page-range')
         year = get_value_in_tag(innerref, 'year')
         external_link = get_value_in_tag(innerref, 'ext-link')
         arxiv = ''
         for tag in innerref.getElementsByTagName('pub-id'):
             if tag.getAttribute('pub-id-type') == 'arxiv':
                 if tag.hasChildNodes():
                     arxiv = get_all_text(tag)
         arxiv = format_arxiv_id(arxiv)
         publisher = get_value_in_tag(innerref, 'publisher-name')
         publisher_location = get_value_in_tag(innerref, 'publisher-loc')
         if publisher_location:
             publisher = publisher_location + ': ' + publisher
         unstructured_text = []
         for child in innerref.childNodes:
             if child.nodeType == child.TEXT_NODE:
                 text = child.nodeValue.strip()
                 text = re.sub(r'[\[\]\(\.;\)]', '', text).strip()
                 if text.startswith(','):
                     text = text[1:].strip()
                 if text.endswith('Report No'):
                     text = institution + " " + text
                     institution = ''
                     text = text.strip()
                 elif text.endswith(' ed'):
                     text += '.'
                 elif text.endswith('PhD thesis,'):
                     if institution:
                         text += ' ' + institution
                         institution = ''
                     else:
                         text = text[:-1]
                 elif text.startswith('Seminar,'):
                     article_title = get_value_in_tag(
                         innerref, 'article-title')
                     text = institution + " Seminar, \"" + article_title + "\""
                     institution = ''
                 elif text == u'\u201d':
                     text = ''
                 ignore_text = ['in', 'pp', 'edited by']
                 if text.startswith('Vol'):
                     temp = re.sub(r'\D', '', text)
                     if temp:
                         volume += temp
                 elif len(text) > 1 and text not in ignore_text\
                         and not (text.isdigit() or text[:-1].isdigit()):
                     unstructured_text.append(text)
         if unstructured_text:
             unstructured_text = " ".join(unstructured_text)
         if ref_type == 'book':
             if volume and not volume.lower().startswith('vol'):
                 volume = 'Vol ' + volume
             if volume and page:
                 volume = volume + ', pp ' + page
         yield ref_type, doi, authors, collaboration, journal, volume, page, year,\
             label, arxiv, publisher, institution, unstructured_text, external_link,\
             report_no, editors
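A hypothetical consumer of the tuples yielded above; the field order follows the yield statement, but the flattened citation format is illustrative only, not what the real record builder emits.

 def format_reference(fields):
     """Sketch: flatten one yielded reference tuple into a citation string."""
     (ref_type, doi, authors, collaboration, journal, volume, page, year,
      label, arxiv, publisher, institution, unstructured_text, external_link,
      report_no, editors) = fields
     parts = ['; '.join(authors), journal, volume, page, year, doi, arxiv]
     return '[%s] %s' % (label, ', '.join(p for p in parts if p))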
Example #17
    def get_record(self, path=None, no_pdf=False):
        xml_doc = self.get_article(path)
        rec = {}
        title = self.get_title(xml_doc)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        journal, dummy, volume, issue, first_page,\
            last_page, year, start_date, doi = self.get_publication_information(
                xml_doc)
        if not journal:
            journal = self.get_article_journal(xml_doc)
        if start_date:
            record_add_field(rec, '260', subfields=[('c', start_date)])
        else:
            record_add_field(
                rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        self.logger.info("Creating record: %s %s" % (path, doi))
        authors = self.get_authors(xml_doc)
        first_author = True
        for author in authors:
            author_name = (author['surname'], author.get(
                'given_name') or author.get('initials'))
            subfields = [('a', '%s, %s' % author_name)]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))
            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)
        abstract = self.get_abstract(xml_doc)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'Elsevier')])
        copyrightt = self.get_copyright(xml_doc)
        if copyrightt:
            record_add_field(rec, '542', subfields=[('f', copyrightt)])
        keywords = self.get_keywords(xml_doc)
        if self.CONSYN:
            if keywords:
                for keyword in keywords:
                    record_add_field(
                        rec, '653', ind1='1', subfields=[('a', keyword),
                                    ('9', 'author')])
            journal, dummy = fix_journal_name(journal.strip(),
                                              self.journal_mappings)
            subfields = []
            doctype = self.get_doctype(xml_doc)
            if doctype == 'err':
                subfields.append(('m', 'Erratum'))
            elif doctype == 'add':
                subfields.append(('m', 'Addendum'))
            elif doctype == 'pub':
                subfields.append(('m', 'Publisher Note'))
            if journal:
                subfields.append(('p', journal))
            if first_page and last_page:
                subfields.append(('c', '%s-%s' % (first_page, last_page)))
            elif first_page:
                subfields.append(('c', first_page))
            if volume:
                subfields.append(('v', volume))
            if issue:
                subfields.append(('n', issue))
            if year:
                subfields.append(('y', year))
            record_add_field(rec, '773', subfields=subfields)
        else:
            licence = 'http://creativecommons.org/licenses/by/3.0/'
            record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'),
                                                    ('u', licence)])
            if keywords:
                for keyword in keywords:
                    record_add_field(
                        rec, '653', ind1='1', subfields=[('a', keyword),
                                    ('9', 'author')])
            record_add_field(rec, '773', subfields=[('p', journal),
                                                    ('v', volume),
                                                    ('n', issue),
                                                    ('c', '%s-%s' % (
                                                        first_page, last_page)),
                                                    ('y', year)])
        self._add_references(xml_doc, rec)
        if self.CONSYN:
            record_add_field(rec, 'FFT', subfields=[('a', path),
                                                    ('t', 'Elsevier'),
                                                    ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            if doctype == 'rev':
                record_add_field(rec, '980', subfields=[('a', 'Review')])
        else:
            if not no_pdf:
                from invenio.search_engine import search_pattern
                query = '0247_a:"%s" AND NOT 980:DELETED"' % (doi,)
                prev_version = search_pattern(p=query)
                from invenio.bibdocfile import BibRecDocs
                old_pdf = False

                if prev_version:
                    prev_rec = BibRecDocs(prev_version[0])
                    try:
                        pdf_path = prev_rec.get_bibdoc('main')
                        pdf_path = pdf_path.get_file(
                            ".pdf;pdfa", exact_docformat=True)
                        pdf_path = pdf_path.fullpath
                        old_pdf = True
                        record_add_field(rec, 'FFT', subfields=[('a', pdf_path),
                                                                ('n', 'main'),
                                                                ('f', '.pdf;pdfa')])
                        message = 'Leaving previously delivered PDF/A for: ' + doi
                        self.logger.info(message)
                    except Exception:
                        # no previously delivered PDF/A to carry over
                        pass
                try:
                    if exists(join(path, 'main_a-2b.pdf')):
                        path = join(path, 'main_a-2b.pdf')
                        record_add_field(rec, 'FFT', subfields=[('a', path),
                                                                ('n', 'main'),
                                                                ('f', '.pdf;pdfa')])
                        self.logger.debug('Adding PDF/A to record: %s' % (doi,))
                    elif exists(join(path, 'main.pdf')):
                        path = join(path, 'main.pdf')
                        record_add_field(rec, 'FFT', subfields=[('a', path)])
                    else:
                        if not old_pdf:
                            message = "Record " + doi
                            message += " doesn't contain PDF file."
                            self.logger.warning(message)
                            raise MissingFFTError(message)
                except MissingFFTError:
                    message = "Elsevier paper: %s is missing PDF." % (doi,)
                    register_exception(alert_admin=True, prefix=message)
                version = self.get_elsevier_version(find_package_name(path))
                record_add_field(rec, '583', subfields=[('l', version)])
                path = join(path, 'main.xml')
                record_add_field(rec, 'FFT', subfields=[('a', path)])
                record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                        ('b', 'Elsevier')])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Example #18
0
 def get_publication_information(self, xml_doc):
     if self.CONSYN:
         publication = get_value_in_tag(xml_doc, "prism:publicationName")
         doi = get_value_in_tag(xml_doc, "prism:doi")
         issn = get_value_in_tag(xml_doc, "prism:issn")
         issue = get_value_in_tag(xml_doc, "prism:number")
         first_page = get_value_in_tag(xml_doc, "prism:startingPage")
         last_page = get_value_in_tag(xml_doc, "prism:endingPage")
         journal = publication.split(",")[0]
         journal, volume = fix_journal_name(journal, self.journal_mappings)
         try:
             vol = publication.split(",")[1].strip()
             if vol.startswith("Section"):
                 vol = vol[7:].strip()
             if vol and not volume:
                 volume = vol
         except IndexError:
             pass
         vol = get_value_in_tag(xml_doc, "prism:volume")
         if vol == "":
             # if volume is not present try to harvest it
             try:
                 session = requests.session()
                 r = session.get("http://dx.doi.org/" + doi)
                 parsed_html = BeautifulSoup(r.text)
                 info = parsed_html.body.find(
                     'p', attrs={'class': 'volIssue'}).text.split()
                 for s in info:
                     if unicode(s).find(u'\xe2') > 0:
                         # token is a page range joined by mis-decoded en-dash
                         # bytes; keep the text before/after it as first/last page
                         first_page = s.rsplit(u'\xe2')[0]
                         last_page = s.rsplit(u'\x93')[1]
                 if info[1].lower() != 'online':
                     vol = info[1][:-1]
             except Exception:
                 # volume harvesting from dx.doi.org is best effort only
                 pass
         if vol:
             volume += vol
         year = xml_doc.getElementsByTagName(
             'ce:copyright')[0].getAttribute("year")
         year = year.encode('utf-8')
         start_date = get_value_in_tag(xml_doc, "prism:coverDate")
         if len(xml_doc.getElementsByTagName('ce:date-accepted')) > 0:
             full_date = xml_doc.getElementsByTagName('ce:date-accepted')[0]
             y = full_date.getAttribute('year').encode('utf-8')
             m = full_date.getAttribute('month').encode('utf-8').zfill(2)
             d = full_date.getAttribute('day').encode('utf-8').zfill(2)
             start_date = "%s-%s-%s" % (y, m, d)
         elif len(start_date) == 8:
             start_date = time.strftime(
                 '%Y-%m-%d', time.strptime(start_date, '%Y%m%d'))
         elif len(start_date) == 6:
             start_date = time.strftime(
                 '%Y-%m', time.strptime(start_date, '%Y%m'))
         doi = get_value_in_tag(xml_doc, "ce:doi")
         return (journal, issn, volume, issue, first_page,
                 last_page, year, start_date, doi)
     else:
         doi = self._get_doi(xml_doc)
         try:
             return self._dois[doi] + (doi, )
         except KeyError:
             return ('', '', '', '', '', '', '', '', doi)
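A minimal standalone sketch of the prism:coverDate normalisation used above (the same length-based YYYYMMDD / YYYYMM handling); normalize_cover_date is a hypothetical name:

 import time

 def normalize_cover_date(raw_date):
     """Turn 'YYYYMMDD' or 'YYYYMM' cover dates into ISO-style strings (sketch)."""
     if len(raw_date) == 8:
         return time.strftime('%Y-%m-%d', time.strptime(raw_date, '%Y%m%d'))
     if len(raw_date) == 6:
         return time.strftime('%Y-%m', time.strptime(raw_date, '%Y%m'))
     return raw_date

 # normalize_cover_date('20140317') -> '2014-03-17'
 # normalize_cover_date('201403')   -> '2014-03'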
Example #19
0
    def get_record_rich(self, filename, ref_extract_callback=None):
        """
        Get the MARC XML for a file in the xaml_rich directory.

        :param filename: the name of the file to parse.
        :type filename: string

        :returns: a string with the MARC XML version of the file.
        """
        self.document = parse(filename)
        rec = create_record()
        articles = self.document.getElementsByTagName('ArticleID')
        for article in articles:
            article_type = article.getAttribute('Type')
            if not article_type == 'Article':
                return ''
            doi = get_value_in_tag(self.document, 'DOI')
            date = ''
            for tag in self.document.getElementsByTagName('Accepted'):
                year = get_value_in_tag(tag, 'Year')
                month = get_value_in_tag(tag, 'Month').zfill(2)
                day = get_value_in_tag(tag, 'Day').zfill(2)
                date = "%s-%s-%s" % (year, month, day)
            if not date:
                for tag in self.document.getElementsByTagName('OnlineDate'):
                    year = get_value_in_tag(tag, 'Year')
                    month = get_value_in_tag(tag, 'Month').zfill(2)
                    day = get_value_in_tag(tag, 'Day').zfill(2)
                    date = "%s-%s-%s" % (year, month, day)
            first_page = get_value_in_tag(article, 'FirstPage')
            last_page = get_value_in_tag(article, 'LastPage')
            subjects = article.getElementsByTagName('Keyword')
            subjects = map(xml_to_text, subjects)
            subject = ', '.join(subjects)
            copyright_statement = get_value_in_tag(article, 'Copyright')
        journal = get_value_in_tag(self.document, 'JournalTitle')
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        issues = self.document.getElementsByTagName('IssueID')
        for issue in issues:
            volume += get_value_in_tag(issue, 'Volume')
            year = get_value_in_tag(issue, 'Year')
        title = get_value_in_tag(self.document, 'Title')
        authors = self.document.getElementsByTagName('Author')
        affiliations = self.document.getElementsByTagName('Affiliation')

        def affiliation_pair(a):
            return a.getAttribute('ID'), get_value_in_tag(
                a, 'UnstructuredAffiliation')

        affiliations = map(affiliation_pair, affiliations)
        affiliations = dict(affiliations)

        def author_pair(a):
            surname = get_value_in_tag(a, 'LastName')
            first_name = get_value_in_tag(a, 'FirstName')
            middle_name = get_value_in_tag(a, 'MiddleName')
            if middle_name:
                name = '%s, %s %s' % (surname, first_name, middle_name)
            else:
                name = '%s, %s' % (surname, first_name)
            try:
                affid = a.getElementsByTagName(
                    'AffiliationID')[0].getAttribute('Label')
                affiliation = affiliations[affid]
            except (IndexError, KeyError):
                affiliation = ''
            return name, affiliation

        authors = map(author_pair, authors)
        abstract = get_value_in_tag(self.document, 'Abstract')
        references = self.document.getElementsByTagName('Bibliomixed')

        for reference in references:
            subfields = []
            label = reference.getAttribute('N')
            if label:
                subfields.append(('o', label))
            bibliosets = reference.getElementsByTagName('Biblioset')
            for tag in bibliosets:
                ref_year = get_value_in_tag(tag, 'Date')
                ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
                ref_journal, ref_volume = fix_journal_name(
                    ref_journal, self.journal_mappings)
                ref_volume += get_value_in_tag(tag, 'Volume')
                ref_page = get_value_in_tag(tag, 'ArtPageNums')
                if ref_year:
                    subfields.append(('y', ref_year))
                if ref_journal and ref_volume and ref_page:
                    subfields.append(
                        ('s',
                         '%s,%s,%s' % (ref_journal, ref_volume, ref_page)))
                reference.removeChild(tag)
            text_ref = xml_to_text(reference)
            if ref_extract_callback:
                ref_xml = ref_extract_callback(text_ref)
                dom = parseString(ref_xml)
                fields = dom.getElementsByTagName("datafield")[0]
                fields = fields.getElementsByTagName("subfield")
                if fields:
                    subfields.append(('9', 'refextract'))
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 'm' and bibliosets:
                        continue
                    else:
                        subfields.append((code, data))
            else:
                subfields.append(('m', text_ref))
            if subfields:
                record_add_field(rec,
                                 '999',
                                 ind1='C',
                                 ind2='5',
                                 subfields=subfields)

        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        if date:
            record_add_field(rec,
                             '260',
                             subfields=[('c', date), ('t', 'published')])
        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', 'EDPSciences')])
        first_author = True
        for author in authors:
            if first_author:
                subfields = [('a', author[0])]
                if author[1]:
                    subfields.append(('v', author[1]))
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                subfields = [('a', author[0])]
                if author[1]:
                    subfields.append(('v', author[1]))
                record_add_field(rec, '700', subfields=subfields)
        subfields = []
        if journal and volume and first_page:
            subfields.append(('s', "%s,%s,%s" % (journal, volume, first_page)))
        if first_page and last_page:
            try:
                number_of_pages = int(last_page) - int(first_page)
                record_add_field(rec,
                                 '300',
                                 subfields=[('a', str(number_of_pages))])
            except ValueError:
                pass
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        if copyright_statement:
            record_add_field(rec,
                             '542',
                             subfields=[('f', copyright_statement)])
        if subject:
            record_add_field(rec,
                             '650',
                             ind1='1',
                             ind2='7',
                             subfields=[('2', 'EDPSciences'), ('a', subject)])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Example #20
0
 def _get_reference(self, ref):
     """Retrieve the data for a reference."""
     label = get_value_in_tag(ref, 'label')
     label = re.sub(r'\D', '', label)
     for innerref in ref.getElementsByTagName('mixed-citation'):
         ref_type = innerref.getAttribute('publication-type')
         institution = get_value_in_tag(innerref, 'institution')
         report_no = ''
         for tag in innerref.getElementsByTagName('pub-id'):
             if tag.getAttribute('pub-id-type') == 'other':
                 if tag.hasChildNodes():
                     report_no = get_all_text(tag)
         doi = ''
         for tag in innerref.getElementsByTagName('pub-id'):
             if tag.getAttribute('pub-id-type') == 'doi':
                 doi = xml_to_text(tag)
         collaboration = get_value_in_tag(innerref, 'collab')
         authors = []
         person_groups = innerref.getElementsByTagName('person-group')
         for author_group in person_groups:
             if author_group.getAttribute('person-group-type') == 'author':
                 for author in author_group.getElementsByTagName('string-name'):
                     if author.hasChildNodes():
                         authors.append(get_all_text(author))
         editors = []
         for editor_group in person_groups:
             if editor_group.getAttribute('person-group-type') == 'editor':
                 for editor in editor_group.getElementsByTagName('string-name'):
                     if editor.hasChildNodes():
                         editors.append(get_all_text(editor))
         journal = get_value_in_tag(innerref, 'source')
         journal, volume = fix_journal_name(journal, self.journal_mappings)
         volume += get_value_in_tag(innerref, 'volume')
         if journal == 'J.High Energy Phys.' or journal == 'JHEP':
             issue = get_value_in_tag(innerref, 'issue')
             volume = volume[2:] + issue
             journal = 'JHEP'
         page = get_value_in_tag(innerref, 'page-range')
         year = get_value_in_tag(innerref, 'year')
         external_link = get_value_in_tag(innerref, 'ext-link')
         arxiv = ''
         for tag in innerref.getElementsByTagName('pub-id'):
             if tag.getAttribute('pub-id-type') == 'arxiv':
                 if tag.hasChildNodes():
                     arxiv = get_all_text(tag)
         arxiv = format_arxiv_id(arxiv)
         publisher = get_value_in_tag(innerref, 'publisher-name')
         publisher_location = get_value_in_tag(innerref, 'publisher-loc')
         if publisher_location:
             publisher = publisher_location + ': ' + publisher
         unstructured_text = []
         for child in innerref.childNodes:
             if child.nodeType == child.TEXT_NODE:
                 text = child.nodeValue.strip()
                 text = re.sub(r'[\[\]\(\.;\)]', '', text).strip()
                 if text.startswith(','):
                     text = text[1:].strip()
                 if text.endswith('Report No'):
                     text = institution + " " + text
                     institution = ''
                     text = text.strip()
                 elif text.endswith(' ed'):
                     text += '.'
                 elif text.endswith('PhD thesis,'):
                     if institution:
                         text += ' ' + institution
                         institution = ''
                     else:
                         text = text[:-1]
                 elif text.startswith('Seminar,'):
                     article_title = get_value_in_tag(innerref, 'article-title')
                     text = institution + " Seminar, \"" + article_title + "\""
                     institution = ''
                 elif text == u'\u201d':
                     text = ''
                 ignore_text = ['in', 'pp', 'edited by']
                 if text.startswith('Vol'):
                     temp = re.sub(r'\D', '', text)
                     if temp:
                         volume += temp
                 elif len(text) > 1 and text not in ignore_text\
                         and not (text.isdigit() or text[:-1].isdigit()):
                     unstructured_text.append(text)
         if unstructured_text:
             unstructured_text = " ".join(unstructured_text)
         if ref_type == 'book':
             if volume and not volume.lower().startswith('vol'):
                 volume = 'Vol ' + volume
             if volume and page:
                 volume = volume + ', pp ' + page
         yield ref_type, doi, authors, collaboration, journal, volume, page, year,\
             label, arxiv, publisher, institution, unstructured_text, external_link,\
             report_no, editors
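A minimal sketch of the JHEP-specific volume rewriting applied above (drop the two leading characters of the volume and append the issue); the helper name is illustrative only:

 def normalize_jhep_volume(journal, volume, issue):
     """Mirror the JHEP special case from _get_reference above (sketch)."""
     if journal in ('J.High Energy Phys.', 'JHEP'):
         return 'JHEP', volume[2:] + issue
     return journal, volume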
    def get_record_rich(self, filename, ref_extract_callback=None):
        """
        Get the MARC XML for a file in the xaml_rich directory.

        :param filename: the name of the file to parse.
        :type filename: string

        :returns: a string with the MARC XML version of the file.
        """
        self.document = parse(filename)
        rec = create_record()
        articles = self.document.getElementsByTagName('ArticleID')
        for article in articles:
            article_type = article.getAttribute('Type')
            if not article_type == 'Article':
                return ''
            doi = get_value_in_tag(self.document, 'DOI')
            date = ''
            for tag in self.document.getElementsByTagName('Accepted'):
                year = get_value_in_tag(tag, 'Year')
                month = get_value_in_tag(tag, 'Month').zfill(2)
                day = get_value_in_tag(tag, 'Day').zfill(2)
                date = "%s-%s-%s" % (year, month, day)
            if not date:
                for tag in self.document.getElementsByTagName('OnlineDate'):
                    year = get_value_in_tag(tag, 'Year')
                    month = get_value_in_tag(tag, 'Month').zfill(2)
                    day = get_value_in_tag(tag, 'Day').zfill(2)
                    date = "%s-%s-%s" % (year, month, day)
            first_page = get_value_in_tag(article, 'FirstPage')
            last_page = get_value_in_tag(article, 'LastPage')
            subjects = article.getElementsByTagName('Keyword')
            subjects = map(xml_to_text, subjects)
            subject = ', '.join(subjects)
            copyright_statement = get_value_in_tag(article, 'Copyright')
        journal = get_value_in_tag(self.document, 'JournalTitle')
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        issues = self.document.getElementsByTagName('IssueID')
        for issue in issues:
            volume += get_value_in_tag(issue, 'Volume')
            year = get_value_in_tag(issue, 'Year')
        title = get_value_in_tag(self.document, 'Title')
        authors = self.document.getElementsByTagName('Author')
        affiliations = self.document.getElementsByTagName('Affiliation')

        def affiliation_pair(a):
            return a.getAttribute('ID'), get_value_in_tag(
                a, 'UnstructuredAffiliation'
            )

        affiliations = map(affiliation_pair, affiliations)
        affiliations = dict(affiliations)

        def author_pair(a):
            surname = get_value_in_tag(a, 'LastName')
            first_name = get_value_in_tag(a, 'FirstName')
            middle_name = get_value_in_tag(a, 'MiddleName')
            if middle_name:
                name = '%s, %s %s' % (surname, first_name, middle_name)
            else:
                name = '%s, %s' % (surname, first_name)
            try:
                affid = a.getElementsByTagName(
                    'AffiliationID'
                )[0].getAttribute('Label')
                affiliation = affiliations[affid]
            except (IndexError, KeyError):
                affiliation = ''
            return name, affiliation

        authors = map(author_pair, authors)
        abstract = get_value_in_tag(self.document, 'Abstract')
        references = self.document.getElementsByTagName('Bibliomixed')

        for reference in references:
            subfields = []
            label = reference.getAttribute('N')
            if label:
                subfields.append(('o', label))
            bibliosets = reference.getElementsByTagName('Biblioset')
            for tag in bibliosets:
                ref_year = get_value_in_tag(tag, 'Date')
                ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
                ref_journal, ref_volume = fix_journal_name(
                    ref_journal, self.journal_mappings
                )
                ref_volume += get_value_in_tag(tag, 'Volume')
                ref_page = get_value_in_tag(tag, 'ArtPageNums')
                if ref_year:
                    subfields.append(('y', ref_year))
                if ref_journal and ref_volume and ref_page:
                    subfields.append(('s', '%s,%s,%s' % (ref_journal,
                                                         ref_volume,
                                                         ref_page)))
                reference.removeChild(tag)
            text_ref = xml_to_text(reference)
            if ref_extract_callback:
                ref_xml = ref_extract_callback(text_ref)
                dom = parseString(ref_xml)
                fields = dom.getElementsByTagName("datafield")[0]
                fields = fields.getElementsByTagName("subfield")
                if fields:
                    subfields.append(('9', 'refextract'))
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 'm' and bibliosets:
                        continue
                    else:
                        subfields.append((code, data))
            else:
                subfields.append(('m', text_ref))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)

        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        if date:
            record_add_field(rec, '260', subfields=[('c', date),
                                                    ('t', 'published')])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'EDPSciences')])
        first_author = True
        for author in authors:
            if first_author:
                subfields = [('a', author[0])]
                if author[1]:
                    subfields.append(('v', author[1]))
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                subfields = [('a', author[0])]
                if author[1]:
                    subfields.append(('v', author[1]))
                record_add_field(rec, '700', subfields=subfields)
        subfields = []
        if journal and volume and first_page:
            subfields.append(('s', "%s,%s,%s" % (journal,
                                                 volume,
                                                 first_page)))
        if first_page and last_page:
            try:
                number_of_pages = int(last_page) - int(first_page)
                record_add_field(rec, '300',
                                 subfields=[('a', str(number_of_pages))])
            except ValueError:
                pass
            subfields.append(('c', '%s-%s' % (first_page,
                                              last_page)))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        if copyright_statement:
            record_add_field(rec, '542',
                             subfields=[('f', copyright_statement)])
        if subject:
            record_add_field(rec, '650', ind1='1', ind2='7',
                             subfields=[('2', 'EDPSciences'),
                                        ('a', subject)])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Example #22
0
 def test_fix_journal_name(self):
     self.assertEqual(fix_journal_name("A&A", journal_mappings), ('Astron.Astrophys.', ""))
     self.assertEqual(fix_journal_name("A&A B", journal_mappings), ('Astron.Astrophys.', "B"))
     self.assertEqual(fix_journal_name("A&A.B", journal_mappings), ('A&A.', "B"))
     self.assertEqual(fix_journal_name("A&AB.", journal_mappings), ("A&AB.", ""))
    def get_record_rich(self, filename):
        """
        Get the MARC XML for a file in the xaml_rich directory.

        :param filename: the name of the file to parse.
        :type filename: string

        :returns: a string with the MARC XML version of the file.
        """
        self.document = parse(filename)
        rec = create_record()
        articles = self.document.getElementsByTagName("ArticleID")
        for article in articles:
            article_type = article.getAttribute("Type")
            if not article_type == "Article":
                return ""
            doi = get_value_in_tag(self.document, "DOI")
            date = ""
            for tag in self.document.getElementsByTagName("Accepted"):
                year = get_value_in_tag(tag, "Year")
                month = get_value_in_tag(tag, "Month").zfill(2)
                day = get_value_in_tag(tag, "Day").zfill(2)
                date = "%s-%s-%s" % (year, month, day)
            if not date:
                for tag in self.document.getElementsByTagName("OnlineDate"):
                    year = get_value_in_tag(tag, "Year")
                    month = get_value_in_tag(tag, "Month").zfill(2)
                    day = get_value_in_tag(tag, "Day").zfill(2)
                    date = "%s-%s-%s" % (year, month, day)
            first_page = get_value_in_tag(article, "FirstPage")
            last_page = get_value_in_tag(article, "LastPage")
            subjects = article.getElementsByTagName("Keyword")
            subjects = map(xml_to_text, subjects)
            subject = ", ".join(subjects)
            copyright_statement = get_value_in_tag(article, "Copyright")
        journal = get_value_in_tag(self.document, "JournalTitle")
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        issues = self.document.getElementsByTagName("IssueID")
        for issue in issues:
            volume += get_value_in_tag(issue, "Volume")
            year = get_value_in_tag(issue, "Year")
        title = get_value_in_tag(self.document, "Title")
        authors = self.document.getElementsByTagName("Author")
        affiliations = self.document.getElementsByTagName("Affiliation")

        def affiliation_pair(a):
            return a.getAttribute("ID"), get_value_in_tag(a, "UnstructuredAffiliation")

        affiliations = map(affiliation_pair, affiliations)
        affiliations = dict(affiliations)

        def author_pair(a):
            surname = get_value_in_tag(a, "LastName")
            first_name = get_value_in_tag(a, "FirstName")
            middle_name = get_value_in_tag(a, "MiddleName")
            if middle_name:
                name = "%s, %s %s" % (surname, first_name, middle_name)
            else:
                name = "%s, %s" % (surname, first_name)
            try:
                affid = a.getElementsByTagName("AffiliationID")[0].getAttribute("Label")
                affiliation = affiliations[affid]
            except (IndexError, KeyError):
                affiliation = ""
            return name, affiliation

        authors = map(author_pair, authors)
        abstract = get_value_in_tag(self.document, "Abstract")
        references = self.document.getElementsByTagName("Bibliomixed")

        for reference in references:
            subfields = []
            label = reference.getAttribute("N")
            if label:
                subfields.append(("o", label))
            bibliosets = reference.getElementsByTagName("Biblioset")
            for tag in bibliosets:
                ref_year = get_value_in_tag(tag, "Date")
                ref_journal = get_value_in_tag(tag, "JournalShortTitle")
                ref_journal, ref_volume = fix_journal_name(ref_journal, self.journal_mappings)
                ref_volume += get_value_in_tag(tag, "Volume")
                ref_page = get_value_in_tag(tag, "ArtPageNums")
                if ref_year:
                    subfields.append(("y", ref_year))
                if ref_journal and ref_volume and ref_page:
                    subfields.append(("s", "%s,%s,%s" % (ref_journal, ref_volume, ref_page)))
                reference.removeChild(tag)
            text_ref = xml_to_text(reference)
            ref_xml = extract_references_from_string_xml(text_ref)
            dom = parseString(ref_xml)
            fields = dom.getElementsByTagName("datafield")[0]
            fields = fields.getElementsByTagName("subfield")
            if fields:
                subfields.append(("9", "refextract"))
            for field in fields:
                data = field.firstChild.data
                code = field.getAttribute("code")
                if code == "m" and bibliosets:
                    continue
                else:
                    subfields.append((code, data))
            if subfields:
                record_add_field(rec, "999", ind1="C", ind2="5", subfields=subfields)

        if title:
            record_add_field(rec, "245", subfields=[("a", title)])
        if date:
            record_add_field(rec, "260", subfields=[("c", date), ("t", "published")])
        if doi:
            record_add_field(rec, "024", ind1="7", subfields=[("a", doi), ("2", "DOI")])
        if abstract:
            record_add_field(rec, "520", subfields=[("a", abstract), ("9", "EDPSciences")])
        first_author = True
        for author in authors:
            if first_author:
                subfields = [("a", author[0])]
                if author[1]:
                    subfields.append(("v", author[1]))
                record_add_field(rec, "100", subfields=subfields)
                first_author = False
            else:
                subfields = [("a", author[0])]
                if author[1]:
                    subfields.append(("v", author[1]))
                record_add_field(rec, "700", subfields=subfields)
        subfields = []
        if journal and volume and first_page:
            subfields.append(("s", "%s,%s,%s" % (journal, volume, first_page)))
        if first_page and last_page:
            try:
                number_of_pages = int(last_page) - int(first_page)
                record_add_field(rec, "300", subfields=[("a", str(number_of_pages))])
            except ValueError:
                pass
            subfields.append(("c", "%s-%s" % (first_page, last_page)))
        if year:
            subfields.append(("y", year))
        record_add_field(rec, "773", subfields=subfields)
        record_add_field(rec, "980", subfields=[("a", "HEP")])
        if copyright_statement:
            record_add_field(rec, "542", subfields=[("f", copyright_statement)])
        if subject:
            record_add_field(rec, "650", ind1="1", ind2="7", subfields=[("2", "EDPSciences"), ("a", subject)])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""