Exemple #1
0
 def _attach_fulltext(self, rec, doi):
     """Attach fulltext FFT."""
     url = os.path.join(self.url_prefix, doi)
     record_add_field(rec,
                      'FFT',
                      subfields=[('a', url), ('t', 'INSPIRE-PUBLIC'),
                                 ('d', 'Fulltext')])
 def _attach_fulltext(self, rec, doi):
     """Attach fulltext FFT."""
     url = os.path.join(self.url_prefix, doi)
     record_add_field(rec, 'FFT',
                      subfields=[('a', url),
                                 ('t', 'INSPIRE-PUBLIC'),
                                 ('d', 'Fulltext')])
Exemple #3
0
 def test_no_journal_substring(self):
     """Test for correct journal conversion without substring matching."""
     from harvestingkit.bibrecord import record_add_field, record_get_field_values
     from harvestingkit.inspire_cds_package.from_inspire import Inspire2CDS
     rec = {}
     record_add_field(rec, tag='773', subfields=[('p', 'J. Phys.')])
     record_add_field(rec, tag='773', subfields=[('p', 'J.Phys.')])
     r = Inspire2CDS(rec)
     r.update_journals()
     self.assertNotEqual(record_get_field_values(rec, '773', code='p'),
         ['J. Phys.', 'Czechoslov. J. Phys.'])
     self.assertEqual(record_get_field_values(rec, '773', code='p'),
         ['J. Phys.', 'J. Phys.'])
Exemple #4
0
    def get_pdfa_record(self, path=None):
        from invenio.search_engine import perform_request_search
        xml_doc = self.get_article(path)
        rec = create_record()
        dummy, dummy, dummy, dummy, dummy, dummy, dummy,\
            dummy, doi = self.get_publication_information(xml_doc)
        recid = perform_request_search(p='0247_a:"%s" AND NOT 980:"DELETED"' %
                                       (doi, ))
        if recid:
            record_add_field(rec, '001', controlfield_value=recid[0])
        else:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
            message = ('Adding PDF/A. No paper with this DOI: '
                       '%s. Trying to add it anyway.') % (doi, )
            self.logger.error(message)
        try:
            if exists(join(path, 'main_a-2b.pdf')):
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', join(path, 'main_a-2b.pdf')),
                                            ('n', 'main'), ('f', '.pdf;pdfa')])
                self.logger.debug('Adding PDF/A to record: %s' % (doi, ))
            elif exists(join(path, 'main.pdf')):
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', join(path, 'main.pdf'))])
                message = 'No PDF/A in VTEX package for record: ' + doi
                self.logger.debug(message)
            else:
                message = "Record %s doesn't contain PDF file." % (doi, )
                raise MissingFFTError(message)
        except MissingFFTError:
            message = "Elsevier paper: %s is missing PDF." % (doi, )
            register_exception(alert_admin=True, prefix=message)
            self.logger.warning(message)

        ## copy other formats to bibupload file
        if recid:
            from invenio.bibdocfile import BibRecDocs
            record = BibRecDocs(recid[0])
            for bibfile in record.list_latest_files():
                if bibfile.get_format() != '.pdf;pdfa':
                    record_add_field(rec,
                                     'FFT',
                                     subfields=[('a', bibfile.get_full_path()),
                                                ('n', bibfile.get_name()),
                                                ('f', bibfile.get_format())])
        return record_xml_output(rec)
    def get_pdfa_record(self, path=None):
        from invenio.search_engine import perform_request_search
        xml_doc = self.get_article(path)
        rec = create_record()
        dummy, dummy, dummy, dummy, dummy, dummy, dummy,\
            dummy, doi = self.get_publication_information(xml_doc)
        recid = perform_request_search(p='0247_a:"%s" AND NOT 980:"DELETED"' % (doi,))
        if recid:
            record_add_field(rec, '001', controlfield_value=recid[0])
        else:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
            message = ('Adding PDF/A. No paper with this DOI: '
                       '%s. Trying to add it anyway.') % (doi,)
            self.logger.error(message)
        try:
            if exists(join(path, 'main_a-2b.pdf')):
                record_add_field(
                    rec, 'FFT', subfields=[('a', join(path, 'main_a-2b.pdf')),
                                ('n', 'main'),
                        ('f', '.pdf;pdfa')])
                self.logger.debug('Adding PDF/A to record: %s' % (doi,))
            elif exists(join(path, 'main.pdf')):
                record_add_field(
                    rec, 'FFT', subfields=[('a', join(path, 'main.pdf'))])
                message = 'No PDF/A in VTEX package for record: ' + doi
                self.logger.debug(message)
            else:
                message = "Record %s doesn't contain PDF file." % (doi,)
                raise MissingFFTError(message)
        except MissingFFTError:
            message = "Elsevier paper: %s is missing PDF." % (doi,)
            register_exception(alert_admin=True, prefix=message)
            self.logger.warning(message)

        ## copy other formats to bibupload file
        if recid:
            from invenio.bibdocfile import BibRecDocs
            record = BibRecDocs(recid[0])
            for bibfile in record.list_latest_files():
                if bibfile.get_format() != '.pdf;pdfa':
                    record_add_field(rec,
                                     'FFT',
                                     subfields=[('a', bibfile.get_full_path()),
                                                ('n', bibfile.get_name()),
                                                ('f', bibfile.get_format())]
                                     )
        return record_xml_output(rec)
    def _attach_fulltext(self, rec, doi):
        url = 'http://dx.doi.org/' + doi
        page = requests.get(url)
        #url after redirect
        url = page.url
        page = page.text
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
        page = BeautifulSoup(page)
        try:
            if 'epjconf' in doi:
                div = page.body.find('div', attrs={'id': 'header'})
            else:
                div = page.body.find('div', attrs={
                    'class': 'module_background files'
                })
            links = div.findAll('a')
        except AttributeError:
            return
        for pdf in links:
            if pdf['href'].endswith('pdf'):
                link_to_pdf = domain + pdf['href']
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', link_to_pdf),
                                            ('y', 'EDP Sciences server')])
                out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER,
                                  "fulltexts")
                try:
                    makedirs(out_folder)
                    filename = join(out_folder,
                                    link_to_pdf.split('/')[-1])
                except (IOError, OSError):
                    # Problem creating folder
                    filename = None

                filename = download_file(from_url=link_to_pdf,
                                         to_filename=filename,
                                         retry_count=5)
                record_add_field(rec, 'FFT',
                                 subfields=[('a', filename),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
Exemple #7
0
    def _attach_fulltext(self, rec, doi):
        url = 'http://dx.doi.org/' + doi
        page = requests.get(url)
        #url after redirect
        url = page.url
        page = page.text
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
        page = BeautifulSoup(page)
        try:
            if 'epjconf' in doi:
                div = page.body.find('div', attrs={'id': 'header'})
            else:
                div = page.body.find(
                    'div', attrs={'class': 'module_background files'})
            links = div.findAll('a')
        except AttributeError:
            return
        for pdf in links:
            if pdf['href'].endswith('pdf'):
                link_to_pdf = domain + pdf['href']
                record_add_field(rec,
                                 '856',
                                 ind1='4',
                                 subfields=[('u', link_to_pdf),
                                            ('y', 'EDP Sciences server')])
                out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, "fulltexts")
                try:
                    makedirs(out_folder)
                    filename = join(out_folder, link_to_pdf.split('/')[-1])
                except (IOError, OSError):
                    # Problem creating folder
                    filename = None

                filename = download_file(from_url=link_to_pdf,
                                         to_filename=filename,
                                         retry_count=5)
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', filename),
                                            ('t', 'INSPIRE-PUBLIC'),
                                            ('d', 'Fulltext')])
Exemple #8
0
 def _add_authors(self, rec):
     authors = self._get_authors()
     affiliations = self._get_affiliations()
     author_emails = self._get_author_emails()
     first_author = True
     for author in authors:
         subfields = [('a', author[0])]
         if author[1]:
             for aff in author[1]:
                 subfields.append(('v', affiliations[aff]))
         if author[2]:
             for note in author[2]:
                 for email in author_emails.get(note, []):
                     if email:
                         subfields.append(('m', email))
         if first_author:
             record_add_field(rec, '100', subfields=subfields)
             first_author = False
         else:
             record_add_field(rec, '700', subfields=subfields)
 def _add_authors(self, rec):
     authors = self._get_authors()
     affiliations = self._get_affiliations()
     author_emails = self._get_author_emails()
     first_author = True
     for author in authors:
         subfields = [('a', author[0])]
         if author[1]:
             for aff in author[1]:
                 subfields.append(('v', affiliations[aff]))
         if author[2]:
             for note in author[2]:
                 for email in author_emails[note]:
                     if email:
                         subfields.append(('m', email))
         if first_author:
             record_add_field(rec, '100', subfields=subfields)
             first_author = False
         else:
             record_add_field(rec, '700', subfields=subfields)
Exemple #10
0
 def _add_references(self, rec, ref_extract_callback=None):
     for label, ref_type, text_ref, ext_link, authors, year, \
             source, volume, page in self._get_references():
         subfields = []
         if label:
             subfields.append(('o', label))
         if text_ref:
             if ref_extract_callback:
                 ref_xml = ref_extract_callback(text_ref)
                 dom = parseString(ref_xml)
                 fields = dom.getElementsByTagName("datafield")[0]
                 fields = fields.getElementsByTagName("subfield")
                 for field in fields:
                     data = field.firstChild.data
                     code = field.getAttribute("code")
                     subfields.append((code, data))
                 if fields:
                     subfields.append(('9', 'refextract'))
             else:
                 subfields.append(('m', text_ref))
         if ref_type:
             subfields.append(('d', ref_type))
         if ext_link:
             subfields.append(('u', ext_link))
         for author in authors:
             subfields.append(('h', author))
         if year:
             subfields.append(('y', year))
         if source and volume and page:
             subfields.append(('s', source + "," + volume + "," + page))
         elif source and volume:
             subfields.append(('s', source + "," + volume))
         elif source and page:
             subfields.append(('s', source + "," + page))
         elif source:
             subfields.append(('s', source))
         record_add_field(rec,
                          '999',
                          ind1='C',
                          ind2='5',
                          subfields=subfields)
 def _add_authors(self, rec):
     authors = self._get_authors()
     first_author = True
     collaboration_added = False
     for author in authors:
         subfields = [('a', author[0])]
         if author[1]:
             for aff in author[1]:
                 subfields.append(('v', aff))
         if author[2]:
             for email in author[2]:
                 subfields.append(('m', email))
         if first_author:
             record_add_field(rec, '100', subfields=subfields)
             first_author = False
         else:
             record_add_field(rec, '700', subfields=subfields)
         if author[3] and not collaboration_added:
             collaborations = []
             for collab in author[3]:
                 collab_stripped = collab.replace("for the", "").strip()
                 if collab_stripped not in collaborations:
                     collaborations.append(collab_stripped)
                     record_add_field(rec, '710', subfields=[("g", collab_stripped)])
             collaboration_added = True
Exemple #12
0
 def _add_authors(self, rec):
     authors = self._get_authors()
     first_author = True
     collaboration_added = False
     for author in authors:
         subfields = [('a', author[0])]
         if author[1]:
             for aff in author[1]:
                 subfields.append(('v', aff))
         if author[2]:
             for email in author[2]:
                 subfields.append(('m', email))
         if first_author:
             record_add_field(rec, '100', subfields=subfields)
             first_author = False
         else:
             record_add_field(rec, '700', subfields=subfields)
         if author[3] and not collaboration_added:
             collaborations = []
             for collab in author[3]:
                 collab_stripped = collab.replace("for the", "").strip()
                 if collab_stripped not in collaborations:
                     collaborations.append(collab_stripped)
                     record_add_field(rec,
                                      '710',
                                      subfields=[("g", collab_stripped)])
             collaboration_added = True
 def _add_references(self, rec, ref_extract_callback=None):
     for label, ref_type, text_ref, ext_link, authors, year, \
             source, volume, page in self._get_references():
         subfields = []
         if label:
             subfields.append(('o', label))
         if text_ref:
             if ref_extract_callback:
                 ref_xml = ref_extract_callback(text_ref)
                 dom = parseString(ref_xml)
                 fields = dom.getElementsByTagName("datafield")[0]
                 fields = fields.getElementsByTagName("subfield")
                 for field in fields:
                     data = field.firstChild.data
                     code = field.getAttribute("code")
                     subfields.append((code, data))
                 if fields:
                     subfields.append(('9', 'refextract'))
             else:
                 subfields.append(('m', text_ref))
         if ref_type:
             subfields.append(('d', ref_type))
         if ext_link:
             subfields.append(('u', ext_link))
         for author in authors:
             subfields.append(('h', author))
         if year:
             subfields.append(('y', year))
         if source and volume and page:
             subfields.append(('s', source + "," + volume + "," + page))
         elif source and volume:
             subfields.append(('s', source + "," + volume))
         elif source and page:
             subfields.append(('s', source + "," + page))
         elif source:
             subfields.append(('s', source))
         record_add_field(rec, '999', ind1='C', ind2='5',
                          subfields=subfields)
Exemple #14
0
    def get_record(self,
                   path=None,
                   no_pdf=False,
                   test=False,
                   refextract_callback=None):
        """Convert a record to MARCXML format.

        :param path: path to a record.
        :type path: string
        :param test: flag to determine if it is a test call.
        :type test: bool
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formated string
                                    of the reference.
        :type refextract_callback: callable

        :returns: marcxml formated string.
        """
        xml_doc = self.get_article(path)
        rec = create_record()
        title = self.get_title(xml_doc)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        (journal, dummy, volume, issue, first_page, last_page, year,
         start_date, doi) = self.get_publication_information(xml_doc, path)
        if not journal:
            journal = self.get_article_journal(xml_doc)
        if start_date:
            record_add_field(rec,
                             '260',
                             subfields=[('c', start_date), ('t', 'published')])
        else:
            record_add_field(rec,
                             '260',
                             subfields=[('c', time.strftime('%Y-%m-%d'))])
        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        license, license_url = self.get_license(xml_doc)
        if license and license_url:
            record_add_field(rec,
                             '540',
                             subfields=[('a', license), ('u', license_url)])
        elif license_url:
            record_add_field(rec, '540', subfields=[('u', license_url)])
        self.logger.info("Creating record: %s %s" % (path, doi))
        authors = self.get_authors(xml_doc)
        first_author = True
        for author in authors:
            author_name = (author['surname'], author.get('given_name')
                           or author.get('initials'))
            subfields = [('a', '%s, %s' % author_name)]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = self.get_abstract(xml_doc)
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', 'Elsevier')])
        record_copyright = self.get_copyright(xml_doc)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = self.get_keywords(xml_doc)
        if self.CONSYN:
            for tag in xml_doc.getElementsByTagName('ce:collaboration'):
                collaboration = get_value_in_tag(tag, 'ce:text')
                if collaboration:
                    record_add_field(rec,
                                     '710',
                                     subfields=[('g', collaboration)])

            # We add subjects also as author keywords
            subjects = xml_doc.getElementsByTagName('dct:subject')
            for subject in subjects:
                for listitem in subject.getElementsByTagName('rdf:li'):
                    keyword = xml_to_text(listitem)
                    if keyword not in keywords:
                        keywords.append(keyword)
            for keyword in keywords:
                record_add_field(rec,
                                 '653',
                                 ind1='1',
                                 subfields=[('a', keyword), ('9', 'author')])
            journal, dummy = fix_journal_name(journal.strip(),
                                              self.journal_mappings)
            subfields = []
            doctype = self.get_doctype(xml_doc)
            try:
                page_count = int(last_page) - int(first_page) + 1
                record_add_field(rec,
                                 '300',
                                 subfields=[('a', str(page_count))])
            except ValueError:  # do nothing
                pass
            if doctype == 'err':
                subfields.append(('m', 'Erratum'))
            elif doctype == 'add':
                subfields.append(('m', 'Addendum'))
            elif doctype == 'pub':
                subfields.append(('m', 'Publisher Note'))
            elif doctype == 'rev':
                record_add_field(rec, '980', subfields=[('a', 'Review')])
            if journal:
                subfields.append(('p', journal))
            if first_page and last_page:
                subfields.append(('c', '%s-%s' % (first_page, last_page)))
            elif first_page:
                subfields.append(('c', first_page))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            record_add_field(rec, '773', subfields=subfields)
            if not test:
                if license:
                    url = 'http://www.sciencedirect.com/science/article/pii/'\
                          + path.split('/')[-1][:-4]
                    record_add_field(rec,
                                     '856',
                                     ind1='4',
                                     subfields=[('u', url),
                                                ('y', 'Elsevier server')])
                    record_add_field(rec,
                                     'FFT',
                                     subfields=[('a', path),
                                                ('t', 'INSPIRE-PUBLIC'),
                                                ('d', 'Fulltext')])
                else:
                    record_add_field(rec,
                                     'FFT',
                                     subfields=[('a', path), ('t', 'Elsevier'),
                                                ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
        else:
            licence = 'http://creativecommons.org/licenses/by/3.0/'
            record_add_field(rec,
                             '540',
                             subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
            if keywords:
                for keyword in keywords:
                    record_add_field(rec,
                                     '653',
                                     ind1='1',
                                     subfields=[('a', keyword),
                                                ('9', 'author')])

            pages = ''
            if first_page and last_page:
                pages = '{0}-{1}'.format(first_page, last_page)
            elif first_page:
                pages = first_page

            subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                                ('v', volume),
                                                                ('n', issue),
                                                                ('c', pages),
                                                                ('y', year)])

            record_add_field(rec, '773', subfields=subfields)
            if not no_pdf:
                from invenio.search_engine import perform_request_search
                query = '0247_a:"%s" AND NOT 980:DELETED"' % (doi, )
                prev_version = perform_request_search(p=query)

                old_pdf = False

                if prev_version:
                    from invenio.bibdocfile import BibRecDocs
                    prev_rec = BibRecDocs(prev_version[0])
                    try:
                        pdf_path = prev_rec.get_bibdoc('main')
                        pdf_path = pdf_path.get_file(".pdf;pdfa",
                                                     exact_docformat=True)
                        pdf_path = pdf_path.fullpath
                        old_pdf = True
                        record_add_field(rec,
                                         'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        message = ('Leaving previously delivered PDF/A for: ' +
                                   doi)
                        self.logger.info(message)
                    except:
                        pass
                try:
                    if exists(join(path, 'main_a-2b.pdf')):
                        pdf_path = join(path, 'main_a-2b.pdf')
                        record_add_field(rec,
                                         'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        self.logger.debug('Adding PDF/A to record: %s' %
                                          (doi, ))
                    elif exists(join(path, 'main.pdf')):
                        pdf_path = join(path, 'main.pdf')
                        record_add_field(rec,
                                         'FFT',
                                         subfields=[('a', pdf_path)])
                    else:
                        if not old_pdf:
                            message = "Record " + doi
                            message += " doesn't contain PDF file."
                            self.logger.warning(message)
                            raise MissingFFTError(message)
                except MissingFFTError:
                    message = "Elsevier paper: %s is missing PDF." % (doi, )
                    register_exception(alert_admin=True, prefix=message)
                version = self.get_elsevier_version(find_package_name(path))
                record_add_field(rec, '583', subfields=[('l', version)])
                xml_path = join(path, 'main.xml')
                record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
                record_add_field(rec,
                                 '980',
                                 subfields=[('a', 'SCOAP3'),
                                            ('b', 'Elsevier')])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Exemple #15
0
 def _add_references(self, rec):
     """ Adds the reference to the record """
     for ref in self.document.getElementsByTagName('ref'):
         for ref_type, doi, authors, collaboration, journal, volume, page, year,\
                 label, arxiv, publisher, institution, unstructured_text,\
                 external_link, report_no, editors in self._get_reference(ref):
             subfields = []
             if doi:
                 subfields.append(('a', doi))
             for author in authors:
                 subfields.append(('h', author))
             for editor in editors:
                 subfields.append(('e', editor))
             if year:
                 subfields.append(('y', year))
             if unstructured_text:
                 if page:
                     subfields.append(
                         ('m', unstructured_text + ', ' + page))
                 else:
                     subfields.append(('m', unstructured_text))
             if collaboration:
                 subfields.append(('c', collaboration))
             if institution:
                 subfields.append(('m', institution))
             if publisher:
                 subfields.append(('p', publisher))
             if arxiv:
                 subfields.append(('r', arxiv))
             if report_no:
                 subfields.append(('r', report_no))
             if external_link:
                 subfields.append(('u', external_link))
             if label:
                 subfields.append(('o', label))
             if ref_type == 'book':
                 if journal:
                     subfields.append(('t', journal))
                 if volume:
                     subfields.append(('m', volume))
                 elif page and not unstructured_text:
                     subfields.append(('m', page))
             else:
                 if volume and page:
                     subfields.append(
                         ('s', journal + "," + volume + "," + page))
                 elif journal:
                     subfields.append(('t', journal))
             if ref_type:
                 subfields.append(('d', ref_type))
             if not subfields:
                 #misc-type references
                 try:
                     r = ref.getElementsByTagName('mixed-citation')[0]
                     text = xml_to_text(r)
                     label = text.split()[0]
                     text = " ".join(text.split()[1:])
                     subfields.append(('s', text))
                     record_add_field(rec,
                                      '999',
                                      ind1='C',
                                      ind2='5',
                                      subfields=subfields)
                 except IndexError:
                     #references without 'mixed-citation' tag
                     try:
                         r = ref.getElementsByTagName('note')[0]
                         subfields.append(('s', xml_to_text(r)))
                         record_add_field(rec,
                                          '999',
                                          ind1='C',
                                          ind2='5',
                                          subfields=subfields)
                     except IndexError:
                         #references without 'note' tag
                         subfields.append(('s', xml_to_text(ref)))
                         record_add_field(rec,
                                          '999',
                                          ind1='C',
                                          ind2='5',
                                          subfields=subfields)
             else:
                 record_add_field(rec,
                                  '999',
                                  ind1='C',
                                  ind2='5',
                                  subfields=subfields)
Exemple #16
0
    def get_record(self, fileName, ref_extract_callback=None):
        """
        Gets the Marc xml of the files in xaml_jp directory

        :param fileName: the name of the file to parse.
        :type fileName: string
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formated string
                                    of the reference.
        :type refextract_callback: callable

        :returns: a string with the marc xml version of the file.
        """
        self.document = parse(fileName)
        article_type = self._get_article_type()
        if article_type not in ['research-article', 'introduction', 'letter']:
            return ''
        rec = create_record()
        title, subtitle, notes = self._get_title()
        subfields = []
        if subtitle:
            subfields.append(('b', subtitle))
        if title:
            subfields.append(('a', title))
            record_add_field(rec, '245', subfields=subfields)
        subjects = self.document.getElementsByTagName('kwd')
        subjects = map(xml_to_text, subjects)
        for note_id in notes:
            note = self._get_note(note_id)
            if note:
                record_add_field(rec, '500', subfields=[('a', note)])
        for subject in subjects:
            record_add_field(rec,
                             '650',
                             ind1='1',
                             ind2='7',
                             subfields=[('2', 'EDPSciences'), ('a', subject)])
        keywords = self._get_keywords()
        for keyword in keywords:
            record_add_field(rec,
                             '653',
                             ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
        journal, volume, issue, year, date, doi, page,\
            fpage, lpage = self._get_publication_information()
        astronomy_journals = ['EAS Publ.Ser.', 'Astron.Astrophys.']
        if journal in astronomy_journals:
            record_add_field(rec,
                             '650',
                             ind1='1',
                             ind2='7',
                             subfields=[('2', 'INSPIRE'),
                                        ('a', 'Astrophysics')])
        if date:
            record_add_field(rec,
                             '260',
                             subfields=[('c', date), ('t', 'published')])
        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        abstract = self._get_abstract()
        abstract = self._format_abstract(abstract)
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', 'EDPSciences')])
        license, license_type, license_url = self._get_license()
        subfields = []
        if license:
            subfields.append(('a', license))
        if license_url:
            subfields.append(('u', license_url))
        if subfields:
            record_add_field(rec, '540', subfields=subfields)
        if license_type == 'open-access':
            self._attach_fulltext(rec, doi)
        number_of_pages = self._get_page_count()
        if number_of_pages:
            record_add_field(rec, '300', subfields=[('a', number_of_pages)])
        c_holder, c_year, c_statement = self._get_copyright()
        if c_holder and c_year:
            record_add_field(rec,
                             '542',
                             subfields=[('d', c_holder), ('g', c_year),
                                        ('e', 'Article')])
        elif c_statement:
            record_add_field(rec,
                             '542',
                             subfields=[('f', c_statement), ('e', 'Article')])
        subfields = []
        if journal:
            subfields.append(('p', journal))
        if issue:
            subfields.append(('n', issue))
        if volume:
            subfields.append(('v', volume))
        if fpage and lpage:
            subfields.append(('c', '%s-%s' % (fpage, lpage)))
        elif page:
            subfields.append(('c', page))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        conference = ''
        for tag in self.document.getElementsByTagName('conference'):
            conference = xml_to_text(tag)
        if conference:
            record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
            record_add_field(rec, '500', subfields=[('a', conference)])
        self._add_references(rec, ref_extract_callback)
        self._add_authors(rec)
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
    def get_record(self, path=None, no_pdf=False,
                   test=False, refextract_callback=None):
        """Convert a record to MARCXML format.

        :param path: path to a record.
        :type path: string
        :param test: flag to determine if it is a test call.
        :type test: bool
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formated string
                                    of the reference.
        :type refextract_callback: callable

        :returns: marcxml formated string.
        """
        xml_doc = self.get_article(path)
        rec = create_record()
        title = self.get_title(xml_doc)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        (journal, dummy, volume, issue, first_page, last_page, year,
         start_date, doi) = self.get_publication_information(xml_doc, path)
        if not journal:
            journal = self.get_article_journal(xml_doc)
        if start_date:
            record_add_field(rec, '260', subfields=[('c', start_date),
                                                    ('t', 'published')])
        else:
            record_add_field(
                rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        license, license_url = self.get_license(xml_doc)
        if license and license_url:
            record_add_field(rec, '540', subfields=[('a', license),
                                                    ('u', license_url)])
        elif license_url:
            record_add_field(rec, '540', subfields=[('u', license_url)])
        self.logger.info("Creating record: %s %s" % (path, doi))
        authors = self.get_authors(xml_doc)
        first_author = True
        for author in authors:
            author_name = (author['surname'], author.get(
                'given_name') or author.get('initials'))
            subfields = [('a', '%s, %s' % author_name)]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = self.get_abstract(xml_doc)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'Elsevier')])
        record_copyright = self.get_copyright(xml_doc)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = self.get_keywords(xml_doc)
        if self.CONSYN:
            for tag in xml_doc.getElementsByTagName('ce:collaboration'):
                collaboration = get_value_in_tag(tag, 'ce:text')
                if collaboration:
                    record_add_field(rec, '710',
                                     subfields=[('g', collaboration)])

            # We add subjects also as author keywords
            subjects = xml_doc.getElementsByTagName('dct:subject')
            for subject in subjects:
                for listitem in subject.getElementsByTagName('rdf:li'):
                    keyword = xml_to_text(listitem)
                    if keyword not in keywords:
                        keywords.append(keyword)
            if keywords:
                for keyword in keywords:
                    record_add_field(rec, '653', ind1='1',
                                     subfields=[('a', keyword),
                                                ('9', 'author')])
            journal, dummy = fix_journal_name(journal.strip(),
                                              self.journal_mappings)
            subfields = []
            doctype = self.get_doctype(xml_doc)
            try:
                page_count = int(last_page) - int(first_page) + 1
                record_add_field(rec, '300',
                                 subfields=[('a', str(page_count))])
            except ValueError:  # do nothing
                pass
            if doctype == 'err':
                subfields.append(('m', 'Erratum'))
            elif doctype == 'add':
                subfields.append(('m', 'Addendum'))
            elif doctype == 'pub':
                subfields.append(('m', 'Publisher Note'))
            elif doctype == 'rev':
                record_add_field(rec, '980', subfields=[('a', 'Review')])
            if journal:
                subfields.append(('p', journal))
            if first_page and last_page:
                subfields.append(('c', '%s-%s' %
                                       (first_page, last_page)))
            elif first_page:
                subfields.append(('c', first_page))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            record_add_field(rec, '773', subfields=subfields)
            if not test:
                if license:
                    url = 'http://www.sciencedirect.com/science/article/pii/'\
                          + path.split('/')[-1][:-4]
                    record_add_field(rec, '856', ind1='4',
                                     subfields=[('u', url),
                                                ('y', 'Elsevier server')])
                    record_add_field(rec, 'FFT', subfields=[('a', path),
                                                            ('t', 'INSPIRE-PUBLIC'),
                                                            ('d', 'Fulltext')])
                else:
                    record_add_field(rec, 'FFT', subfields=[('a', path),
                                                            ('t', 'Elsevier'),
                                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
        else:
            licence = 'http://creativecommons.org/licenses/by/3.0/'
            record_add_field(rec,
                             '540',
                             subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
            if keywords:
                for keyword in keywords:
                    record_add_field(
                        rec, '653', ind1='1', subfields=[('a', keyword),
                                    ('9', 'author')])

            pages = ''
            if first_page and last_page:
                pages = '{0}-{1}'.format(first_page, last_page)
            elif first_page:
                pages = first_page

            subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                                ('v', volume),
                                                                ('n', issue),
                                                                ('c', pages),
                                                                ('y', year)])

            record_add_field(rec, '773', subfields=subfields)
            if not no_pdf:
                from invenio.search_engine import perform_request_search
                query = '0247_a:"%s" AND NOT 980:DELETED"' % (doi,)
                prev_version = perform_request_search(p=query)

                old_pdf = False

                if prev_version:
                    from invenio.bibdocfile import BibRecDocs
                    prev_rec = BibRecDocs(prev_version[0])
                    try:
                        pdf_path = prev_rec.get_bibdoc('main')
                        pdf_path = pdf_path.get_file(
                            ".pdf;pdfa", exact_docformat=True)
                        pdf_path = pdf_path.fullpath
                        old_pdf = True
                        record_add_field(rec, 'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        message = ('Leaving previously delivered PDF/A for: '
                                   + doi)
                        self.logger.info(message)
                    except:
                        pass
                try:
                    if exists(join(path, 'main_a-2b.pdf')):
                        pdf_path = join(path, 'main_a-2b.pdf')
                        record_add_field(rec, 'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        self.logger.debug('Adding PDF/A to record: %s'
                                          % (doi,))
                    elif exists(join(path, 'main.pdf')):
                        pdf_path = join(path, 'main.pdf')
                        record_add_field(rec, 'FFT', subfields=[('a', pdf_path)])
                    else:
                        if not old_pdf:
                            message = "Record " + doi
                            message += " doesn't contain PDF file."
                            self.logger.warning(message)
                            raise MissingFFTError(message)
                except MissingFFTError:
                    message = "Elsevier paper: %s is missing PDF." % (doi,)
                    register_exception(alert_admin=True, prefix=message)
                version = self.get_elsevier_version(find_package_name(path))
                record_add_field(rec, '583', subfields=[('l', version)])
                xml_path = join(path, 'main.xml')
                record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
                record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                        ('b', 'Elsevier')])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
 def _add_references(self, xml_doc, rec, refextract_callback=None):
     for label, authors, doi, issue, page, title, volume, year,\
             textref, ext_link, isjournal, comment, journal, publisher,\
             editors, book_title in self.get_references(xml_doc):
         subfields = []
         if textref and not authors:
             textref = textref.replace('\"', '\'')
             if refextract_callback:
                 ref_xml = refextract_callback(textref)
                 dom = xml.dom.minidom.parseString(ref_xml)
                 fields = dom.getElementsByTagName("datafield")[0]
                 fields = fields.getElementsByTagName("subfield")
                 for field in fields:
                     data = field.firstChild.data
                     code = field.getAttribute("code")
                     if code == 'r':
                         data = fix_dashes(data)
                     subfields.append((code, data))
                 if fields:
                     subfields.append(('9', 'refextract'))
             else:
                 subfields.append(('m', textref))
             if label:
                 label = re.sub("[\[\].)]", "", label)
                 subfields.append(('o', label))
             if subfields:
                 record_add_field(rec, '999', ind1='C', ind2='5',
                                  subfields=subfields)
         else:
             if doi:
                 subfields.append(('a', doi))
             for author in authors:
                 subfields.append(('h', author))
             if ext_link:
                 ext_link = fix_dashes(ext_link)
                 subfields.append(('r', ext_link))
             if title:
                 subfields.append(('t', title))
             elif textref:
                 subfields.append(('m', textref))
             if publisher:
                 subfields.append(('p', publisher))
             if volume:
                 subfields.append(('v', volume))
             if year:
                 subfields.append(('y', year))
             if comment:
                 subfields.append(('m', comment))
             for editor in editors:
                 subfields.append(('e', editor))
             if book_title:
                 subfields.append(('q', book_title))
             if label:
                 label = re.sub("[\[\].)]", "", label)
                 subfields.append(('o', label))
             if journal:
                 journal, vol = fix_journal_name(journal,
                                                 self.journal_mappings)
                 volume = vol + volume
                 if volume and page:
                     journal = journal + "," + volume + "," + page
                     subfields.append(('s', journal))
                 elif volume:
                     journal = journal + "," + volume
                     subfields.append(('s', journal))
                 else:
                     subfields.append(('s', journal))
             if textref:
                 subfields.append(('m', textref))
             if subfields:
                 record_add_field(rec, '999', ind1='C', ind2='5',
                                  subfields=subfields)
    def get_record_rich(self, filename, ref_extract_callback=None):
        """
        Gets the Marc xml of the files in xaml_rich directory

        :param fileName: the name of the file to parse.
        :type fileName: string

        :returns: a string with the marc xml version of the file.
        """
        self.document = parse(filename)
        rec = create_record()
        articles = self.document.getElementsByTagName('ArticleID')
        for article in articles:
            article_type = article.getAttribute('Type')
            if not article_type == 'Article':
                return ''
            doi = get_value_in_tag(self.document, 'DOI')
            date = ''
            for tag in self.document.getElementsByTagName('Accepted'):
                year = get_value_in_tag(tag, 'Year')
                month = get_value_in_tag(tag, 'Month').zfill(2)
                day = get_value_in_tag(tag, 'Day').zfill(2)
                date = "%s-%s-%s" % (year, month, day)
            if not date:
                for tag in self.document.getElementsByTagName('OnlineDate'):
                    year = get_value_in_tag(tag, 'Year')
                    month = get_value_in_tag(tag, 'Month').zfill(2)
                    day = get_value_in_tag(tag, 'Day').zfill(2)
                    date = "%s-%s-%s" % (year, month, day)
            first_page = get_value_in_tag(article, 'FirstPage')
            last_page = get_value_in_tag(article, 'LastPage')
            subjects = article.getElementsByTagName('Keyword')
            subjects = map(xml_to_text, subjects)
            subject = ', '.join(subjects)
            copyright_statement = get_value_in_tag(article, 'Copyright')
        journal = get_value_in_tag(self.document, 'JournalTitle')
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        issues = self.document.getElementsByTagName('IssueID')
        for issue in issues:
            volume += get_value_in_tag(issue, 'Volume')
            year = get_value_in_tag(issue, 'Year')
        title = get_value_in_tag(self.document, 'Title')
        authors = self.document.getElementsByTagName('Author')
        affiliations = self.document.getElementsByTagName('Affiliation')

        def affiliation_pair(a):
            return a.getAttribute('ID'), get_value_in_tag(
                a, 'UnstructuredAffiliation'
            )

        affiliations = map(affiliation_pair, affiliations)
        affiliations = dict(affiliations)

        def author_pair(a):
            surname = get_value_in_tag(a, 'LastName')
            first_name = get_value_in_tag(a, 'FirstName')
            middle_name = get_value_in_tag(a, 'MiddleName')
            if middle_name:
                name = '%s, %s %s' % (surname, first_name, middle_name)
            else:
                name = '%s, %s' % (surname, first_name)
            try:
                affid = a.getElementsByTagName(
                    'AffiliationID'
                )[0].getAttribute('Label')
                affiliation = affiliations[affid]
            except IndexError:
                affiliation = ''
            except KeyError:
                affiliation = ''
            return name, affiliation

        authors = map(author_pair, authors)
        abstract = get_value_in_tag(self.document, 'Abstract')
        references = self.document.getElementsByTagName('Bibliomixed')

        for reference in references:
            subfields = []
            label = reference.getAttribute('N')
            if label:
                subfields.append(('o', label))
            bibliosets = reference.getElementsByTagName('Biblioset')
            for tag in bibliosets:
                ref_year = get_value_in_tag(tag, 'Date')
                ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
                ref_journal, ref_volume = fix_journal_name(
                    ref_journal, self.journal_mappings
                )
                ref_volume += get_value_in_tag(tag, 'Volume')
                ref_page = get_value_in_tag(tag, 'ArtPageNums')
                if ref_year:
                    subfields.append(('y', ref_year))
                if ref_journal and ref_volume and ref_page:
                    subfields.append(('s', '%s,%s,%s' % (ref_journal,
                                                         ref_volume,
                                                         ref_page)))
                reference.removeChild(tag)
            text_ref = xml_to_text(reference)
            if ref_extract_callback:
                ref_xml = ref_extract_callback(text_ref)
                dom = parseString(ref_xml)
                fields = dom.getElementsByTagName("datafield")[0]
                fields = fields.getElementsByTagName("subfield")
                if fields:
                    subfields.append(('9', 'refextract'))
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 'm' and bibliosets:
                        continue
                    else:
                        subfields.append((code, data))
            else:
                subfields.append(('m', text_ref))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)

        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        if date:
            record_add_field(rec, '260', subfields=[('c', date),
                                                    ('t', 'published')])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'EDPSciences')])
        first_author = True
        for author in authors:
            if first_author:
                subfields = [('a', author[0])]
                if author[1]:
                    subfields.append(('v', author[1]))
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                subfields = [('a', author[0])]
                if author[1]:
                    subfields.append(('v', author[1]))
                record_add_field(rec, '700', subfields=subfields)
        subfields = []
        if journal and volume and first_page:
            subfields.append(('s', "%s,%s,%s" % (journal,
                                                 volume,
                                                 first_page)))
        if first_page and last_page:
            try:
                nuber_of_pages = int(last_page) - int(first_page)
                record_add_field(rec, '300',
                                 subfields=[('a', str(nuber_of_pages))])
            except ValueError:
                pass
            subfields.append(('c', '%s-%s' % (first_page,
                                              last_page)))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        if copyright_statement:
            record_add_field(rec, '542',
                             subfields=[('f', copyright_statement)])
        if subject:
            record_add_field(rec, '650', ind1='1', ind2='7',
                             subfields=[('2', 'EDPSciences'),
                                        ('a', subject)])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Exemple #20
0
 def get_record(self, record):
     """ Reads a dom xml element in oaidc format and
         returns the bibrecord object """
     self.document = record
     rec = create_record()
     language = self._get_language()
     if language and language != 'en':
         record_add_field(rec, '041', subfields=[('a', language)])
     publisher = self._get_publisher()
     date = self._get_date()
     if publisher and date:
         record_add_field(rec,
                          '260',
                          subfields=[('b', publisher), ('c', date)])
     elif publisher:
         record_add_field(rec, '260', subfields=[('b', publisher)])
     elif date:
         record_add_field(rec, '260', subfields=[('c', date)])
     title = self._get_title()
     if title:
         record_add_field(rec, '245', subfields=[('a', title)])
     record_copyright = self._get_copyright()
     if record_copyright:
         record_add_field(rec, '540', subfields=[('a', record_copyright)])
     subject = self._get_subject()
     if subject:
         record_add_field(rec,
                          '650',
                          ind1='1',
                          ind2='7',
                          subfields=[('a', subject), ('2', 'PoS')])
     authors = self._get_authors()
     first_author = True
     for author in authors:
         subfields = [('a', author[0])]
         for affiliation in author[1]:
             subfields.append(('v', affiliation))
         if first_author:
             record_add_field(rec, '100', subfields=subfields)
             first_author = False
         else:
             record_add_field(rec, '700', subfields=subfields)
     identifier = self.get_identifier()
     conference = identifier.split(':')[2]
     conference = conference.split('/')[0]
     contribution = identifier.split(':')[2]
     contribution = contribution.split('/')[1]
     record_add_field(rec,
                      '773',
                      subfields=[('p', 'PoS'),
                                 ('v', conference.replace(' ', '')),
                                 ('c', contribution), ('y', date[:4])])
     record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
     record_add_field(rec, '980', subfields=[('a', 'HEP')])
     return rec
 def _add_references(self, rec):
     """ Adds the reference to the record """
     for ref in self.document.getElementsByTagName('ref'):
         for ref_type, doi, authors, collaboration, journal, volume, page, year,\
                 label, arxiv, publisher, institution, unstructured_text,\
                 external_link, report_no, editors in self._get_reference(ref):
             subfields = []
             if doi:
                 subfields.append(('a', doi))
             for author in authors:
                 subfields.append(('h', author))
             for editor in editors:
                 subfields.append(('e', editor))
             if year:
                 subfields.append(('y', year))
             if unstructured_text:
                 if page:
                     subfields.append(('m', unstructured_text + ', ' + page))
                 else:
                     subfields.append(('m', unstructured_text))
             if collaboration:
                 subfields.append(('c', collaboration))
             if institution:
                 subfields.append(('m', institution))
             if publisher:
                 subfields.append(('p', publisher))
             if arxiv:
                 subfields.append(('r', arxiv))
             if report_no:
                 subfields.append(('r', report_no))
             if external_link:
                 subfields.append(('u', external_link))
             if label:
                 subfields.append(('o', label))
             if ref_type == 'book':
                 if journal:
                     subfields.append(('t', journal))
                 if volume:
                     subfields.append(('m', volume))
                 elif page and not unstructured_text:
                     subfields.append(('m', page))
             else:
                 if volume and page:
                     subfields.append(('s', journal + "," + volume + "," + page))
                 elif journal:
                     subfields.append(('t', journal))
             if ref_type:
                 subfields.append(('d', ref_type))
             if not subfields:
                 #misc-type references
                 try:
                     r = ref.getElementsByTagName('mixed-citation')[0]
                     text = xml_to_text(r)
                     label = text.split()[0]
                     text = " ".join(text.split()[1:])
                     subfields.append(('s', text))
                     record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
                 except IndexError:
                     #references without 'mixed-citation' tag
                     try:
                         r = ref.getElementsByTagName('note')[0]
                         subfields.append(('s', xml_to_text(r)))
                         record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
                     except IndexError:
                         #references without 'note' tag
                         subfields.append(('s', xml_to_text(ref)))
                         record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
             else:
                 record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
Exemple #22
0
 def get_record(self, record):
     """ Reads a dom xml element in oaidc format and
         returns the bibrecord object """
     self.document = record
     rec = create_record()
     language = self._get_language()
     if language and language != 'en':
         record_add_field(rec, '041', subfields=[('a', language)])
     publisher = self._get_publisher()
     date = self._get_date()
     if publisher and date:
         record_add_field(rec, '260', subfields=[('b', publisher),
                                                 ('c', date)])
     elif publisher:
         record_add_field(rec, '260', subfields=[('b', publisher)])
     elif date:
         record_add_field(rec, '260', subfields=[('c', date)])
     title = self._get_title()
     if title:
         record_add_field(rec, '245', subfields=[('a', title)])
     record_copyright = self._get_copyright()
     if record_copyright:
         record_add_field(rec, '540', subfields=[('a', record_copyright)])
     subject = self._get_subject()
     if subject:
         record_add_field(rec, '650', ind1='1', ind2='7', subfields=[('a', subject),
                                                                     ('2', 'PoS')])
     authors = self._get_authors()
     first_author = True
     for author in authors:
         subfields = [('a', author[0])]
         for affiliation in author[1]:
             subfields.append(('v', affiliation))
         if first_author:
             record_add_field(rec, '100', subfields=subfields)
             first_author = False
         else:
             record_add_field(rec, '700', subfields=subfields)
     identifier = self.get_identifier()
     conference = identifier.split(':')[2]
     conference = conference.split('/')[0]
     contribution = identifier.split(':')[2]
     contribution = contribution.split('/')[1]
     record_add_field(rec, '773', subfields=[('p', 'PoS'),
                                             ('v', conference.replace(' ', '')),
                                             ('c', contribution),
                                             ('y', date[:4])])
     record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
     record_add_field(rec, '980', subfields=[('a', 'HEP')])
     return rec
Exemple #23
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        xml = super(NLMParser, self).get_article(f_path)
        rec = create_record()
        title = super(NLMParser, self).get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        record_add_field(rec, '260', subfields=[('c', super(NLMParser, self).get_publication_date(xml, logger))])
        journal, issn, volume, issue, first_page, last_page, year, doi = super(NLMParser, self).get_publication_information(xml)
        journal = "PTEP"  # Let's override the journal information

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        page_count = super(NLMParser, self).get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        arxiv = self.get_arxiv_id(xml)
        if arxiv:
            record_add_field(rec, '037', subfields=[('9', 'arXiv'), ('a', format_arxiv_id(arxiv))])
        authors = super(NLMParser, self).get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
            else:
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                    subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = super(NLMParser, self).get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'), ('u', 'http://creativecommons.org/licenses/by/3.0/')])
        copyright = super(NLMParser, self).get_copyright(xml, logger)
        if copyright:
            record_add_field(rec, '542', subfields=[('f', copyright)])
        keywords = super(NLMParser, self).get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec, '084', ind1='1', subfields=[('a', keyword), ('9', 'PACS')])

        ## Oxford is giving us bad keywords. Better ignore them.
        #if keywords['other']:
            #for keyword in keywords['other']:
                #record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('n', issue),
                                                            ('c', pages),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if year:
                subfields.append(('y', year))
            if ext_link:
                subfields.append(('r', ext_link))
            # should we be strict about it?
            if title and volume and year and page:
                subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif not plain_text:
                subfields.append(('m', ('%s %s %s %s' % (title, volume, year, page))))
            if plain_text:
                subfields.append(('m', plain_text))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
        f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
        f_path_pdfa = join(dirname(f_path), 'archival_pdfs', basename(f_path)[:-len('.xml')] + '-hires.pdf')
        if exists(f_path_pdf):
            record_add_field(rec, 'FFT', subfields=[('a', f_path_pdf), ('n', 'main')])
        else:
            try:
                raise MissingFFTError
            except:
                register_exception(alert_admin=True, prefix="Oxford paper: %s is missing PDF." % (doi,))
                logger.warning("Record %s doesn't contain PDF file." % (doi,))
        if exists(f_path_pdfa):
            record_add_field(rec, 'FFT', subfields=[('a', f_path_pdfa), ('n', 'main'), ('f', '.pdf;pdfa')])
        else:
            try:
                raise MissingFFTError
            except:
                register_exception(alert_admin=True, prefix="Oxford paper: %s is missing PDF/A." % (doi,))
                logger.warning("Record %s doesn't contain PDF/A file." % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)
Exemple #24
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        xml = super(NLMParser, self).get_article(f_path)
        rec = create_record()
        title = super(NLMParser, self).get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        record_add_field(rec,
                         '260',
                         subfields=[
                             ('c',
                              super(NLMParser,
                                    self).get_publication_date(xml, logger))
                         ])
        journal, issn, volume, issue, first_page, last_page, year, doi = super(
            NLMParser, self).get_publication_information(xml)
        journal = "PTEP"  # Let's override the journal information

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        page_count = super(NLMParser, self).get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        arxiv = self.get_arxiv_id(xml)
        if arxiv:
            record_add_field(rec,
                             '037',
                             subfields=[('9', 'arXiv'),
                                        ('a', format_arxiv_id(arxiv))])
        authors = super(NLMParser, self).get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' %
                              (author.get('surname'), author.get('given_name')
                               or author.get('initials', '')))]
            else:
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = super(NLMParser, self).get_abstract(xml)
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec,
                         '540',
                         subfields=[
                             ('a', 'CC-BY-3.0'),
                             ('u',
                              'http://creativecommons.org/licenses/by/3.0/')
                         ])
        copyright = super(NLMParser, self).get_copyright(xml, logger)
        if copyright:
            record_add_field(rec, '542', subfields=[('f', copyright)])
        keywords = super(NLMParser, self).get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec,
                                 '084',
                                 ind1='1',
                                 subfields=[('a', keyword), ('9', 'PACS')])

        ## Oxford is giving us bad keywords. Better ignore them.
        #if keywords['other']:
        #for keyword in keywords['other']:
        #record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('n', issue),
                                                            ('c', pages),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if year:
                subfields.append(('y', year))
            if ext_link:
                subfields.append(('r', ext_link))
            # should we be strict about it?
            if title and volume and year and page:
                subfields.append(
                    ('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif not plain_text:
                subfields.append(
                    ('m', ('%s %s %s %s' % (title, volume, year, page))))
            if plain_text:
                subfields.append(('m', plain_text))
            if subfields:
                record_add_field(rec,
                                 '999',
                                 ind1='C',
                                 ind2='5',
                                 subfields=subfields)
        f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
        f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                           basename(f_path)[:-len('.xml')] + '-hires.pdf')
        if exists(f_path_pdf):
            record_add_field(rec,
                             'FFT',
                             subfields=[('a', f_path_pdf), ('n', 'main')])
        else:
            try:
                raise MissingFFTError
            except:
                register_exception(alert_admin=True,
                                   prefix="Oxford paper: %s is missing PDF." %
                                   (doi, ))
                logger.warning("Record %s doesn't contain PDF file." % (doi, ))
        if exists(f_path_pdfa):
            record_add_field(rec,
                             'FFT',
                             subfields=[('a', f_path_pdfa), ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
        else:
            try:
                raise MissingFFTError
            except:
                register_exception(
                    alert_admin=True,
                    prefix="Oxford paper: %s is missing PDF/A." % (doi, ))
                logger.warning("Record %s doesn't contain PDF/A file." %
                               (doi, ))
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)
Exemple #25
0
    def get_record_rich(self, filename, ref_extract_callback=None):
        """
        Gets the Marc xml of the files in xaml_rich directory

        :param fileName: the name of the file to parse.
        :type fileName: string

        :returns: a string with the marc xml version of the file.
        """
        self.document = parse(filename)
        rec = create_record()
        articles = self.document.getElementsByTagName('ArticleID')
        for article in articles:
            article_type = article.getAttribute('Type')
            if not article_type == 'Article':
                return ''
            doi = get_value_in_tag(self.document, 'DOI')
            date = ''
            for tag in self.document.getElementsByTagName('Accepted'):
                year = get_value_in_tag(tag, 'Year')
                month = get_value_in_tag(tag, 'Month').zfill(2)
                day = get_value_in_tag(tag, 'Day').zfill(2)
                date = "%s-%s-%s" % (year, month, day)
            if not date:
                for tag in self.document.getElementsByTagName('OnlineDate'):
                    year = get_value_in_tag(tag, 'Year')
                    month = get_value_in_tag(tag, 'Month').zfill(2)
                    day = get_value_in_tag(tag, 'Day').zfill(2)
                    date = "%s-%s-%s" % (year, month, day)
            first_page = get_value_in_tag(article, 'FirstPage')
            last_page = get_value_in_tag(article, 'LastPage')
            subjects = article.getElementsByTagName('Keyword')
            subjects = map(xml_to_text, subjects)
            subject = ', '.join(subjects)
            copyright_statement = get_value_in_tag(article, 'Copyright')
        journal = get_value_in_tag(self.document, 'JournalTitle')
        journal, volume = fix_journal_name(journal, self.journal_mappings)
        issues = self.document.getElementsByTagName('IssueID')
        for issue in issues:
            volume += get_value_in_tag(issue, 'Volume')
            year = get_value_in_tag(issue, 'Year')
        title = get_value_in_tag(self.document, 'Title')
        authors = self.document.getElementsByTagName('Author')
        affiliations = self.document.getElementsByTagName('Affiliation')

        def affiliation_pair(a):
            return a.getAttribute('ID'), get_value_in_tag(
                a, 'UnstructuredAffiliation')

        affiliations = map(affiliation_pair, affiliations)
        affiliations = dict(affiliations)

        def author_pair(a):
            surname = get_value_in_tag(a, 'LastName')
            first_name = get_value_in_tag(a, 'FirstName')
            middle_name = get_value_in_tag(a, 'MiddleName')
            if middle_name:
                name = '%s, %s %s' % (surname, first_name, middle_name)
            else:
                name = '%s, %s' % (surname, first_name)
            try:
                affid = a.getElementsByTagName(
                    'AffiliationID')[0].getAttribute('Label')
                affiliation = affiliations[affid]
            except IndexError:
                affiliation = ''
            except KeyError:
                affiliation = ''
            return name, affiliation

        authors = map(author_pair, authors)
        abstract = get_value_in_tag(self.document, 'Abstract')
        references = self.document.getElementsByTagName('Bibliomixed')

        for reference in references:
            subfields = []
            label = reference.getAttribute('N')
            if label:
                subfields.append(('o', label))
            bibliosets = reference.getElementsByTagName('Biblioset')
            for tag in bibliosets:
                ref_year = get_value_in_tag(tag, 'Date')
                ref_journal = get_value_in_tag(tag, 'JournalShortTitle')
                ref_journal, ref_volume = fix_journal_name(
                    ref_journal, self.journal_mappings)
                ref_volume += get_value_in_tag(tag, 'Volume')
                ref_page = get_value_in_tag(tag, 'ArtPageNums')
                if ref_year:
                    subfields.append(('y', ref_year))
                if ref_journal and ref_volume and ref_page:
                    subfields.append(
                        ('s',
                         '%s,%s,%s' % (ref_journal, ref_volume, ref_page)))
                reference.removeChild(tag)
            text_ref = xml_to_text(reference)
            if ref_extract_callback:
                ref_xml = ref_extract_callback(text_ref)
                dom = parseString(ref_xml)
                fields = dom.getElementsByTagName("datafield")[0]
                fields = fields.getElementsByTagName("subfield")
                if fields:
                    subfields.append(('9', 'refextract'))
                for field in fields:
                    data = field.firstChild.data
                    code = field.getAttribute("code")
                    if code == 'm' and bibliosets:
                        continue
                    else:
                        subfields.append((code, data))
            else:
                subfields.append(('m', text_ref))
            if subfields:
                record_add_field(rec,
                                 '999',
                                 ind1='C',
                                 ind2='5',
                                 subfields=subfields)

        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        if date:
            record_add_field(rec,
                             '260',
                             subfields=[('c', date), ('t', 'published')])
        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', 'EDPSciences')])
        first_author = True
        for author in authors:
            if first_author:
                subfields = [('a', author[0])]
                if author[1]:
                    subfields.append(('v', author[1]))
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                subfields = [('a', author[0])]
                if author[1]:
                    subfields.append(('v', author[1]))
                record_add_field(rec, '700', subfields=subfields)
        subfields = []
        if journal and volume and first_page:
            subfields.append(('s', "%s,%s,%s" % (journal, volume, first_page)))
        if first_page and last_page:
            try:
                nuber_of_pages = int(last_page) - int(first_page)
                record_add_field(rec,
                                 '300',
                                 subfields=[('a', str(nuber_of_pages))])
            except ValueError:
                pass
            subfields.append(('c', '%s-%s' % (first_page, last_page)))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        if copyright_statement:
            record_add_field(rec,
                             '542',
                             subfields=[('f', copyright_statement)])
        if subject:
            record_add_field(rec,
                             '650',
                             ind1='1',
                             ind2='7',
                             subfields=[('2', 'EDPSciences'), ('a', subject)])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Exemple #26
0
    def get_record(self, xml_file):
        """ Reads a xml file in JATS format and returns
            a xml string in marc format """
        self.document = parse(xml_file)

        if get_value_in_tag(self.document, "meta"):
            raise ApsPackageXMLError("The XML format of %s is not correct" %
                                     (xml_file, ))
        page_count = self._get_page_count()
        rec = create_record()
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        pacscodes = self._get_pacscodes()
        for pacscode in pacscodes:
            record_add_field(rec,
                             '084',
                             subfields=[('2', 'PACS'), ('a', pacscode)])
        subject = self._get_subject()
        if subject:
            record_add_field(rec,
                             '650',
                             ind1='1',
                             ind2='7',
                             subfields=[('2', 'APS'), ('a', subject)])
        keywords = self._get_keywords()
        if keywords:
            record_add_field(rec,
                             '653',
                             ind1='1',
                             subfields=[('a', ', '.join(keywords)),
                                        ('9', 'author')])
        title, subtitle, _ = self._get_title()
        subfields = []
        if subtitle:
            subfields.append(('b', subtitle))
        if title:
            subfields.append(('a', title))
            record_add_field(rec, '245', subfields=subfields)
        journal, volume, issue, year, start_date, doi,\
            article_id, _, _ = self._get_publication_information()
        if start_date:
            record_add_field(rec,
                             '260',
                             subfields=[('c', start_date), ('t', 'published')])
        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        abstract = self._get_abstract()
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', 'APS')])
        license, license_type, license_url = self._get_license()
        subfields = []
        if license:
            subfields.append(('a', license))
        if license_url:
            subfields.append(('u', license_url))
        if subfields:
            record_add_field(rec, '540', subfields=subfields)
        c_holder, c_year, c_statement = self._get_copyright()
        c_holder, c_year, c_statement = self._get_copyright()
        if c_holder and c_year:
            record_add_field(rec,
                             '542',
                             subfields=[('d', c_holder), ('g', c_year),
                                        ('e', 'Article')])
        elif c_statement:
            record_add_field(rec,
                             '542',
                             subfields=[('f', c_statement), ('e', 'Article')])
        record_add_field(rec,
                         '773',
                         subfields=[('p', journal),
                                    ('v', volume), ('n', issue), ('y', year),
                                    ('c', article_id)])
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'Citeable')])
        record_add_field(rec, '980', subfields=[('a', 'Published')])
        self._add_authors(rec)
        self._add_references(rec)
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            sys.stderr.write("""Found a bad char in the file
                                for the article """ + doi)
            return ""
Exemple #27
0
    def get_record(self, filename, ref_extract_callback=None):
        """Get the MARCXML of the files in xaml_jp directory.

        :param filename: the name of the file to parse.
        :type filename: string
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formated string
                                    of the reference.
        :type refextract_callback: callable

        :returns: a string with the marc xml version of the file.
        """
        self.document = parse(filename)

        article_type = self._get_article_type()
        if article_type not in [
                'research-article', 'corrected-article', 'original-article',
                'introduction', 'letter', 'correction', 'addendum',
                'review-article', 'rapid-communications'
        ]:
            return ""

        rec = create_record()
        title, subtitle, notes = self._get_title()
        subfields = []
        if subtitle:
            subfields.append(('b', subtitle))
        if title:
            title = fix_title_capitalization(title)
            subfields.append(('a', title))
            record_add_field(rec, '245', subfields=subfields)
        for note_id in notes:
            note = self._get_note(note_id)
            if note:
                record_add_field(rec, '500', subfields=[('a', note)])
        keywords = self._get_keywords()
        for keyword in keywords:
            record_add_field(rec,
                             '653',
                             ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
        journal, volume, issue, year, date, doi, page,\
            fpage, lpage = self._get_publication_information()
        if date:
            record_add_field(rec,
                             '260',
                             subfields=[('c', date), ('t', 'published')])
        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        abstract = self._get_abstract()
        if abstract:
            abstract = convert_html_subscripts_to_latex(abstract)
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract),
                                        ('9', 'World Scientific')])
        license, license_type, license_url = self._get_license()
        subfields = []
        if license:
            subfields.append(('a', license))
        if license_url:
            subfields.append(('u', license_url))
        if subfields:
            record_add_field(rec, '540', subfields=subfields)
        if license_type == 'open-access':
            self._attach_fulltext(rec, doi)
        number_of_pages = self._get_page_count()
        if number_of_pages:
            record_add_field(rec, '300', subfields=[('a', number_of_pages)])
        c_holder, c_year, c_statement = self._get_copyright()
        if c_holder and c_year:
            record_add_field(rec,
                             '542',
                             subfields=[('d', c_holder), ('g', c_year),
                                        ('e', 'Article')])
        elif c_statement:
            record_add_field(rec,
                             '542',
                             subfields=[('f', c_statement), ('e', 'Article')])
        subfields = []
        if journal:
            subfields.append(('p', journal))
        if issue:
            subfields.append(('n', issue))
        if volume:
            subfields.append(('v', volume))
        if fpage and lpage:
            subfields.append(('c', '%s-%s' % (fpage, lpage)))
        elif page:
            subfields.append(('c', page))
        if year:
            subfields.append(('y', year))
        if article_type == 'correction':
            subfields.append(('m', 'Erratum'))
        elif article_type == 'addendum':
            subfields.append(('m', 'Addendum'))
        record_add_field(rec, '773', subfields=subfields)

        collections = self.get_collection(journal)
        for collection in collections:
            record_add_field(rec, '980', subfields=[collection])

        self._add_authors(rec)
        if article_type in ['correction', 'addendum']:
            related_article = self._get_related_article()
            if related_article:
                record_add_field(rec,
                                 '024',
                                 ind1='7',
                                 subfields=[('a', related_article),
                                            ('2', 'DOI')])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Exemple #28
0
def main(args):
    """Run the filtering."""
    if len(args) != 1:
        print("Usage: python bibfilter_oaipos2inspire.py input_filename")
        sys.exit(1)
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query, ))
        results = perform_request_search(p=query, of="id")

        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec,
                                 '856',
                                 ind1='4',
                                 subfields=[('u', url), ('y', 'PoS server')])
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', filename), ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url, ))
                break
        if not found:
            error_records.append(rec)

        # upload to FTP
        if not CFG_POS_DEBUG:
            tempfile_path = join(mkdtemp(), "{0}.xml".format(contribution))
            with open(tempfile_path, 'w') as tempfile:
                tempfile.write(record_xml_output(rec))
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename, )
    append_filename = "%s.append.xml" % (input_filename, )
    errors_filename = "%s.errors.xml" % (input_filename, )

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(
        error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl, )
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL, subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL, ))
Exemple #29
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        xml = self.get_article(f_path)
        rec = create_record()
        title = self.get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])

        record_add_field(rec, '260', subfields=[('c', self.get_publication_date(xml, logger))])
        journal, issn, volume, issue, first_page, last_page, year, doi = self.get_publication_information(xml)

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        authors = self.get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
            else:
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                    subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        page_count = self.get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        abstract = self.get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'), ('u', 'http://creativecommons.org/licenses/by/3.0/')])
        copyright = self.get_copyright(xml, logger)
        if copyright:
            record_add_field(rec, '542', subfields=[('f', copyright)])
        keywords = self.get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec, '084', ind1='1', subfields=[('a', keyword), ('9', 'PACS')])
        if keywords['other']:
            for keyword in keywords['other']:
                record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('n', issue),
                                                            ('c', pages),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if year:
                subfields.append(('y', year))
            if ext_link:
                subfields.append(('r', ext_link))
            # should we be strict about it?
            if title and volume and year and page:
                subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif not plain_text:
                subfields.append(('m', ('%s %s %s %s' % (title, volume, year, page))))
            if plain_text:
                subfields.append(('m', plain_text))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
        # record_add_field(rec, 'FFT', subfields=[('a', join(path, 'main.pdf'))])
        pdf_path = join(dirname(f_path), 'BodyRef', 'PDF', basename(f_path)[:-len('_nlm.xml')] + '.pdf')
        try:
            open(pdf_path)
            record_add_field(rec, 'FFT', subfields=[('a', pdf_path), ('n', 'main'), ('f', '.pdf;pdfa')])
        except:
            register_exception(alert_admin=True)
            logger.error("No PDF for paper: %s" % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)
def main(args):
    """Run the filtering."""
    if len(args) != 1:
        print("Usage: python bibfilter_oaipos2inspire.py input_filename")
        sys.exit(1)
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        url = urljoin(base_url, '/contribution?id=%s' % identifier)
        session = requests.session()
        try:
            r = session.get(url, timeout=60)
        except (ConnectionError, Timeout):
            register_exception()
            error_records.append(rec)
            continue
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('/pdf'):
                # handle relative URLs
                url = urljoin(base_url, url)
                found = True
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4', subfields=[
                    ('u', url),
                    ('y', 'PoS server')
                ])
                record_add_field(rec, 'FFT', subfields=[('a', filename),
                                                        ('t', 'PoS'),
                                                        ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)

        # upload to FTP
        if not CFG_POS_DEBUG:
            tempfile_path = join(mkdtemp(), "{0}.xml".format(contribution))
            with open(tempfile_path, 'w') as tempfile:
                tempfile.write(record_xml_output(rec))
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
    def get_record(self, fileName, ref_extract_callback=None):
        """
        Gets the Marc xml of the files in xaml_jp directory

        :param fileName: the name of the file to parse.
        :type fileName: string
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formated string
                                    of the reference.
        :type refextract_callback: callable

        :returns: a string with the marc xml version of the file.
        """
        self.document = parse(fileName)
        article_type = self._get_article_type()
        if article_type not in ['research-article',
                                'introduction',
                                'letter']:
            return ''
        rec = create_record()
        title, subtitle, notes = self._get_title()
        subfields = []
        if subtitle:
            subfields.append(('b', subtitle))
        if title:
            subfields.append(('a', title))
            record_add_field(rec, '245', subfields=subfields)
        subjects = self.document.getElementsByTagName('kwd')
        subjects = map(xml_to_text, subjects)
        for note_id in notes:
            note = self._get_note(note_id)
            if note:
                record_add_field(rec, '500', subfields=[('a', note)])
        for subject in subjects:
            record_add_field(rec, '650', ind1='1', ind2='7',
                             subfields=[('2', 'EDPSciences'),
                                        ('a', subject)])
        keywords = self._get_keywords()
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1', subfields=[('a', keyword),
                                                              ('9', 'author')])
        journal, volume, issue, year, date, doi, page,\
            fpage, lpage = self._get_publication_information()
        astronomy_journals = ['EAS Publ.Ser.', 'Astron.Astrophys.']
        if journal in astronomy_journals:
            record_add_field(rec, '650', ind1='1', ind2='7',
                             subfields=[('2', 'INSPIRE'),
                                        ('a', 'Astrophysics')])
        if date:
            record_add_field(rec, '260', subfields=[('c', date),
                                                    ('t', 'published')])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        abstract = self._get_abstract()
        abstract = self._format_abstract(abstract)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'EDPSciences')])
        license, license_type, license_url = self._get_license()
        subfields = []
        if license:
            subfields.append(('a', license))
        if license_url:
            subfields.append(('u', license_url))
        if subfields:
            record_add_field(rec, '540', subfields=subfields)
        if license_type == 'open-access':
            self._attach_fulltext(rec, doi)
        number_of_pages = self._get_page_count()
        if number_of_pages:
            record_add_field(rec, '300', subfields=[('a', number_of_pages)])
        c_holder, c_year, c_statement = self._get_copyright()
        if c_holder and c_year:
            record_add_field(rec, '542', subfields=[('d', c_holder),
                                                    ('g', c_year),
                                                    ('e', 'Article')])
        elif c_statement:
            record_add_field(rec, '542', subfields=[('f', c_statement),
                                                    ('e', 'Article')])
        subfields = []
        if journal:
            subfields.append(('p', journal))
        if issue:
            subfields.append(('n', issue))
        if volume:
            subfields.append(('v', volume))
        if fpage and lpage:
            subfields.append(('c', '%s-%s' % (fpage,
                                              lpage)))
        elif page:
            subfields.append(('c', page))
        if year:
            subfields.append(('y', year))
        record_add_field(rec, '773', subfields=subfields)
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        conference = ''
        for tag in self.document.getElementsByTagName('conference'):
            conference = xml_to_text(tag)
        if conference:
            record_add_field(rec, '980', subfields=[('a', 'ConferencePaper')])
            record_add_field(rec, '500', subfields=[('a', conference)])
        self._add_references(rec, ref_extract_callback)
        self._add_authors(rec)
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Exemple #32
0
 def _add_references(self, xml_doc, rec, refextract_callback=None):
     for label, authors, doi, issue, page, title, volume, year,\
             textref, ext_link, isjournal, comment, journal, publisher,\
             editors, book_title in self.get_references(xml_doc):
         subfields = []
         if textref and not authors:
             textref = textref.replace('\"', '\'')
             if refextract_callback:
                 ref_xml = refextract_callback(textref)
                 dom = xml.dom.minidom.parseString(ref_xml)
                 fields = dom.getElementsByTagName("datafield")[0]
                 fields = fields.getElementsByTagName("subfield")
                 for field in fields:
                     data = field.firstChild.data
                     code = field.getAttribute("code")
                     if code == 'r':
                         data = fix_dashes(data)
                     subfields.append((code, data))
                 if fields:
                     subfields.append(('9', 'refextract'))
             else:
                 subfields.append(('m', textref))
             if label:
                 label = re.sub("[\[\].)]", "", label)
                 subfields.append(('o', label))
             if subfields:
                 record_add_field(rec,
                                  '999',
                                  ind1='C',
                                  ind2='5',
                                  subfields=subfields)
         else:
             if doi:
                 subfields.append(('a', doi))
             for author in authors:
                 subfields.append(('h', author))
             if ext_link:
                 ext_link = fix_dashes(ext_link)
                 subfields.append(('r', ext_link))
             if title:
                 subfields.append(('t', title))
             elif textref:
                 subfields.append(('m', textref))
             if publisher:
                 subfields.append(('p', publisher))
             if volume:
                 subfields.append(('v', volume))
             if year:
                 subfields.append(('y', year))
             if comment:
                 subfields.append(('m', comment))
             for editor in editors:
                 subfields.append(('e', editor))
             if book_title:
                 subfields.append(('q', book_title))
             if label:
                 label = re.sub("[\[\].)]", "", label)
                 subfields.append(('o', label))
             if journal:
                 journal, vol = fix_journal_name(journal,
                                                 self.journal_mappings)
                 volume = vol + volume
                 if volume and page:
                     journal = journal + "," + volume + "," + page
                     subfields.append(('s', journal))
                 elif volume:
                     journal = journal + "," + volume
                     subfields.append(('s', journal))
                 else:
                     subfields.append(('s', journal))
             if textref:
                 subfields.append(('m', textref))
             if subfields:
                 record_add_field(rec,
                                  '999',
                                  ind1='C',
                                  ind2='5',
                                  subfields=subfields)
    def get_record(self, filename, ref_extract_callback=None):
        """Get the MARCXML of the files in xaml_jp directory.

        :param filename: the name of the file to parse.
        :type filename: string
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formated string
                                    of the reference.
        :type refextract_callback: callable

        :returns: a string with the marc xml version of the file.
        """
        self.document = parse(filename)

        article_type = self._get_article_type()
        if article_type not in ['research-article',
                                'corrected-article',
                                'original-article',
                                'introduction',
                                'letter',
                                'correction',
                                'addendum',
                                'review-article',
                                'rapid-communications']:
            return ""

        rec = create_record()
        title, subtitle, notes = self._get_title()
        subfields = []
        if subtitle:
            subfields.append(('b', subtitle))
        if title:
            title = fix_title_capitalization(title)
            subfields.append(('a', title))
            record_add_field(rec, '245', subfields=subfields)
        for note_id in notes:
            note = self._get_note(note_id)
            if note:
                record_add_field(rec, '500', subfields=[('a', note)])
        keywords = self._get_keywords()
        for keyword in keywords:
            record_add_field(rec, '653', ind1='1', subfields=[('a', keyword),
                                                              ('9', 'author')])
        journal, volume, issue, year, date, doi, page,\
            fpage, lpage = self._get_publication_information()
        if date:
            record_add_field(rec, '260', subfields=[('c', date),
                                                    ('t', 'published')])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        abstract = self._get_abstract()
        if abstract:
            abstract = convert_html_subscripts_to_latex(abstract)
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'World Scientific')])
        license, license_type, license_url = self._get_license()
        subfields = []
        if license:
            subfields.append(('a', license))
        if license_url:
            subfields.append(('u', license_url))
        if subfields:
            record_add_field(rec, '540', subfields=subfields)
        if license_type == 'open-access':
            self._attach_fulltext(rec, doi)
        number_of_pages = self._get_page_count()
        if number_of_pages:
            record_add_field(rec, '300', subfields=[('a', number_of_pages)])
        c_holder, c_year, c_statement = self._get_copyright()
        if c_holder and c_year:
            record_add_field(rec, '542', subfields=[('d', c_holder),
                                                    ('g', c_year),
                                                    ('e', 'Article')])
        elif c_statement:
            record_add_field(rec, '542', subfields=[('f', c_statement),
                                                    ('e', 'Article')])
        subfields = []
        if journal:
            subfields.append(('p', journal))
        if issue:
            subfields.append(('n', issue))
        if volume:
            subfields.append(('v', volume))
        if fpage and lpage:
            subfields.append(('c', '%s-%s' % (fpage,
                                              lpage)))
        elif page:
            subfields.append(('c', page))
        if year:
            subfields.append(('y', year))
        if article_type == 'correction':
            subfields.append(('m', 'Erratum'))
        elif article_type == 'addendum':
            subfields.append(('m', 'Addendum'))
        record_add_field(rec, '773', subfields=subfields)

        collections = self.get_collection(journal)
        for collection in collections:
            record_add_field(rec, '980', subfields=[collection])

        self._add_authors(rec)
        if article_type in ['correction',
                            'addendum']:
            related_article = self._get_related_article()
            if related_article:
                record_add_field(rec, '024', ind1='7', subfields=[('a', related_article),
                                                                  ('2', 'DOI')])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            message = "Found a bad char in the file for the article " + doi
            sys.stderr.write(message)
            return ""
Exemple #34
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        #path = abspath(join(f_path, pardir))
        xml = self.get_article(f_path)
        rec = create_record()
        title = self.get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        publication_date = self.get_publication_date(xml)
        if publication_date:
            record_add_field(rec, '260', subfields=[('c', publication_date)])
        journal, issn, volume, issue, first_page, pages, year, doi = self.get_publication_information(xml)
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        arxiv_id = self.get_arxiv_id(xml)
        if arxiv_id:
            record_add_field(rec, '037', subfields=[('a', arxiv_id), ('9', 'arXiv')])
        if logger:
            logger.info("Creating record: %s %s" % (f_path, doi))
        authors = self.get_authors(xml)
        first_author = True
        for author in authors:
            subfields = [('a', '%s, %s' % (author['surname'], author.get('given_name') or author.get('initials')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = self.get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-4.0'), ('u', 'http://creativecommons.org/licenses/by/4.0/')])
        copyright = self.get_copyright(xml)
        if copyright:
            record_add_field(rec, '542', subfields=[('f', copyright)])
        keywords = self.get_keywords(xml)
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        record_add_field(rec, "300", subfields=[('a', pages)])

        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('c', first_page),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)
        references = self.get_references(xml)
        for label, authors, doi, issue, page, title, volume, year in references:
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if page:
                subfields.append(('p', page))
            subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
            if title:
                subfields.append(('t', title))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)

        folder_name = join('/', *(f_path.split('/')[0:-1]))
        pdf_name = f_path.split('/')[-1].rstrip('.xml.scoap') + '.pdf'
        pdf_path = join(folder_name, 'BodyRef/PDF', pdf_name)
        print pdf_path
        if exists(pdf_path):
            record_add_field(rec, 'FFT', subfields=[('a', pdf_path), ('n', 'main'), ('f', '.pdf;pdfa')])
        else:
            # Don't know why it doesn't work????????????
            # register_exception(alert_admin=True)
            if logger:
                logger.error("Record %s doesn't contain PDF file." % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', self.get_body_ref(xml)), ('n', 'main')])
        record_add_field(rec, '980', subfields=[('a', collection), ('b', publisher)])
        return record_xml_output(rec)
    def get_record(self, xml_file):
        """ Reads a xml file in JATS format and returns
            a xml string in marc format """
        self.document = parse(xml_file)

        if get_value_in_tag(self.document, "meta"):
            raise ApsPackageXMLError("The XML format of %s is not correct"
                                     % (xml_file,))
        page_count = self._get_page_count()
        rec = create_record()
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        pacscodes = self._get_pacscodes()
        for pacscode in pacscodes:
            record_add_field(rec, '084', subfields=[('2', 'PACS'),
                                                    ('a', pacscode)])
        subject = self._get_subject()
        if subject:
            record_add_field(rec, '650', ind1='1', ind2='7', subfields=[('2', 'APS'),
                                                                        ('a', subject)])
        keywords = self._get_keywords()
        if keywords:
            record_add_field(rec, '653', ind1='1', subfields=[('a', ', '.join(keywords)),
                                                              ('9', 'author')])
        title, subtitle, _ = self._get_title()
        subfields = []
        if subtitle:
            subfields.append(('b', subtitle))
        if title:
            subfields.append(('a', title))
            record_add_field(rec, '245', subfields=subfields)
        journal, volume, issue, year, start_date, doi,\
            article_id, _, _ = self._get_publication_information()
        if start_date:
            record_add_field(rec, '260', subfields=[('c', start_date),
                                                    ('t', 'published')])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        abstract = self._get_abstract()
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'APS')])
        license, license_type, license_url = self._get_license()
        subfields = []
        if license:
            subfields.append(('a', license))
        if license_url:
            subfields.append(('u', license_url))
        if subfields:
            record_add_field(rec, '540', subfields=subfields)
        c_holder, c_year, c_statement = self._get_copyright()
        c_holder, c_year, c_statement = self._get_copyright()
        if c_holder and c_year:
            record_add_field(rec, '542', subfields=[('d', c_holder),
                                                    ('g', c_year),
                                                    ('e', 'Article')])
        elif c_statement:
            record_add_field(rec, '542', subfields=[('f', c_statement),
                                                    ('e', 'Article')])
        record_add_field(rec, '773', subfields=[('p', journal),
                                                ('v', volume),
                                                ('n', issue),
                                                ('y', year),
                                                ('c', article_id)])
        record_add_field(rec, '980', subfields=[('a', 'HEP')])
        record_add_field(rec, '980', subfields=[('a', 'Citeable')])
        record_add_field(rec, '980', subfields=[('a', 'Published')])
        self._add_authors(rec)
        self._add_references(rec)
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            sys.stderr.write("""Found a bad char in the file
                                for the article """ + doi)
            return ""
Exemple #36
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        xml = self.get_article(f_path)
        rec = create_record()
        title = self.get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])

        record_add_field(rec, '260', subfields=[('c', self.get_publication_date(xml, logger))])
        journal, issn, volume, issue, first_page, last_page, year, doi = self.get_publication_information(xml)

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        authors = self.get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
            else:
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                    subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        page_count = self.get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        abstract = self.get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'), ('u', 'http://creativecommons.org/licenses/by/3.0/')])
        copyright = self.get_copyright(xml, logger)
        if copyright:
            record_add_field(rec, '542', subfields=[('f', copyright)])
        keywords = self.get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec, '084', ind1='1', subfields=[('a', keyword), ('9', 'PACS')])
        if keywords['other']:
            for keyword in keywords['other']:
                record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('n', issue),
                                                            ('c', pages),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        for label, authors, doi, issue, page, page_last, title, volume, year, ext_link, plain_text in self.references:
            subfields = []
            if doi:
                subfields.append(('a', doi))
            for author in authors:
                subfields.append(('h', author))
            if issue:
                subfields.append(('n', issue))
            if label:
                subfields.append(('o', label))
            if year:
                subfields.append(('y', year))
            if ext_link:
                subfields.append(('r', ext_link))
            # should we be strict about it?
            if title and volume and year and page:
                subfields.append(('s', '%s %s (%s) %s' % (title, volume, year, page)))
            elif not plain_text:
                subfields.append(('m', ('%s %s %s %s' % (title, volume, year, page))))
            if plain_text:
                subfields.append(('m', plain_text))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
        # record_add_field(rec, 'FFT', subfields=[('a', join(path, 'main.pdf'))])
        pdf_path = join(dirname(f_path), 'BodyRef', 'PDF', basename(f_path)[:-len('_nlm.xml')] + '.pdf')
        try:
            open(pdf_path)
            record_add_field(rec, 'FFT', subfields=[('a', pdf_path), ('n', 'main'), ('f', '.pdf;pdfa')])
        except:
            register_exception(alert_admin=True)
            logger.error("No PDF for paper: %s" % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)