def check_records(records, empty=False):
    """Rebuild the author fields (100/700) of Elsevier records from stored XML.

    For every record accepted by ``is_elsevier`` each attached document id is
    searched for in all sub-folders of the files directory.  The file returned
    by ``get_latest_file`` is parsed, the existing ``100``/``700`` fields are
    removed through ``delete_fields`` and one field per extracted author is
    added: the first author as ``100__``, every further one as ``700__``.

    :param records: iterable of records exposing ``record_id``, ``warn`` and
                    ``add_field``.
    :param empty: not used by this function; kept so existing callers keep
                  working.
    """
    fields = ['100', '700']
    filepath = '/opt/invenio/var/data/files/'
    filepaths = os.listdir(filepath)

    for record in records:
        if not is_elsevier(record):
            continue

        for doc_id in get_doc_ids(int(record.record_id)):
            # The document may live in any of the sub-folders; take the first
            # folder that yields a file.
            latest_file = None
            for folder in filepaths:
                try:
                    latest_file = get_latest_file(
                        os.path.join(filepath, folder, str(doc_id), ''))
                    if latest_file:
                        break
                except Exception:
                    # Parenthesised form is valid on Python 2 and Python 3.
                    print("No folder with name %s in %s directory"
                          % (doc_id, folder))

            try:
                xml = parse(latest_file)
            except Exception:
                record.warn("Problem parssing XML file. Aborting")
                break
            authors = get_authors(xml)

            delete_fields(record, fields)
            # The author fields were just removed (assuming delete_fields
            # drops the listed tags), so the next author written has to be
            # the main entry again - also for a second document of the same
            # record.
            first_author = True

            for author in authors:
                field = '100' if first_author else '700'
                first_author = False

                # Fall back to an empty string so a missing given name is not
                # rendered as the literal text "None".
                given = (author.get('given_name') or author.get('initials')
                         or '')
                subfields = [('a', '%s, %s' % (author['surname'], given))]

                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))

                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))

                    add_nations_field(subfields)

                if author.get('email'):
                    subfields.append(('m', author['email']))

                record.add_field(field + '__',
                                 value='',
                                 subfields=subfields)
def check_records(records):
    """Rebuild the author fields (100/700) of Springer records from their XML.

    For every record accepted by ``is_springer`` the latest ``.xml`` files
    attached to it are read, parsed with ``JATSParser`` or ``NLMParser``
    (chosen by the DTD marker found in the content) and the extracted authors
    replace the existing ``100``/``700`` fields.

    The existing author fields are removed only after the XML was parsed
    successfully, so a broken file does not leave the record without authors.

    :param records: iterable of records exposing ``record_id``, ``warn``,
                    ``add_field`` and item deletion by tag.
    """
    for record in records:
        if not is_springer(record):
            continue

        rec_docs = BibRecDocs(int(record.record_id)).list_latest_files()
        for doc in rec_docs:
            if doc.get_format() != '.xml':
                continue

            # Context manager guarantees the handle is closed again.
            with open(doc.get_full_path()) as xml_file:
                content = xml_file.read()

            try:
                if "-//NLM//DTD JATS" in content:
                    parser = JATSParser()
                else:
                    parser = NLMParser()
                authors = parser.get_authors(parseString(content))
            except Exception:
                record.warn('Problem with parsing XML.')
                continue

            # Remove each tag on its own: a record without a 100 field must
            # still lose its stale 700 fields.
            removed = False
            for tag in ('100', '700'):
                try:
                    del record[tag]
                    removed = True
                except Exception:
                    # Tag not present on this record - nothing to delete.
                    pass
            if removed:
                record.amended = True

            first_author = True
            for author in authors:
                if author.get('surname'):
                    name = '%s, %s' % (author.get('surname'),
                                       author.get('given_name')
                                       or author.get('initials', ''))
                else:
                    name = '%s' % (author.get('name', ''))
                subfields = [('a', name)]

                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))
                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))

                add_nations_field(subfields)

                if author.get('email'):
                    subfields.append(('m', author['email']))

                # First author is the main entry, the rest are added entries.
                tag = '100__' if first_author else '700__'
                first_author = False
                record.add_field(tag,
                                 value='',
                                 subfields=subfields)
def check_records(records):
    """Replace the 100/700 author fields of Springer records using their XML.

    Records rejected by ``is_springer`` are left untouched.  For the others,
    every latest file in ``.xml`` format is read and handed to ``JATSParser``
    when the JATS DTD marker occurs in the content, otherwise to
    ``NLMParser``.  The authors returned by the parser are written back as
    one ``100__`` field (first author) followed by ``700__`` fields.

    Existing author fields are dropped only once parsing succeeded; a file
    that cannot be parsed produces a warning and leaves the record as it was.

    :param records: iterable of records exposing ``record_id``, ``warn``,
                    ``add_field`` and item deletion by tag.
    """
    for record in records:
        if not is_springer(record):
            continue

        rec_doc = BibRecDocs(int(record.record_id))
        for doc in rec_doc.list_latest_files():
            if doc.get_format() != '.xml':
                continue

            # Read through a context manager so the file is always closed.
            with open(doc.get_full_path()) as handle:
                content = handle.read()

            try:
                parser = JATSParser() if "-//NLM//DTD JATS" in content else NLMParser()
                authors = parser.get_authors(parseString(content))
            except Exception:
                record.warn('Problem with parsing XML.')
                continue

            # Delete the two tags independently so that a missing 100 field
            # does not prevent the 700 fields from being removed.
            deleted_any = False
            for tag in ('100', '700'):
                try:
                    del record[tag]
                except Exception:
                    # The record has no field with this tag.
                    continue
                deleted_any = True
            if deleted_any:
                record.amended = True

            first_author = True
            for author in authors:
                if author.get('surname'):
                    subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
                else:
                    subfields = [('a', '%s' % (author.get('name', '')))]
                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))
                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))

                add_nations_field(subfields)

                if author.get('email'):
                    subfields.append(('m', author['email']))

                # Main entry for the first author, added entries afterwards.
                if first_author:
                    record.add_field('100__', value='', subfields=subfields)
                    first_author = False
                else:
                    record.add_field('700__', value='', subfields=subfields)
def check_records(records, empty=False):
    """Rebuild the author fields (100/700) of Elsevier records from stored XML.

    For every record accepted by ``is_elsevier`` the latest file of each
    attached document is parsed, the existing ``100``/``700`` fields are
    removed through ``delete_fields`` and one field per extracted author is
    added: the first author as ``100__``, every further one as ``700__``.

    :param records: iterable of records exposing ``record_id`` and
                    ``add_field``.
    :param empty: not used by this function; kept so existing callers keep
                  working.
    """
    fields = ['100', '700']
    filepath = "/opt/invenio/var/data/files/g0/"

    for record in records:
        if not is_elsevier(record):
            continue

        for doc_id in get_doc_ids(int(record.record_id)):
            latest_file = get_latest_file(filepath + str(doc_id) + '/')
            authors = get_authors(parse(latest_file))

            delete_fields(record, fields)
            # Must be reset here and not once before the records loop:
            # otherwise only the very first record processed would ever
            # receive a 100 field.
            first_author = True

            for author in authors:
                field = '100' if first_author else '700'
                first_author = False

                # Fall back to an empty string so a missing given name is not
                # rendered as the literal text "None".
                given = (author.get('given_name') or author.get('initials')
                         or '')
                subfields = [('a', '%s, %s' % (author['surname'], given))]

                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))

                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))

                    add_nations_field(subfields)

                if author.get('email'):
                    subfields.append(('m', author['email']))

                record.add_field(field + '__',
                                 value='',
                                 subfields=subfields)
Example #5
0
def check_records(records, empty=False):
    """Replace the 100/700 author fields of Elsevier records using stored XML.

    Records rejected by ``is_elsevier`` are skipped.  For the others, the
    file returned by ``get_latest_file`` for each document id is parsed, the
    tags listed in ``fields`` are passed to ``delete_fields`` and the authors
    found in the XML are added again, the first one as ``100__`` and the
    remaining ones as ``700__``.

    :param records: iterable of records exposing ``record_id`` and
                    ``add_field``.
    :param empty: not used by this function; kept so existing callers keep
                  working.
    """
    fields = ['100', '700']
    filepath = "/opt/invenio/var/data/files/g0/"

    for record in records:
        if not is_elsevier(record):
            continue

        doc_ids = get_doc_ids(int(record.record_id))
        for doc_id in doc_ids:
            latest_file = get_latest_file(filepath + str(doc_id) + '/')
            xml = parse(latest_file)
            authors = get_authors(xml)

            delete_fields(record, fields)

            # Start over for each rebuilt author list.  Initialising this
            # flag once before the records loop left every record after the
            # first one without a 100 field.
            first_author = True
            for author in authors:
                if first_author:
                    field = '100'
                    first_author = False
                else:
                    field = '700'

                # An absent given name becomes '' instead of the text "None".
                given_name = (author.get('given_name')
                              or author.get('initials')
                              or '')
                subfields = [('a', '%s, %s' % (author['surname'], given_name))]

                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))

                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))

                    add_nations_field(subfields)

                if author.get('email'):
                    subfields.append(('m', author['email']))

                record.add_field(field + '__',
                                 value='',
                                 subfields=subfields)
    def get_record(self, path=None, no_pdf=False,
                   test=False, refextract_callback=None):
        """Convert a record to MARCXML format.

        :param path: path to a record.
        :type path: string
        :param no_pdf: when true, the non-CONSYN branch does not look for
                       PDF/XML files and adds no FFT, 583 or SCOAP3 980
                       fields.
        :type no_pdf: bool
        :param test: flag to determine if it is a test call; when true, the
                     CONSYN branch adds no 856/FFT fields.
        :type test: bool
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formatted string
                                    of the reference.
        :type refextract_callback: callable

        :returns: marcxml formatted string, or an empty string when the
                  record cannot be serialised because of a bad character.
        """
        xml_doc = self.get_article(path)
        rec = create_record()
        title = self.get_title(xml_doc)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        (journal, dummy, volume, issue, first_page, last_page, year,
         start_date, doi) = self.get_publication_information(xml_doc, path)
        if not journal:
            journal = self.get_article_journal(xml_doc)
        if start_date:
            record_add_field(rec, '260', subfields=[('c', start_date),
                                                    ('t', 'published')])
        else:
            # No publication date available: fall back to today's date.
            record_add_field(
                rec, '260', subfields=[('c', time.strftime('%Y-%m-%d'))])
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                              ('2', 'DOI')])
        license_name, license_url = self.get_license(xml_doc)
        if license_name and license_url:
            record_add_field(rec, '540', subfields=[('a', license_name),
                                                    ('u', license_url)])
        elif license_url:
            record_add_field(rec, '540', subfields=[('u', license_url)])
        self.logger.info("Creating record: %s %s" % (path, doi))

        authors = self.get_authors(xml_doc)
        first_author = True
        for author in authors:
            # An absent given name becomes '' instead of the text "None".
            given_name = (author.get('given_name') or author.get('initials')
                          or '')
            subfields = [('a', '%s, %s' % (author['surname'], given_name))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = self.get_abstract(xml_doc)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract),
                                                    ('9', 'Elsevier')])
        record_copyright = self.get_copyright(xml_doc)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = self.get_keywords(xml_doc)
        if self.CONSYN:
            for tag in xml_doc.getElementsByTagName('ce:collaboration'):
                collaboration = get_value_in_tag(tag, 'ce:text')
                if collaboration:
                    record_add_field(rec, '710',
                                     subfields=[('g', collaboration)])

            # We add subjects also as author keywords
            subjects = xml_doc.getElementsByTagName('dct:subject')
            for subject in subjects:
                for listitem in subject.getElementsByTagName('rdf:li'):
                    keyword = xml_to_text(listitem)
                    if keyword not in keywords:
                        keywords.append(keyword)
            if keywords:
                for keyword in keywords:
                    record_add_field(rec, '653', ind1='1',
                                     subfields=[('a', keyword),
                                                ('9', 'author')])
            journal, dummy = fix_journal_name(journal.strip(),
                                              self.journal_mappings)
            subfields = []
            doctype = self.get_doctype(xml_doc)
            try:
                page_count = int(last_page) - int(first_page) + 1
            except (ValueError, TypeError):
                # Non-numeric or missing page numbers: no page count field.
                pass
            else:
                record_add_field(rec, '300',
                                 subfields=[('a', str(page_count))])
            if doctype == 'err':
                subfields.append(('m', 'Erratum'))
            elif doctype == 'add':
                subfields.append(('m', 'Addendum'))
            elif doctype == 'pub':
                subfields.append(('m', 'Publisher Note'))
            elif doctype == 'rev':
                record_add_field(rec, '980', subfields=[('a', 'Review')])
            if journal:
                subfields.append(('p', journal))
            if first_page and last_page:
                subfields.append(('c', '%s-%s' %
                                       (first_page, last_page)))
            elif first_page:
                subfields.append(('c', first_page))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            record_add_field(rec, '773', subfields=subfields)
            if not test:
                if license_name:
                    # The URL id is the file name without its last four
                    # characters (presumably the extension).
                    url = 'http://www.sciencedirect.com/science/article/pii/'\
                          + path.split('/')[-1][:-4]
                    record_add_field(rec, '856', ind1='4',
                                     subfields=[('u', url),
                                                ('y', 'Elsevier server')])
                    record_add_field(rec, 'FFT', subfields=[('a', path),
                                                            ('t', 'INSPIRE-PUBLIC'),
                                                            ('d', 'Fulltext')])
                else:
                    record_add_field(rec, 'FFT', subfields=[('a', path),
                                                            ('t', 'Elsevier'),
                                                            ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
        else:
            licence_url = 'http://creativecommons.org/licenses/by/3.0/'
            record_add_field(rec,
                             '540',
                             subfields=[('a', 'CC-BY-3.0'),
                                        ('u', licence_url)])
            if keywords:
                for keyword in keywords:
                    record_add_field(
                        rec, '653', ind1='1', subfields=[('a', keyword),
                                                         ('9', 'author')])

            pages = ''
            if first_page and last_page:
                pages = '{0}-{1}'.format(first_page, last_page)
            elif first_page:
                pages = first_page

            # Keep only subfields with a real value; a list comprehension
            # gives a list on Python 2 and Python 3 alike.
            subfields = [subfield
                         for subfield in [('p', journal),
                                          ('v', volume),
                                          ('n', issue),
                                          ('c', pages),
                                          ('y', year)]
                         if subfield[1] and subfield[1] != '-']

            record_add_field(rec, '773', subfields=subfields)
            if not no_pdf:
                from invenio.search_engine import perform_request_search
                # The query used to end with a stray, unbalanced '"'.
                query = '0247_a:"%s" AND NOT 980:DELETED' % (doi,)
                prev_version = perform_request_search(p=query)

                old_pdf = False

                if prev_version:
                    from invenio.bibdocfile import BibRecDocs
                    prev_rec = BibRecDocs(prev_version[0])
                    try:
                        pdf_path = prev_rec.get_bibdoc('main')
                        pdf_path = pdf_path.get_file(
                            ".pdf;pdfa", exact_docformat=True)
                        pdf_path = pdf_path.fullpath
                        old_pdf = True
                        record_add_field(rec, 'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        message = ('Leaving previously delivered PDF/A for: '
                                   '%s' % (doi,))
                        self.logger.info(message)
                    except Exception:
                        # Best effort: the previous version may simply have
                        # no PDF/A attached.
                        self.logger.debug(
                            'Could not reuse previously delivered PDF/A '
                            'for: %s' % (doi,))
                try:
                    if exists(join(path, 'main_a-2b.pdf')):
                        pdf_path = join(path, 'main_a-2b.pdf')
                        record_add_field(rec, 'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        self.logger.debug('Adding PDF/A to record: %s'
                                          % (doi,))
                    elif exists(join(path, 'main.pdf')):
                        pdf_path = join(path, 'main.pdf')
                        record_add_field(rec, 'FFT', subfields=[('a', pdf_path)])
                    else:
                        if not old_pdf:
                            message = ("Record %s doesn't contain PDF file."
                                       % (doi,))
                            self.logger.warning(message)
                            # Raised and caught right below so that
                            # register_exception has an active exception.
                            raise MissingFFTError(message)
                except MissingFFTError:
                    message = "Elsevier paper: %s is missing PDF." % (doi,)
                    register_exception(alert_admin=True, prefix=message)
                version = self.get_elsevier_version(find_package_name(path))
                record_add_field(rec, '583', subfields=[('l', version)])
                xml_path = join(path, 'main.xml')
                record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
                record_add_field(rec, '980', subfields=[('a', 'SCOAP3'),
                                                        ('b', 'Elsevier')])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            # %-formatting keeps this handler from failing on a missing DOI.
            message = ("Found a bad char in the file for the article %s"
                       % (doi,))
            sys.stderr.write(message)
            return ""
Example #7
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        """Build the MARCXML record for the article stored at ``f_path``.

        Metadata is extracted through the ``NLMParser`` parent
        implementation; the journal name is always overridden with "PTEP".

        :param f_path: path to the article XML file.  The PDF is expected
                       next to it (same name with ``.pdf``) and the PDF/A in
                       the ``archival_pdfs`` sub-directory
                       (``<name>-hires.pdf``).
        :param publisher: used as source of the abstract (520 $9) and as
                          980 $b.
        :param collection: used as 980 $a.
        :param logger: optional logger; nothing is logged when it is None.

        :returns: the value returned by ``record_xml_output`` for the record.
        """
        parent = super(NLMParser, self)
        xml = parent.get_article(f_path)
        rec = create_record()
        title = parent.get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        record_add_field(rec, '260', subfields=[('c', parent.get_publication_date(xml, logger))])
        journal, issn, volume, issue, first_page, last_page, year, doi = parent.get_publication_information(xml)
        journal = "PTEP"  # Let's override the journal information

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        page_count = parent.get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        arxiv = self.get_arxiv_id(xml)
        if arxiv:
            record_add_field(rec, '037', subfields=[('9', 'arXiv'), ('a', format_arxiv_id(arxiv))])
        authors = parent.get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
            else:
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = parent.get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'), ('u', 'http://creativecommons.org/licenses/by/3.0/')])
        record_copyright = parent.get_copyright(xml, logger)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = parent.get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec, '084', ind1='1', subfields=[('a', keyword), ('9', 'PACS')])

        ## Oxford is giving us bad keywords. Better ignore them.
        #if keywords['other']:
            #for keyword in keywords['other']:
                #record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            # No page range: use the electronic location id instead.
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        # Keep only subfields with a real value; a list comprehension gives
        # a list on Python 2 and Python 3 alike.
        subfields = [subfield
                     for subfield in [('p', journal),
                                      ('v', volume),
                                      ('n', issue),
                                      ('c', pages),
                                      ('y', year)]
                     if subfield[1] and subfield[1] != '-']
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        # The reference values get their own names: rebinding ``doi`` here
        # made the "missing PDF" alerts below report the DOI of the last
        # reference instead of the DOI of the article.
        for (label, ref_authors, ref_doi, ref_issue, page, page_last,
             ref_title, ref_volume, ref_year, ext_link,
             plain_text) in self.references:
            subfields = []
            if ref_doi:
                subfields.append(('a', ref_doi))
            for ref_author in ref_authors:
                subfields.append(('h', ref_author))
            if ref_issue:
                subfields.append(('n', ref_issue))
            if label:
                subfields.append(('o', label))
            if ref_year:
                subfields.append(('y', ref_year))
            if ext_link:
                subfields.append(('r', ext_link))
            # should we be strict about it?
            if ref_title and ref_volume and ref_year and page:
                subfields.append(('s', '%s %s (%s) %s' % (ref_title, ref_volume, ref_year, page)))
            elif not plain_text:
                subfields.append(('m', ('%s %s %s %s' % (ref_title, ref_volume, ref_year, page))))
            if plain_text:
                subfields.append(('m', plain_text))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
        f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
        f_path_pdfa = join(dirname(f_path), 'archival_pdfs', basename(f_path)[:-len('.xml')] + '-hires.pdf')
        if exists(f_path_pdf):
            record_add_field(rec, 'FFT', subfields=[('a', f_path_pdf), ('n', 'main')])
        else:
            # Raised and caught on the spot so that register_exception has
            # an active exception to report.
            try:
                raise MissingFFTError
            except MissingFFTError:
                register_exception(alert_admin=True, prefix="Oxford paper: %s is missing PDF." % (doi,))
                if logger:
                    logger.warning("Record %s doesn't contain PDF file." % (doi,))
        if exists(f_path_pdfa):
            record_add_field(rec, 'FFT', subfields=[('a', f_path_pdfa), ('n', 'main'), ('f', '.pdf;pdfa')])
        else:
            try:
                raise MissingFFTError
            except MissingFFTError:
                register_exception(alert_admin=True, prefix="Oxford paper: %s is missing PDF/A." % (doi,))
                if logger:
                    logger.warning("Record %s doesn't contain PDF/A file." % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)
Example #8
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        """Build the MARCXML record for the article stored at ``f_path``.

        :param f_path: path to the article XML file.  The PDF is expected at
                       ``BodyRef/PDF/<name>.pdf`` next to it, where
                       ``<name>`` is the file name with its last
                       ``len('_nlm.xml')`` characters removed.
        :param publisher: used as source of the abstract (520 $9) and as
                          980 $b.
        :param collection: used as 980 $a.
        :param logger: optional logger; nothing is logged when it is None.

        :returns: the value returned by ``record_xml_output`` for the record.
        """
        xml = self.get_article(f_path)
        rec = create_record()
        title = self.get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])

        record_add_field(rec, '260', subfields=[('c', self.get_publication_date(xml, logger))])
        journal, issn, volume, issue, first_page, last_page, year, doi = self.get_publication_information(xml)

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        authors = self.get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' % (author.get('surname'), author.get('given_name') or author.get('initials', '')))]
            else:
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        page_count = self.get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        abstract = self.get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-3.0'), ('u', 'http://creativecommons.org/licenses/by/3.0/')])
        record_copyright = self.get_copyright(xml, logger)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = self.get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec, '084', ind1='1', subfields=[('a', keyword), ('9', 'PACS')])
        if keywords['other']:
            for keyword in keywords['other']:
                record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            # No page range: use the electronic location id instead.
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        # Keep only subfields with a real value; a list comprehension gives
        # a list on Python 2 and Python 3 alike.
        subfields = [subfield
                     for subfield in [('p', journal),
                                      ('v', volume),
                                      ('n', issue),
                                      ('c', pages),
                                      ('y', year)]
                     if subfield[1] and subfield[1] != '-']
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        # The reference values get their own names: rebinding ``doi`` here
        # made the "No PDF" message below report the DOI of the last
        # reference instead of the DOI of the article.
        for (label, ref_authors, ref_doi, ref_issue, page, page_last,
             ref_title, ref_volume, ref_year, ext_link,
             plain_text) in self.references:
            subfields = []
            if ref_doi:
                subfields.append(('a', ref_doi))
            for ref_author in ref_authors:
                subfields.append(('h', ref_author))
            if ref_issue:
                subfields.append(('n', ref_issue))
            if label:
                subfields.append(('o', label))
            if ref_year:
                subfields.append(('y', ref_year))
            if ext_link:
                subfields.append(('r', ext_link))
            # should we be strict about it?
            if ref_title and ref_volume and ref_year and page:
                subfields.append(('s', '%s %s (%s) %s' % (ref_title, ref_volume, ref_year, page)))
            elif not plain_text:
                subfields.append(('m', ('%s %s %s %s' % (ref_title, ref_volume, ref_year, page))))
            if plain_text:
                subfields.append(('m', plain_text))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)
        # record_add_field(rec, 'FFT', subfields=[('a', join(path, 'main.pdf'))])
        pdf_path = join(dirname(f_path), 'BodyRef', 'PDF', basename(f_path)[:-len('_nlm.xml')] + '.pdf')
        try:
            # Opening is only a readability probe; the context manager
            # closes the handle again right away.
            with open(pdf_path):
                pass
        except (IOError, OSError):
            register_exception(alert_admin=True)
            if logger:
                logger.error("No PDF for paper: %s" % (doi,))
        else:
            record_add_field(rec, 'FFT', subfields=[('a', pdf_path), ('n', 'main'), ('f', '.pdf;pdfa')])
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)
Example #9
0
    def get_record(self,
                   path=None,
                   no_pdf=False,
                   test=False,
                   refextract_callback=None):
        """Convert a record to MARCXML format.

        :param path: path to a record.
        :type path: string
        :param no_pdf: outside CONSYN mode, a true value skips the whole
                       fulltext step: no PDF/XML FFT fields, no 583
                       version field and no 980 SCOAP3 field are added.
        :type no_pdf: bool
        :param test: flag to determine if it is a test call. In CONSYN
                     mode a test call does not add the 856/FFT fields.
        :type test: bool
        :param refextract_callback: callback to be used to extract
                                    unstructured references. It should
                                    return a marcxml formated string
                                    of the reference.
        :type refextract_callback: callable

        :returns: marcxml formated string, or an empty string when the
                  record cannot be serialized (UnicodeDecodeError).
        """
        xml_doc = self.get_article(path)
        rec = create_record()
        title = self.get_title(xml_doc)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        (journal, dummy, volume, issue, first_page, last_page, year,
         start_date, doi) = self.get_publication_information(xml_doc, path)
        if not journal:
            journal = self.get_article_journal(xml_doc)
        if start_date:
            record_add_field(rec,
                             '260',
                             subfields=[('c', start_date), ('t', 'published')])
        else:
            # No start date extracted: fall back to the current date.
            record_add_field(rec,
                             '260',
                             subfields=[('c', time.strftime('%Y-%m-%d'))])
        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        license, license_url = self.get_license(xml_doc)
        if license and license_url:
            record_add_field(rec,
                             '540',
                             subfields=[('a', license), ('u', license_url)])
        elif license_url:
            record_add_field(rec, '540', subfields=[('u', license_url)])
        self.logger.info("Creating record: %s %s" % (path, doi))
        authors = self.get_authors(xml_doc)
        first_author = True
        for author in authors:
            author_name = (author['surname'], author.get('given_name')
                           or author.get('initials'))
            subfields = [('a', '%s, %s' % author_name)]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            # The first author goes to 100, every other one to 700.
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = self.get_abstract(xml_doc)
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', 'Elsevier')])
        record_copyright = self.get_copyright(xml_doc)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = self.get_keywords(xml_doc)
        if self.CONSYN:
            for tag in xml_doc.getElementsByTagName('ce:collaboration'):
                collaboration = get_value_in_tag(tag, 'ce:text')
                if collaboration:
                    record_add_field(rec,
                                     '710',
                                     subfields=[('g', collaboration)])

            # We add subjects also as author keywords
            subjects = xml_doc.getElementsByTagName('dct:subject')
            for subject in subjects:
                for listitem in subject.getElementsByTagName('rdf:li'):
                    keyword = xml_to_text(listitem)
                    if keyword not in keywords:
                        keywords.append(keyword)
            for keyword in keywords:
                record_add_field(rec,
                                 '653',
                                 ind1='1',
                                 subfields=[('a', keyword), ('9', 'author')])
            journal, dummy = fix_journal_name(journal.strip(),
                                              self.journal_mappings)
            subfields = []
            doctype = self.get_doctype(xml_doc)
            try:
                page_count = int(last_page) - int(first_page) + 1
                record_add_field(rec,
                                 '300',
                                 subfields=[('a', str(page_count))])
            except (TypeError, ValueError):
                # Pages are missing (None) or not numeric: no 300 field.
                pass
            if doctype == 'err':
                subfields.append(('m', 'Erratum'))
            elif doctype == 'add':
                subfields.append(('m', 'Addendum'))
            elif doctype == 'pub':
                subfields.append(('m', 'Publisher Note'))
            elif doctype == 'rev':
                record_add_field(rec, '980', subfields=[('a', 'Review')])
            if journal:
                subfields.append(('p', journal))
            if first_page and last_page:
                subfields.append(('c', '%s-%s' % (first_page, last_page)))
            elif first_page:
                subfields.append(('c', first_page))
            if volume:
                subfields.append(('v', volume))
            if year:
                subfields.append(('y', year))
            record_add_field(rec, '773', subfields=subfields)
            if not test:
                if license:
                    # The URL is built from the last path component of
                    # ``path`` with its final four characters removed.
                    url = 'http://www.sciencedirect.com/science/article/pii/'\
                          + path.split('/')[-1][:-4]
                    record_add_field(rec,
                                     '856',
                                     ind1='4',
                                     subfields=[('u', url),
                                                ('y', 'Elsevier server')])
                    record_add_field(rec,
                                     'FFT',
                                     subfields=[('a', path),
                                                ('t', 'INSPIRE-PUBLIC'),
                                                ('d', 'Fulltext')])
                else:
                    record_add_field(rec,
                                     'FFT',
                                     subfields=[('a', path), ('t', 'Elsevier'),
                                                ('o', 'HIDDEN')])
            record_add_field(rec, '980', subfields=[('a', 'HEP')])
            record_add_field(rec, '980', subfields=[('a', 'Citeable')])
            record_add_field(rec, '980', subfields=[('a', 'Published')])
            self._add_references(xml_doc, rec, refextract_callback)
        else:
            licence = 'http://creativecommons.org/licenses/by/3.0/'
            record_add_field(rec,
                             '540',
                             subfields=[('a', 'CC-BY-3.0'), ('u', licence)])
            if keywords:
                for keyword in keywords:
                    record_add_field(rec,
                                     '653',
                                     ind1='1',
                                     subfields=[('a', keyword),
                                                ('9', 'author')])

            pages = ''
            if first_page and last_page:
                pages = '{0}-{1}'.format(first_page, last_page)
            elif first_page:
                pages = first_page

            # Drop empty values and the '-' placeholder from the 773 field.
            subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                                ('v', volume),
                                                                ('n', issue),
                                                                ('c', pages),
                                                                ('y', year)])

            record_add_field(rec, '773', subfields=subfields)
            if not no_pdf:
                from invenio.search_engine import perform_request_search
                # Search for an earlier, non-deleted record with this DOI.
                # (The query used to end with a stray double quote.)
                query = '0247_a:"%s" AND NOT 980:DELETED' % (doi, )
                prev_version = perform_request_search(p=query)

                old_pdf = False

                if prev_version:
                    from invenio.bibdocfile import BibRecDocs
                    prev_rec = BibRecDocs(prev_version[0])
                    try:
                        pdf_path = prev_rec.get_bibdoc('main')
                        pdf_path = pdf_path.get_file(".pdf;pdfa",
                                                     exact_docformat=True)
                        pdf_path = pdf_path.fullpath
                        old_pdf = True
                        record_add_field(rec,
                                         'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        message = ('Leaving previously delivered PDF/A for: '
                                   '%s' % (doi, ))
                        self.logger.info(message)
                    except Exception as err:
                        # Best effort: the previous version has no PDF/A
                        # that can be reused, so rely on the delivered
                        # files handled below.
                        self.logger.debug(
                            'No previous PDF/A reused for %s: %s' %
                            (doi, err))
                try:
                    if exists(join(path, 'main_a-2b.pdf')):
                        pdf_path = join(path, 'main_a-2b.pdf')
                        record_add_field(rec,
                                         'FFT',
                                         subfields=[('a', pdf_path),
                                                    ('n', 'main'),
                                                    ('f', '.pdf;pdfa')])
                        self.logger.debug('Adding PDF/A to record: %s' %
                                          (doi, ))
                    elif exists(join(path, 'main.pdf')):
                        pdf_path = join(path, 'main.pdf')
                        record_add_field(rec,
                                         'FFT',
                                         subfields=[('a', pdf_path)])
                    else:
                        if not old_pdf:
                            message = ("Record %s doesn't contain PDF file."
                                       % (doi, ))
                            self.logger.warning(message)
                            # Raised and caught right below; presumably so
                            # that register_exception has an active
                            # exception to report -- TODO confirm.
                            raise MissingFFTError(message)
                except MissingFFTError:
                    message = "Elsevier paper: %s is missing PDF." % (doi, )
                    register_exception(alert_admin=True, prefix=message)
                version = self.get_elsevier_version(find_package_name(path))
                record_add_field(rec, '583', subfields=[('l', version)])
                xml_path = join(path, 'main.xml')
                record_add_field(rec, 'FFT', subfields=[('a', xml_path)])
                record_add_field(rec,
                                 '980',
                                 subfields=[('a', 'SCOAP3'),
                                            ('b', 'Elsevier')])
        try:
            return record_xml_output(rec)
        except UnicodeDecodeError:
            # %-formatting keeps this handler from failing when no DOI
            # was extracted (doi is None).
            message = ("Found a bad char in the file for the article %s" %
                       (doi, ))
            sys.stderr.write(message)
            return ""
Example #10
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        """Build the MARCXML record for the article stored at ``f_path``.

        :param f_path: path to the article XML file. The PDF is looked
                       up in the ``BodyRef/PDF`` folder located in the
                       same directory.
        :type f_path: string
        :param publisher: value stored in subfield ``b`` of field 980.
        :param collection: value stored in subfield ``a`` of field 980.
        :param logger: optional logger; nothing is logged when omitted.

        :returns: the result of ``record_xml_output`` for the record.
        """
        xml = self.get_article(f_path)
        rec = create_record()
        title = self.get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        publication_date = self.get_publication_date(xml)
        if publication_date:
            record_add_field(rec, '260', subfields=[('c', publication_date)])
        journal, issn, volume, issue, first_page, pages, year, doi = self.get_publication_information(xml)
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        arxiv_id = self.get_arxiv_id(xml)
        if arxiv_id:
            record_add_field(rec, '037', subfields=[('a', arxiv_id), ('9', 'arXiv')])
        if logger:
            logger.info("Creating record: %s %s" % (f_path, doi))
        authors = self.get_authors(xml)
        first_author = True
        for author in authors:
            subfields = [('a', '%s, %s' % (author['surname'], author.get('given_name') or author.get('initials')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            # The first author goes to 100, every other one to 700.
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = self.get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-4.0'), ('u', 'http://creativecommons.org/licenses/by/4.0/')])
        record_copyright = self.get_copyright(xml)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = self.get_keywords(xml)
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        record_add_field(rec, "300", subfields=[('a', pages)])
        record_add_field(rec, '773', subfields=[('p', journal), ('v', volume), ('c', first_page), ('y', year)])
        references = self.get_references(xml)
        # The reference fields get their own names so that the article's
        # ``doi`` (used in the log message below) is not overwritten.
        for (ref_label, ref_authors, ref_doi, ref_issue, ref_page, ref_title,
             ref_volume, ref_year) in references:
            subfields = []
            if ref_doi:
                subfields.append(('a', ref_doi))
            for ref_author in ref_authors:
                subfields.append(('h', ref_author))
            if ref_issue:
                subfields.append(('n', ref_issue))
            if ref_label:
                subfields.append(('o', ref_label))
            if ref_page:
                subfields.append(('p', ref_page))
            # The 's' subfield is always added, even with missing parts.
            subfields.append(('s', '%s %s (%s) %s' % (ref_title, ref_volume, ref_year, ref_page)))
            if ref_title:
                subfields.append(('t', ref_title))
            if ref_volume:
                subfields.append(('v', ref_volume))
            if ref_year:
                subfields.append(('y', ref_year))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)

        folder_name = join('/', *(f_path.split('/')[0:-1]))
        base_name = f_path.split('/')[-1]
        # Remove the extension as a real suffix: str.rstrip() takes a set
        # of characters and would also eat trailing letters of the name.
        for suffix in ('.xml.scoap', '.xml'):
            if base_name.endswith(suffix):
                base_name = base_name[:-len(suffix)]
                break
        pdf_name = base_name + '.pdf'
        pdf_path = join(folder_name, 'BodyRef/PDF', pdf_name)
        if logger:
            logger.debug("Looking for PDF file: %s" % (pdf_path,))
        if exists(pdf_path):
            record_add_field(rec, 'FFT', subfields=[('a', pdf_path), ('n', 'main'), ('f', '.pdf;pdfa')])
        else:
            # NOTE(review): a register_exception(alert_admin=True) call was
            # disabled here because it did not work; presumably no
            # exception is active at this point -- confirm.
            if logger:
                logger.error("Record %s doesn't contain PDF file." % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', self.get_body_ref(xml)), ('n', 'main')])
        record_add_field(rec, '980', subfields=[('a', collection), ('b', publisher)])
        return record_xml_output(rec)
Example #11
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        """Build the MARCXML record for the article stored at ``f_path``.

        :param f_path: path to the article XML file. The PDF is looked
                       up in the ``BodyRef/PDF`` folder located in the
                       same directory.
        :type f_path: string
        :param publisher: value stored in subfield ``b`` of field 980.
        :param collection: value stored in subfield ``a`` of field 980.
        :param logger: optional logger; nothing is logged when omitted.

        :returns: the result of ``record_xml_output`` for the record.
        """
        xml = self.get_article(f_path)
        rec = create_record()
        title = self.get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        publication_date = self.get_publication_date(xml)
        if publication_date:
            record_add_field(rec, '260', subfields=[('c', publication_date)])
        journal, issn, volume, issue, first_page, pages, year, doi = self.get_publication_information(xml)
        if doi:
            record_add_field(rec, '024', ind1='7', subfields=[('a', doi), ('2', 'DOI')])
        arxiv_id = self.get_arxiv_id(xml)
        if arxiv_id:
            record_add_field(rec, '037', subfields=[('a', arxiv_id), ('9', 'arXiv')])
        if logger:
            logger.info("Creating record: %s %s" % (f_path, doi))
        authors = self.get_authors(xml)
        first_author = True
        for author in authors:
            subfields = [('a', '%s, %s' % (author['surname'], author.get('given_name') or author.get('initials')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            # The first author goes to 100, every other one to 700.
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = self.get_abstract(xml)
        if abstract:
            record_add_field(rec, '520', subfields=[('a', abstract)])
        record_add_field(rec, '540', subfields=[('a', 'CC-BY-4.0'), ('u', 'http://creativecommons.org/licenses/by/4.0/')])
        record_copyright = self.get_copyright(xml)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = self.get_keywords(xml)
        if keywords:
            for keyword in keywords:
                record_add_field(rec, '653', ind1='1', subfields=[('a', keyword), ('9', 'author')])
        record_add_field(rec, "300", subfields=[('a', pages)])

        # Drop empty values and the '-' placeholder from the 773 field.
        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('c', first_page),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)
        references = self.get_references(xml)
        # The reference fields get their own names so that the article's
        # ``doi`` (used in the log message below) is not overwritten.
        for (ref_label, ref_authors, ref_doi, ref_issue, ref_page, ref_title,
             ref_volume, ref_year) in references:
            subfields = []
            if ref_doi:
                subfields.append(('a', ref_doi))
            for ref_author in ref_authors:
                subfields.append(('h', ref_author))
            if ref_issue:
                subfields.append(('n', ref_issue))
            if ref_label:
                subfields.append(('o', ref_label))
            if ref_page:
                subfields.append(('p', ref_page))
            # The 's' subfield is always added, even with missing parts.
            subfields.append(('s', '%s %s (%s) %s' % (ref_title, ref_volume, ref_year, ref_page)))
            if ref_title:
                subfields.append(('t', ref_title))
            if ref_volume:
                subfields.append(('v', ref_volume))
            if ref_year:
                subfields.append(('y', ref_year))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5', subfields=subfields)

        folder_name = join('/', *(f_path.split('/')[0:-1]))
        base_name = f_path.split('/')[-1]
        # Remove the extension as a real suffix: str.rstrip() takes a set
        # of characters and would also eat trailing letters of the name.
        for suffix in ('.xml.scoap', '.xml'):
            if base_name.endswith(suffix):
                base_name = base_name[:-len(suffix)]
                break
        pdf_name = base_name + '.pdf'
        pdf_path = join(folder_name, 'BodyRef/PDF', pdf_name)
        if logger:
            logger.debug("Looking for PDF file: %s" % (pdf_path,))
        if exists(pdf_path):
            record_add_field(rec, 'FFT', subfields=[('a', pdf_path), ('n', 'main'), ('f', '.pdf;pdfa')])
        else:
            # NOTE(review): a register_exception(alert_admin=True) call was
            # disabled here because it did not work; presumably no
            # exception is active at this point -- confirm.
            if logger:
                logger.error("Record %s doesn't contain PDF file." % (doi,))
        record_add_field(rec, 'FFT', subfields=[('a', self.get_body_ref(xml)), ('n', 'main')])
        record_add_field(rec, '980', subfields=[('a', collection), ('b', publisher)])
        return record_xml_output(rec)
Example #12
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        """Build the MARCXML record for the NLM article at ``f_path``.

        The journal name is always overridden with "PTEP". The plain PDF
        is expected next to the XML file (same name, ``.pdf`` extension)
        and the PDF/A in the sibling ``archival_pdfs`` folder as
        ``<name>-hires.pdf``; a missing file is reported through
        ``register_exception`` and does not abort the conversion.

        :param f_path: path to the article XML file.
        :type f_path: string
        :param publisher: stored in 980 subfield ``b`` and, when an
                          abstract exists, in 520 subfield ``9``.
        :param collection: stored in 980 subfield ``a``.
        :param logger: optional logger; nothing is logged when omitted.

        :returns: the result of ``record_xml_output`` for the record.
        """
        xml = super(NLMParser, self).get_article(f_path)
        rec = create_record()
        title = super(NLMParser, self).get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])
        record_add_field(rec,
                         '260',
                         subfields=[
                             ('c',
                              super(NLMParser,
                                    self).get_publication_date(xml, logger))
                         ])
        journal, issn, volume, issue, first_page, last_page, year, doi = super(
            NLMParser, self).get_publication_information(xml)
        journal = "PTEP"  # Let's override the journal information

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec,
                             '024',
                             ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])
        page_count = super(NLMParser, self).get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        arxiv = self.get_arxiv_id(xml)
        if arxiv:
            record_add_field(rec,
                             '037',
                             subfields=[('9', 'arXiv'),
                                        ('a', format_arxiv_id(arxiv))])
        authors = super(NLMParser, self).get_authors(xml)
        first_author = True
        for author in authors:
            if author.get('surname'):
                subfields = [('a', '%s, %s' %
                              (author.get('surname'), author.get('given_name')
                               or author.get('initials', '')))]
            else:
                # No surname: use the 'name' entry as it is.
                subfields = [('a', '%s' % (author.get('name', '')))]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            # The first author goes to 100, every other one to 700.
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        abstract = super(NLMParser, self).get_abstract(xml)
        if abstract:
            record_add_field(rec,
                             '520',
                             subfields=[('a', abstract), ('9', publisher)])
        record_add_field(rec,
                         '540',
                         subfields=[
                             ('a', 'CC-BY-3.0'),
                             ('u',
                              'http://creativecommons.org/licenses/by/3.0/')
                         ])
        record_copyright = super(NLMParser, self).get_copyright(xml, logger)
        if record_copyright:
            record_add_field(rec, '542', subfields=[('f', record_copyright)])
        keywords = super(NLMParser, self).get_keywords(xml)
        if keywords['pacs']:
            for keyword in keywords['pacs']:
                record_add_field(rec,
                                 '084',
                                 ind1='1',
                                 subfields=[('a', keyword), ('9', 'PACS')])

        # Oxford is giving us bad keywords, so keywords['other'] is
        # deliberately not exported as 653 author keywords.
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            # No page range: fall back to the article's elocation-id.
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        # Drop empty values and the '-' placeholder from the 773 field.
        subfields = filter(lambda x: x[1] and x[1] != '-', [('p', journal),
                                                            ('v', volume),
                                                            ('n', issue),
                                                            ('c', pages),
                                                            ('y', year)])
        record_add_field(rec, '773', subfields=subfields)

        self.get_references(xml)
        # The reference fields get their own names so that the article's
        # ``doi`` (used in the alerts below) is not overwritten.
        for (ref_label, ref_authors, ref_doi, ref_issue, ref_page,
             ref_page_last, ref_title, ref_volume, ref_year, ref_ext_link,
             ref_plain_text) in self.references:
            subfields = []
            if ref_doi:
                subfields.append(('a', ref_doi))
            for ref_author in ref_authors:
                subfields.append(('h', ref_author))
            if ref_issue:
                subfields.append(('n', ref_issue))
            if ref_label:
                subfields.append(('o', ref_label))
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_ext_link:
                subfields.append(('r', ref_ext_link))
            # should we be strict about it?
            if ref_title and ref_volume and ref_year and ref_page:
                subfields.append(
                    ('s', '%s %s (%s) %s' %
                     (ref_title, ref_volume, ref_year, ref_page)))
            elif not ref_plain_text:
                subfields.append(
                    ('m', ('%s %s %s %s' %
                           (ref_title, ref_volume, ref_year, ref_page))))
            if ref_plain_text:
                subfields.append(('m', ref_plain_text))
            if subfields:
                record_add_field(rec,
                                 '999',
                                 ind1='C',
                                 ind2='5',
                                 subfields=subfields)
        f_path_pdf = f_path[:-(len('.xml'))] + '.pdf'
        f_path_pdfa = join(dirname(f_path), 'archival_pdfs',
                           basename(f_path)[:-len('.xml')] + '-hires.pdf')
        if exists(f_path_pdf):
            record_add_field(rec,
                             'FFT',
                             subfields=[('a', f_path_pdf), ('n', 'main')])
        else:
            # Raised and caught right away; presumably so that
            # register_exception has an active exception to report
            # -- TODO confirm.
            try:
                raise MissingFFTError
            except MissingFFTError:
                register_exception(alert_admin=True,
                                   prefix="Oxford paper: %s is missing PDF." %
                                   (doi, ))
                if logger:
                    logger.warning("Record %s doesn't contain PDF file." %
                                   (doi, ))
        if exists(f_path_pdfa):
            record_add_field(rec,
                             'FFT',
                             subfields=[('a', f_path_pdfa), ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
        else:
            try:
                raise MissingFFTError
            except MissingFFTError:
                register_exception(
                    alert_admin=True,
                    prefix="Oxford paper: %s is missing PDF/A." % (doi, ))
                if logger:
                    logger.warning("Record %s doesn't contain PDF/A file." %
                                   (doi, ))
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)
Example #13
0
def check_records(records, empty=False):
    """Rebuild the author fields of Elsevier records from their stored files.

    For every record accepted by ``is_elsevier``, each document id returned
    by ``get_doc_ids`` is searched for in every sub-folder of the file store.
    The latest file found is parsed, the record's existing ``100``/``700``
    fields are deleted and new ones are added from the parsed authors
    (first author as ``100``, the others as ``700``).

    :param records: iterable of record objects providing ``record_id``,
        ``warn()`` and ``add_field()``.
    :param empty: unused; kept so that existing callers keep working.
    """
    fields = ['100', '700']
    filepath = '/opt/invenio/var/data/files/'
    filepaths = os.listdir(filepath)

    for record in records:
        if not is_elsevier(record):
            continue

        for doc_id in get_doc_ids(int(record.record_id)):
            # The document may live in any sub-folder of the file store;
            # stop at the first folder that yields a file.
            latest_file = None
            for folder in filepaths:
                try:
                    # The trailing '' keeps the trailing separator that the
                    # directory path has always been passed with.
                    latest_file = get_latest_file(
                        os.path.join(filepath, folder, str(doc_id), ''))
                except Exception:
                    print("No folder with name %s in %s directory" % (
                        doc_id, folder))
                    continue
                if latest_file:
                    break

            try:
                xml = parse(latest_file)
            except Exception:
                # Also reached when no file was found (latest_file is None).
                record.warn("Problem parssing XML file. Aborting")
                break
            authors = get_authors(xml)

            delete_fields(record, fields)
            # All author fields were just removed, so the next author added
            # must be the main entry (100) again, also for the second and
            # later documents of the same record.
            first_author = True

            for author in authors:
                field = '100' if first_author else '700'
                first_author = False

                full_name = '%s, %s' % (
                    author['surname'],
                    author.get('given_name') or author.get('initials'))
                subfields = [('a', full_name)]

                if 'orcid' in author:
                    subfields.append(('j', author['orcid']))

                if 'affiliation' in author:
                    for aff in author["affiliation"]:
                        subfields.append(('v', aff))

                    add_nations_field(subfields)

                if author.get('email'):
                    subfields.append(('m', author['email']))

                record.add_field(field + '__',
                                 value='',
                                 subfields=subfields)
Example #14
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        """Build a MARCXML record from the article XML stored at ``f_path``.

        The article is loaded with ``self.get_article`` and its title,
        publication date, identifiers, authors, abstract, license,
        copyright, keywords, journal information and references are added
        as MARC fields. FFT fields are added for the PDF expected under
        ``BodyRef/PDF`` next to the XML (when it exists) and for the value
        of ``self.get_body_ref``.

        :param f_path: '/'-separated path of the article XML file.
        :param publisher: value for 980__b; omitted when empty.
        :param collection: value for 980__a; omitted when empty.
        :param logger: optional logger for progress and missing-PDF messages.
        :return: the output of ``record_xml_output`` for the built record.
        """
        xml = self.get_article(f_path)
        rec = create_record()
        title = self.get_title(xml)
        if title:
            record_add_field(rec, "245", subfields=[("a", title)])
        publication_date = self.get_publication_date(xml)
        if publication_date:
            record_add_field(rec, "260", subfields=[("c", publication_date)])
        (journal, issn, volume, issue, first_page, pages, year,
         doi) = self.get_publication_information(xml)
        if doi:
            record_add_field(rec, "024", ind1="7",
                             subfields=[("a", doi), ("2", "DOI")])
        arxiv_id = self.get_arxiv_id(xml)
        if arxiv_id:
            record_add_field(rec, "037",
                             subfields=[("a", arxiv_id), ("9", "arXiv")])
        if logger:
            logger.info("Creating record: %s %s" % (f_path, doi))

        first_author = True
        for author in self.get_authors(xml):
            subfields = [("a", "%s, %s" % (
                author["surname"],
                author.get("given_name") or author.get("initials")))]
            if "orcid" in author:
                subfields.append(("j", author["orcid"]))
            if "affiliation" in author:
                for aff in author["affiliation"]:
                    subfields.append(("v", aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if first_author:
                record_add_field(rec, "100", subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, "700", subfields=subfields)

        abstract = self.get_abstract(xml)
        if abstract:
            record_add_field(rec, "520", subfields=[("a", abstract)])
        record_add_field(
            rec, "540",
            subfields=[("a", "CC-BY-4.0"),
                       ("u", "http://creativecommons.org/licenses/by/4.0/")])
        copyright_statement = self.get_copyright(xml)
        if copyright_statement:
            record_add_field(rec, "542",
                             subfields=[("f", copyright_statement)])
        for keyword in self.get_keywords(xml) or []:
            record_add_field(rec, "653", ind1="1",
                             subfields=[("a", keyword), ("9", "author")])
        record_add_field(rec, "300", subfields=[("a", pages)])

        # Skip journal subfields that are empty or just a "-" placeholder.
        journal_info = [("p", journal), ("v", volume),
                        ("c", first_page), ("y", year)]
        subfields = [(code, value) for code, value in journal_info
                     if value and value != "-"]
        record_add_field(rec, "773", subfields=subfields)

        # The loop variables carry a ref_ prefix so that they do not
        # overwrite the article's own doi/volume/year, which are still
        # needed after the loop.
        for (ref_label, ref_authors, ref_doi, ref_issue, ref_page, ref_title,
             ref_volume, ref_year) in self.get_references(xml):
            subfields = []
            if ref_doi:
                subfields.append(("a", ref_doi))
            for ref_author in ref_authors:
                subfields.append(("h", ref_author))
            if ref_issue:
                subfields.append(("n", ref_issue))
            if ref_label:
                subfields.append(("o", ref_label))
            if ref_page:
                subfields.append(("p", ref_page))
            # The "s" subfield is always emitted, so every reference
            # produces a 999C5 field.
            subfields.append(("s", "%s %s (%s) %s" % (
                ref_title, ref_volume, ref_year, ref_page)))
            if ref_title:
                subfields.append(("t", ref_title))
            if ref_volume:
                subfields.append(("v", ref_volume))
            if ref_year:
                subfields.append(("y", ref_year))
            record_add_field(rec, "999", ind1="C", ind2="5",
                             subfields=subfields)

        path_parts = f_path.split("/")
        folder_name = join("/", *path_parts[:-1])
        # Remove the file-name suffixes explicitly: str.rstrip() strips a
        # set of characters, so rstrip(".xml.scoap") would also eat trailing
        # letters of the real name (e.g. "maps.xml.scoap" -> "").
        pdf_stem = path_parts[-1]
        for suffix in (".scoap", ".xml"):
            if pdf_stem.endswith(suffix):
                pdf_stem = pdf_stem[:-len(suffix)]
        pdf_path = join(folder_name, "BodyRef/PDF", pdf_stem + ".pdf")
        print(pdf_path)
        if exists(pdf_path):
            record_add_field(rec, "FFT",
                             subfields=[("a", pdf_path), ("n", "main"),
                                        ("f", ".pdf;pdfa")])
        else:
            # NOTE: register_exception(alert_admin=True) was disabled here by
            # the original author because it did not work at this point;
            # only the log entry is produced.
            if logger:
                logger.error("Record %s doesn't contain PDF file." % (doi,))
        record_add_field(rec, "FFT",
                         subfields=[("a", self.get_body_ref(xml)),
                                    ("n", "main")])
        # Only emit the 980 subfields that were actually supplied, instead
        # of writing None values when the defaults are used.
        extra_subfields = []
        if collection:
            extra_subfields.append(("a", collection))
        if publisher:
            extra_subfields.append(("b", publisher))
        record_add_field(rec, "980", subfields=extra_subfields)
        return record_xml_output(rec)
Example #15
0
    def get_record(self, f_path, publisher=None, collection=None, logger=None):
        """Build a MARCXML record from the article XML stored at ``f_path``.

        The article is loaded with ``self.get_article`` and its title,
        publication date, DOI, authors, page count, abstract, license,
        copyright, keywords, journal information and references are added
        as MARC fields. FFT fields are added for the PDF expected under
        ``BodyRef/PDF`` next to the XML (when it can be opened) and for the
        XML file itself.

        :param f_path: path of the article XML file; the PDF name is derived
            by dropping the last ``len('_nlm.xml')`` characters of its
            base name.
        :param publisher: value for 980__b and 520__9.
        :param collection: value for 980__a.
        :param logger: optional logger; also passed on to
            ``get_publication_date`` and ``get_copyright``.
        :return: the output of ``record_xml_output`` for the built record.
        """
        xml = self.get_article(f_path)
        rec = create_record()
        title = self.get_title(xml)
        if title:
            record_add_field(rec, '245', subfields=[('a', title)])

        record_add_field(
            rec, '260',
            subfields=[('c', self.get_publication_date(xml, logger))])
        (journal, issn, volume, issue, first_page, last_page, year,
         doi) = self.get_publication_information(xml)

        if logger:
            logger.info("Creating record: %s %s" % (join(f_path, pardir), doi))

        if doi:
            record_add_field(rec, '024', ind1='7',
                             subfields=[('a', doi), ('2', 'DOI')])

        first_author = True
        for author in self.get_authors(xml):
            if author.get('surname'):
                full_name = '%s, %s' % (
                    author.get('surname'),
                    author.get('given_name') or author.get('initials', ''))
            else:
                # No surname available: use the plain name.
                full_name = '%s' % (author.get('name', ''))
            subfields = [('a', full_name)]
            if 'orcid' in author:
                subfields.append(('j', author['orcid']))
            if 'affiliation' in author:
                for aff in author["affiliation"]:
                    subfields.append(('v', aff))

                if self.extract_nations:
                    add_nations_field(subfields)

            if author.get('email'):
                subfields.append(('m', author['email']))
            if first_author:
                record_add_field(rec, '100', subfields=subfields)
                first_author = False
            else:
                record_add_field(rec, '700', subfields=subfields)

        page_count = self.get_page_count(xml)
        if page_count:
            record_add_field(rec, '300', subfields=[('a', page_count)])
        abstract = self.get_abstract(xml)
        if abstract:
            record_add_field(rec, '520',
                             subfields=[('a', abstract), ('9', publisher)])
        record_add_field(
            rec, '540',
            subfields=[('a', 'CC-BY-3.0'),
                       ('u', 'http://creativecommons.org/licenses/by/3.0/')])
        copyright_statement = self.get_copyright(xml, logger)
        if copyright_statement:
            record_add_field(rec, '542',
                             subfields=[('f', copyright_statement)])
        keywords = self.get_keywords(xml)
        for keyword in keywords['pacs'] or []:
            record_add_field(rec, '084', ind1='1',
                             subfields=[('a', keyword), ('9', 'PACS')])
        for keyword in keywords['other'] or []:
            record_add_field(rec, '653', ind1='1',
                             subfields=[('a', keyword), ('9', 'author')])
        if first_page or last_page:
            pages = '%s-%s' % (first_page, last_page)
        else:
            # No page range: fall back to the electronic location id.
            article_meta = xml.getElementsByTagName('article-meta')[0]
            pages = get_value_in_tag(article_meta, "elocation-id")

        # Skip journal subfields that are empty or just a "-" placeholder.
        journal_info = [('p', journal), ('v', volume), ('n', issue),
                        ('c', pages), ('y', year)]
        subfields = [(code, value) for code, value in journal_info
                     if value and value != '-']
        record_add_field(rec, '773', subfields=subfields)

        # get_references() fills self.references. The loop variables carry a
        # ref_ prefix so that they do not overwrite the article's own doi,
        # which is still needed for the log message below.
        self.get_references(xml)
        for (ref_label, ref_authors, ref_doi, ref_issue, ref_page,
             ref_page_last, ref_title, ref_volume, ref_year, ref_ext_link,
             ref_plain_text) in self.references:
            subfields = []
            if ref_doi:
                subfields.append(('a', ref_doi))
            for ref_author in ref_authors:
                subfields.append(('h', ref_author))
            if ref_issue:
                subfields.append(('n', ref_issue))
            if ref_label:
                subfields.append(('o', ref_label))
            if ref_year:
                subfields.append(('y', ref_year))
            if ref_ext_link:
                subfields.append(('r', ref_ext_link))
            # should we be strict about it?
            if ref_title and ref_volume and ref_year and ref_page:
                subfields.append(('s', '%s %s (%s) %s' % (
                    ref_title, ref_volume, ref_year, ref_page)))
            elif not ref_plain_text:
                subfields.append(('m', ('%s %s %s %s' % (
                    ref_title, ref_volume, ref_year, ref_page))))
            if ref_plain_text:
                subfields.append(('m', ref_plain_text))
            if subfields:
                record_add_field(rec, '999', ind1='C', ind2='5',
                                 subfields=subfields)

        pdf_path = join(dirname(f_path), 'BodyRef', 'PDF',
                        basename(f_path)[:-len('_nlm.xml')] + '.pdf')
        try:
            # Probe that the PDF exists and is readable; the handle is
            # closed right away.
            with open(pdf_path):
                pass
        except (IOError, OSError):
            register_exception(alert_admin=True)
            # logger is optional, so it must be checked before use.
            if logger:
                logger.error("No PDF for paper: %s" % (doi,))
        else:
            record_add_field(rec, 'FFT',
                             subfields=[('a', pdf_path), ('n', 'main'),
                                        ('f', '.pdf;pdfa')])
        record_add_field(rec, 'FFT', subfields=[('a', f_path), ('n', 'main')])
        extra_subfields = []
        if collection:
            extra_subfields.append(('a', collection))
        if publisher:
            extra_subfields.append(('b', publisher))
        record_add_field(rec, '980', subfields=extra_subfields)
        return record_xml_output(rec)