Esempio n. 1
0
def populate_arxiv_document(obj, eng):
    """Register the arXiv PDF of the record as a hidden fulltext document.

    Documents already present in the record whose ``key`` equals the PDF
    filename are dropped before the new entry is added, and the record built
    by ``LiteratureBuilder`` replaces ``obj.data``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    :raises DownloadError: when the URL is not a PDF link and the fetched
        content does not contain ``NO_PDF_ON_ARXIV``
    """
    arxiv_id = get_arxiv_id(obj.data)
    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(pdf_url):
        page_content = requests.get(pdf_url).content
        if NO_PDF_ON_ARXIV not in page_content:
            raise DownloadError(
                "{url} is not serving a PDF file.".format(url=pdf_url))
        # arXiv explicitly reports that no PDF exists: nothing to attach.
        obj.log.info('No PDF is available for %s', arxiv_id)
        return

    document_key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Keep every document except a previous entry for the same PDF.
    kept_documents = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != document_key:
            kept_documents.append(document)
    obj.data['documents'] = kept_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        document_key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=pdf_url,
    )
    obj.data = builder.record
Esempio n. 2
0
def test_get_arxiv_id_returns_empty_string_when_no_arxiv_eprints():
    """A record without an ``arxiv_eprints`` key yields an empty string."""
    record_without_eprints = {}

    assert '' == get_arxiv_id(record_without_eprints)
Esempio n. 3
0
def test_get_arxiv_id_returns_empty_string_when_arxiv_eprints_is_empty():
    """A record with an empty ``arxiv_eprints`` list yields an empty string."""
    record_with_empty_eprints = {'arxiv_eprints': []}

    assert '' == get_arxiv_id(record_with_empty_eprints)
Esempio n. 4
0
def is_arxiv_paper(obj, *args, **kwargs):
    """Tell whether the record of the workflow object comes from arXiv.

    The record qualifies when it has an arXiv identifier or a truthy value
    under ``arxiv_eprints.categories``.

    :param obj: Workflow Object whose ``data`` is inspected
    :return: ``True`` or ``False``
    """
    found_id = get_arxiv_id(obj.data)
    found_categories = get_value(obj.data, 'arxiv_eprints.categories')

    return bool(found_id or found_categories)
Esempio n. 5
0
def is_arxiv_paper(obj, *args, **kwargs):
    """Return ``True`` when the record carries arXiv information.

    Either an arXiv identifier or a truthy ``arxiv_eprints.categories``
    value is enough; otherwise ``False`` is returned.

    :param obj: Workflow Object whose ``data`` is inspected
    """
    identifier = get_arxiv_id(obj.data)
    eprint_categories = get_value(obj.data, 'arxiv_eprints.categories')

    return True if (identifier or eprint_categories) else False
Esempio n. 6
0
def match_by_arxiv_id(record):
    """Search for matches of the record using its arXiv identifier.

    :param record: record to take the arXiv identifier from
    :return: the result of ``search`` for the ``035__a`` OAI query, or an
        empty list when the record has no arXiv identifier
    """
    arxiv_id = get_arxiv_id(record)
    if not arxiv_id:
        return []

    return search('035__a:oai:arXiv.org:{0}'.format(arxiv_id))
Esempio n. 7
0
def already_harvested(obj, eng):
    """Report whether the record is already being harvested on Legacy.

    A log entry naming the arXiv identifier is written when it is.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    :return: ``True`` when ``is_being_harvested_on_legacy`` says so,
        ``False`` otherwise
    """
    if not is_being_harvested_on_legacy(obj.data):
        return False

    template = ('Record with arXiv id {arxiv_id} is'
                ' already being harvested on Legacy.')
    obj.log.info(template.format(arxiv_id=get_arxiv_id(obj.data)))
    return True
Esempio n. 8
0
def match_by_arxiv_id(record):
    """Search for matches of the record using its arXiv identifier.

    :param record: record to take the arXiv identifier from
    :return: the result of ``search`` for the quoted ``035`` query, or an
        empty list when the record has no arXiv identifier
    """
    arxiv_id = get_arxiv_id(record)
    if not arxiv_id:
        return []

    return search('035:"{0}"'.format(arxiv_id))
Esempio n. 9
0
def already_harvested(obj, eng):
    """Tell whether Legacy is already harvesting this record.

    Logs the arXiv identifier of the record when that is the case.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    :return: ``True`` if the record is being harvested on Legacy, else
        ``False``
    """
    harvested = is_being_harvested_on_legacy(obj.data)
    if harvested:
        message = (
            'Record with arXiv id {arxiv_id} is'
            ' already being harvested on Legacy.'
        )
        obj.log.info(message.format(arxiv_id=get_arxiv_id(obj.data)))
        return True

    return False
Esempio n. 10
0
    def _author_list(obj, eng):
        """Replace the record's authors with those found in the arXiv tarball.

        The ``<arxiv_id>.tar.gz`` file attached to the workflow object is
        unpacked into a temporary directory. Every extracted ``.xml`` file
        matching ``REGEXP_AUTHLIST`` is converted with ``stylesheet`` (taken
        from the enclosing scope) and its authors are collected. When at
        least one author was found, ``obj.data['authors']`` is overwritten
        with the collected list, after decoding LaTeX in each ``full_name``.

        Nothing is changed when the tarball is missing or invalid.

        :param obj: Workflow Object to process
        :param eng: Workflow Engine processing the object (not used here)
        """
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            # Pass the filename as a logger argument, like the other log
            # calls of this function, instead of formatting it eagerly.
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found',
                filename,
            )
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_files_list = [
                path for path in file_list if path.endswith('.xml')
            ]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            # Authors are accumulated over all matching XML files.
            extracted_authors = []
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )

                    extracted_authors.extend(
                        marcxml2record(authors_xml).get('authors', []))

            # Only touch the record when something was actually extracted.
            if extracted_authors:
                for author in extracted_authors:
                    author['full_name'] = decode_latex(author['full_name'])

                obj.data['authors'] = extracted_authors
Esempio n. 11
0
def test_get_arxiv_id_returns_first_arxiv_identifier():
    """With several arXiv eprints, the value of the first one is returned."""
    identifiers = ['first arXiv identifier', 'second arXiv identifier']
    record = {
        'arxiv_eprints': [{'value': identifier} for identifier in identifiers],
    }

    assert identifiers[0] == get_arxiv_id(record)
Esempio n. 12
0
def test_get_arxiv_id_returns_first_arxiv_identifier():
    """Only the first entry of ``arxiv_eprints`` determines the result."""
    first_eprint = {'value': 'first arXiv identifier'}
    second_eprint = {'value': 'second arXiv identifier'}
    record = {'arxiv_eprints': [first_eprint, second_eprint]}

    assert 'first arXiv identifier' == get_arxiv_id(record)
Esempio n. 13
0
    def _author_list(obj, eng):
        """Replace the record's authors with those found in the arXiv tarball.

        The ``<arxiv_id>.tar.gz`` file attached to the workflow object is
        unpacked into a temporary directory. Every extracted ``.xml`` file
        matching ``REGEXP_AUTHLIST`` is converted with ``stylesheet`` (taken
        from the enclosing scope) and its authors are collected. When at
        least one author was found, ``obj.data['authors']`` is overwritten
        with the collected list.

        Nothing is changed when the tarball is missing or invalid.

        :param obj: Workflow Object to process
        :param eng: Workflow Engine processing the object (not used here)
        """
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            # A missing key means there is no tarball: skip instead of
            # letting the lookup error abort the task.
            tarball = None

        if not tarball:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found',
                filename,
            )
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_files_list = [path for path in file_list if path.endswith('.xml')]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            # Authors are accumulated over all matching XML files.
            extracted_authors = []
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )

                    extracted_authors.extend(marcxml2record(authors_xml).get('authors', []))

            # Only touch the record when something was actually extracted.
            if extracted_authors:
                obj.data['authors'] = extracted_authors
Esempio n. 14
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The ``<arxiv_id>.tar.gz`` file attached to the workflow object is
    processed in a temporary directory. Each extracted plot is stored in
    ``obj.files`` and added as a figure to the record, which then replaces
    ``obj.data``. Nothing is changed when the tarball is missing, invalid,
    contains no TeX files, or plot extraction fails.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        # A missing key means there is no tarball: skip instead of letting
        # the lookup error abort the task.
        tarball = None

    if not tarball:
        obj.log.info(
            'Skipping plot extraction, no tarball with name "%s" found',
            filename,
        )
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space:
        try:
            plots = process_tarball(tarball.file.uri,
                                    output_directory=scratch_space)
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info('Invalid tarball %s for arxiv_id %s',
                         tarball.file.uri, arxiv_id)
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id)
            current_app.logger.exception(err)
            return

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_name = os.path.basename(plot.get('url'))
            key = plot_name
            # Prefix the key with the plot index when the bare file name is
            # already taken in the workflow files.
            if plot_name in obj.files.keys:
                key = '{number}_{name}'.format(number=index,
                                               name=plot_name)

            with open(plot.get('url')) as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(key=key,
                          caption=''.join(plot.get('captions', [])),
                          label=plot.get('label'),
                          material='preprint',
                          url='/api/files/{bucket}/{key}'.format(
                              bucket=obj.files[key].bucket_id, key=key))

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))
Esempio n. 15
0
    def _author_list(obj, eng):
        """Update the record from an author list found in the arXiv tarball.

        The ``<arxiv_id>.tar.gz`` file attached to the workflow object is
        unpacked into a temporary directory. The first extracted ``.xml``
        file matching ``REGEXP_AUTHLIST`` is converted with ``stylesheet``
        (taken from the enclosing scope) and the resulting record is merged
        into ``obj.data`` with ``update``.

        Nothing is changed when the tarball is missing or invalid.

        :param obj: Workflow Object to process
        :param eng: Workflow Engine processing the object (not used here)
        """
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            # A missing key means there is no tarball: skip instead of
            # letting the lookup error abort the task.
            tarball = None

        if not tarball:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found',
                filename,
            )
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space:
            tarball_file = retrieve_uri(
                tarball.file.uri,
                outdir=scratch_space,
            )
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

            xml_files_list = [
                path for path in file_list if path.endswith('.xml')
            ]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )
                    authorlist_record = marcxml2record(authors_xml)
                    obj.data.update(authorlist_record)
                    # Only the first matching file is used.
                    break
Esempio n. 16
0
def arxiv_package_download(obj, eng):
    """Download the arXiv tarball of the record into the workflow object.

    The outcome is only logged: an info entry on success, an error entry
    when ``download_file_to_workflow`` returns a falsy value.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball_url = current_app.config['ARXIV_TARBALL_URL'].format(
        arxiv_id=arxiv_id)

    downloaded = download_file_to_workflow(
        workflow=obj, name=tarball_name, url=tarball_url)

    if not downloaded:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
        return

    obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)
Esempio n. 17
0
def arxiv_package_download(obj, eng):
    """Fetch the arXiv source package and attach it to the workflow object.

    The file is stored under ``<arxiv_id>.tar.gz``; success or failure of
    the download is reported through ``obj.log``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    package_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
    url_template = current_app.config['ARXIV_TARBALL_URL']

    package = download_file_to_workflow(
        workflow=obj,
        name=package_name,
        url=url_template.format(arxiv_id=arxiv_id),
    )

    if not package:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
    else:
        obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)
Esempio n. 18
0
def test_get_arxiv_id():
    """``get_arxiv_id`` returns the value of a schema-valid arXiv eprint."""
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-th', 'hep-ph'],
                'value': '1612.08928',
            },
        ],
    }
    # Make sure the fixture itself is valid against the schema.
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    assert '1612.08928' == get_arxiv_id(record)
Esempio n. 19
0
def _get_preprint_context(record):
    """Collect the context values describing a preprint record.

    :param record: record the values are read from
    :return: dict holding the abstract, its detected language (an empty
        string when detection raises ``LangDetectException``) and the
        values returned by the other ``get_*`` helpers
    """
    abstract = get_abstract(record)

    # Default to "unknown" and only overwrite it when detection succeeds.
    abstract_language = ''
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        pass

    context = {
        'abstract': abstract,
        'abstract_language': abstract_language,
    }

    # The remaining entries all come from a helper applied to the record.
    getters = (
        ('arxiv_id', get_arxiv_id),
        ('authors', get_authors),
        ('collaborations', get_collaborations),
        ('divulgation', get_divulgation),
        ('domains', get_domains),
        ('inspire_id', get_inspire_id),
        ('keywords', get_keywords),
        ('language', get_language),
        ('subtitle', get_subtitle),
        ('title', get_title),
    )
    for key, getter in getters:
        context[key] = getter(record)

    return context
Esempio n. 20
0
def _get_comm_context(record):
    """Collect the context values describing a conference communication.

    :param record: record the values are read from
    :return: dict holding the abstract, its detected language (an empty
        string when detection raises ``LangDetectException``), the details
        of the conference record and the values returned by the other
        ``get_*`` helpers
    """
    abstract = get_abstract(record)

    # Default to "unknown" and only overwrite it when detection succeeds.
    detected_language = ''
    try:
        detected_language = detect(abstract)
    except LangDetectException:
        pass

    # All conference details are read from the related conference record.
    conference = get_conference_record(record)
    city = get_conference_city(conference)
    country = get_conference_country(conference)
    end_date = get_conference_end_date(conference)
    start_date = get_conference_start_date(conference)
    conference_name = get_conference_title(conference)

    return {
        'abstract': abstract,
        'abstract_language': detected_language,
        'arxiv_id': get_arxiv_id(record),
        'authors': get_authors(record),
        'collaborations': get_collaborations(record),
        'conference_city': city,
        'conference_country': country,
        'conference_end_date': end_date,
        'conference_start_date': start_date,
        'conference_title': conference_name,
        'divulgation': get_divulgation(record),
        'doi': get_doi(record),
        'domains': get_domains(record),
        'inspire_id': get_inspire_id(record),
        'journal_issue': get_journal_issue(record),
        'journal_title': get_journal_title(record),
        'journal_volume': get_journal_volume(record),
        'keywords': get_keywords(record),
        'language': get_language(record),
        'page_artid': get_page_artid(record),
        'peer_reviewed': get_peer_reviewed(record),
        'publication_date': get_publication_date(record),
        'subtitle': get_subtitle(record),
        'title': get_title(record),
    }
Esempio n. 21
0
def arxiv_fulltext_download(obj, eng):
    """Download the arXiv PDF and register it as a document of the record.

    On a successful download, documents whose ``key`` equals the PDF
    filename are dropped, the new hidden fulltext document is added and the
    resulting record replaces ``obj.data``. A failed download is only
    logged.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    :raises DownloadError: when the URL is not a PDF link and the fetched
        content does not contain ``NO_PDF_ON_ARXIV``
    """
    arxiv_id = get_arxiv_id(obj.data)
    pdf_name = secure_filename('{0}.pdf'.format(arxiv_id))
    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(pdf_url):
        page_content = requests.get(pdf_url).content
        if NO_PDF_ON_ARXIV not in page_content:
            raise DownloadError(
                "{url} is not serving a PDF file.".format(url=pdf_url))
        # arXiv explicitly reports that no PDF exists: nothing to download.
        obj.log.info('No PDF is available for %s', arxiv_id)
        return

    downloaded_pdf = download_file_to_workflow(
        workflow=obj, name=pdf_name, url=pdf_url)

    if not downloaded_pdf:
        obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)
        return

    # Keep every document except a previous entry for the same PDF.
    kept_documents = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != pdf_name:
            kept_documents.append(document)
    obj.data['documents'] = kept_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    file_api_url = '/api/files/{bucket}/{key}'.format(
        bucket=obj.files[pdf_name].bucket_id,
        key=pdf_name,
    )
    builder.add_document(
        pdf_name,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=file_api_url,
    )
    obj.data = builder.record
    obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)
Esempio n. 22
0
    def _author_list(obj, eng):
        """Update the record from an author list found in the arXiv tarball.

        The ``<arxiv_id>.tar.gz`` file attached to the workflow object is
        unpacked into a ``<tarball uri>_files`` directory. The first
        extracted ``.xml`` file matching ``REGEXP_AUTHLIST`` is converted
        with ``stylesheet`` (taken from the enclosing scope), turned into a
        record with ``create_record`` and ``hep.do``, and merged into
        ``obj.data`` with ``update``.

        Nothing is changed when the tarball is missing or invalid.

        :param obj: Workflow Object to process
        :param eng: Workflow Engine processing the object (not used here)
        """
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            # A missing key means there is no tarball: skip instead of
            # letting the lookup error abort the task.
            tarball = None

        if not tarball:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found',
                filename,
            )
            return

        sub_dir = os.path.abspath('{0}_files'.format(tarball.file.uri))
        try:
            file_list = untar(tarball.file.uri, sub_dir)
        except InvalidTarball:
            obj.log.error('Invalid tarball %s for arxiv_id %s', tarball.file.uri, arxiv_id)
            return
        obj.log.info('Extracted tarball to: {0}'.format(sub_dir))

        xml_files_list = [path for path in file_list
                          if path.endswith('.xml')]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                authors_rec = create_record(authors_xml)
                authorlist_record = hep.do(authors_rec)
                obj.data.update(authorlist_record)
                # Only the first matching file is used.
                break
Esempio n. 23
0
def _get_art_context(record):
    """Collect the context values describing an article record.

    :param record: record the values are read from
    :return: dict holding the abstract, its detected language (an empty
        string when language detection fails) and the values returned by
        the other ``get_*`` helpers
    """
    abstract = get_abstract(record)
    try:
        abstract_language = langdetect.detect(abstract)
    except langdetect.lang_detect_exception.LangDetectException:
        # Detection can fail on text it cannot classify; fall back to an
        # empty language code rather than aborting the whole context.
        abstract_language = ''

    return {
        'abstract': abstract,
        'abstract_language': abstract_language,
        'arxiv_id': get_arxiv_id(record),
        'authors': get_authors(record),
        'collaborations': get_collaborations(record),
        'divulgation': get_divulgation(record),
        'doi': get_doi(record),
        'domain': get_domain(record),
        'inspire_id': get_inspire_id(record),
        'journal_issue': get_journal_issue(record),
        'journal_title': get_journal_title(record),
        'journal_volume': get_journal_volume(record),
        'language': get_language(record),
        'page_artid': get_page_artid(record),
        'peer_reviewed': get_peer_reviewed(record),
        'publication_date': get_publication_date(record),
        'title': get_title(record),
    }
Esempio n. 24
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The ``<arxiv_id>.tar.gz`` file attached to the workflow object is
    retrieved and processed in a temporary directory. Figures already
    listed in the record are removed (together with their files) before
    each extracted plot is stored in ``obj.files`` and added as a figure;
    the resulting record then replaces ``obj.data``. Nothing is changed
    when the tarball is missing, invalid, contains no TeX files, or plot
    extraction fails.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        # A missing key means there is no tarball: skip instead of letting
        # the lookup error abort the task.
        tarball = None

    if not tarball:
        obj.log.info(
            'Skipping plot extraction, no tarball with name "%s" found',
            filename,
        )
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        # Drop figures from a previous run so they are not duplicated.
        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_name = os.path.basename(plot.get('url'))
            key = plot_name
            # Build an indexed key when the bare file name is already taken
            # in the workflow files.
            if plot_name in obj.files.keys:
                key = 'w{number}_{name}'.format(
                    number=index,
                    name=plot_name,
                )
            with open(plot.get('url')) as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[key].bucket_id,
                    key=key,
                )
            )

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))