Example #1
0
def test_arxiv_plot_extract_logs_when_images_are_invalid(mock_process_tarball):
    """Check that ``arxiv_plot_extract`` logs an error naming the eprint.

    The injected ``mock_process_tarball`` is configured to raise
    ``DelegateError``. The task must still return ``None`` and the arXiv
    identifier must appear in the object's error log.
    """
    mock_process_tarball.side_effect = DelegateError

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1612.00624'))

    # Synthetic data, not copied from an existing record.
    record = {
        'arxiv_eprints': [
            {'categories': ['physics.ins-det'], 'value': '1612.00624'},
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    attached = MockFiles({
        '1612.00624.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    assert arxiv_plot_extract(obj, eng) is None

    error_log = obj.log._error.getvalue()
    assert '1612.00624' in error_log
def test_populate_submission_document_without_pdf():
    """Check that no document is recorded when the submission is not a PDF.

    The mocked GET on the submission URL serves the ``1707.02785.html``
    fixture; ``populate_submission_document`` must return ``None`` and the
    record must end up with zero ``documents``.
    """
    with requests_mock.Mocker() as requests_mocker:
        html_payload = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1707.02785.html'))
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1707.02785',
            content=html_payload,
        )

        source_schema = load_schema('hep')['properties']['acquisition_source']
        acquisition_source = {
            'datetime': '2017-11-30T16:38:43.352370',
            'email': '*****@*****.**',
            'internal_uid': 54252,
            'method': 'submitter',
            'orcid': '0000-0002-2174-4493',
            'source': 'submitter',
            'submission_number': '1'
        }
        record = {'acquisition_source': acquisition_source}
        assert validate(record['acquisition_source'], source_schema) is None

        submission = {
            'submission_pdf': 'http://export.arxiv.org/pdf/1707.02785',
        }
        obj = MockObj(record, submission, files=MockFiles({}))
        eng = MockEng()

        assert populate_submission_document(obj, eng) is None

        # The key may be absent altogether, hence the default.
        stored_documents = obj.data.get('documents', [])
        assert 0 == len(stored_documents)
Example #3
0
def test_arxiv_author_list_handles_auto_ignore_comment():
    """Check that the author-list task runs cleanly on the 1703.09986 tarball.

    The task built by ``arxiv_author_list()`` is applied to an object holding
    the ``1703.09986.tar.gz`` fixture and must return ``None``.
    """
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    # record/1519995
    record = {
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1703.09986'},
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    attached = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    assert task(obj, eng) is None
Example #4
0
def test_arxiv_author_list_with_missing_tarball():
    """Check that a missing tarball is reported in the info log.

    The only attached file has an unrelated key, so the task built by
    ``arxiv_author_list()`` must return ``None`` and log that no tarball
    named after the eprint was found.
    """
    hep_schema = load_schema('hep')

    # record/1519995
    record = {
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1703.09986'},
        ],
    }
    validate(record['arxiv_eprints'], hep_schema['properties']['arxiv_eprints'])

    # Deliberately not named after the eprint.
    unrelated = AttrDict({'file': AttrDict({'uri': 'alias.investigations'})})
    attached = MockFiles({'jessica.jones.tar.gz': unrelated})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    expected_message = (
        'Skipping author list extraction, no tarball with name "1703.09986.tar.gz" found'
    )

    assert task(obj, eng) is None

    info_log = obj.log._info.getvalue()
    assert expected_message in info_log
def test_arxiv_plot_extract_is_safe_to_rerun(mock_os):
    """Check that running ``arxiv_plot_extract`` twice gives the same outcome.

    After each of two consecutive runs on the ``0804.1873.tar.gz`` fixture
    the record must hold exactly one figure and the object's files must be
    the tarball plus ``figure1.png``.

    ``mock_os.path.abspath`` is made to return a fresh temporary directory,
    which is removed when the test ends.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '0804.1873.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'nucl-ex',
                ],
                'value': '0804.1873',
            },
        ],
    }  # literature/783246
    extra_data = {}
    files = MockFiles({
        '0804.1873.tar.gz':
        AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    # The expectations do not depend on the iteration, so build them once.
    expected_figures = [{
        'url':
        '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/figure1.png',
        'source':
        'arxiv',
        'material':
        'preprint',
        'key':
        'figure1.png',
        'caption':
        'Difference (in MeV) between the theoretical and experimental masses for the 2027 selected nuclei as a function of the mass number.'
    }]
    expected_files = ['0804.1873.tar.gz', 'figure1.png']

    # Create the directory before entering ``try``: if ``mkdtemp`` itself
    # fails, the ``finally`` clause must not hit an unbound name and mask
    # the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        for _ in range(2):
            assert arxiv_plot_extract(obj, eng) is None

            result = obj.data['figures']
            assert expected_figures == result

            assert expected_files == obj.files.keys
    finally:
        rmtree(temporary_dir)
Example #6
0
def test_arxiv_package_download_logs_on_success():
    """Check the info message written after a successful tarball download.

    The e-print URL is registered with ``httpretty`` to serve the
    ``1605.03959.tar.gz`` fixture; ``arxiv_package_download`` must return
    ``None`` and the info log must equal the success message.
    """
    tarball_bytes = pkg_resources.resource_string(
        __name__,
        os.path.join('fixtures', '1605.03959.tar.gz'))
    httpretty.register_uri(
        httpretty.GET,
        'http://export.arxiv.org/e-print/1605.03959',
        body=tarball_bytes)

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    # literature/1458968
    record = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                    'cond-mat.stat-mech',
                    'cond-mat.str-el',
                ],
                'value': '1605.03959',
            },
        ],
    }
    attached = MockFiles({})
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    assert arxiv_package_download(obj, eng) is None

    expected_log = 'Tarball retrieved from arXiv for 1605.03959'
    actual_log = obj.log._info.getvalue()
    assert expected_log == actual_log
Example #7
0
def test_populate_arxiv_document_retries_on_connection_error(
        mock_requests_get):
    """Check that ``populate_arxiv_document`` retries before giving up.

    The injected ``mock_requests_get`` delegates to
    ``side_effect_requests_get``; the task is expected to raise
    ``DownloadError`` after the mock has been called 10 times.
    """
    mock_requests_get.side_effect = side_effect_requests_get

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    # literature/1458270
    record = {
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1605.03814'},
        ],
    }
    attached = MockFiles({})
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    with pytest.raises(DownloadError):
        populate_arxiv_document(obj, eng)

    attempts = mock_requests_get.call_count
    assert attempts == 10
def test_prepare_files_ignores_keys_not_ending_with_pdf():
    """Check that ``prepare_files`` skips files whose key is not a ``.pdf``.

    The single attached file is keyed ``foo.bar`` (even though its URI ends
    in ``.pdf``); the record must stay empty and nothing must be logged.
    """
    non_pdf_entry = AttrDict({
        'obj': AttrDict({
            'file': AttrDict({'uri': '/data/foo.pdf'}),
        }),
    })
    attached = MockFiles({'foo.bar': non_pdf_entry})

    obj = MockObj({}, {}, files=attached)
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    expected_record = {}
    assert expected_record == obj.data

    expected_log = ''
    assert expected_log == obj.log._info.getvalue()
def test_download_documents():
    """Check that ``download_documents`` rewrites the document URL.

    The remote PDF is served from the ``1605.03844.pdf`` fixture. After the
    task runs, the record must still have one document and its ``url`` must
    be the expected ``/api/files/...`` path.
    """
    with requests_mock.Mocker() as requests_mocker:
        pdf_bytes = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1605.03844.pdf'))
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1605.03844',
            content=pdf_bytes,
        )

        documents_schema = load_schema('hep')['properties']['documents']

        # literature/1458302
        record = {
            'documents': [
                {
                    'key': '1605.03844.pdf',
                    'url': 'http://export.arxiv.org/pdf/1605.03844'
                },
            ],
        }
        attached = MockFiles({})
        assert validate(record['documents'], documents_schema) is None

        obj = MockObj(record, {}, files=attached)
        eng = MockEng()

        assert download_documents(obj, eng) is None

        stored = obj.data['documents']
        expected_document_url = '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf'

        assert 1 == len(stored)
        assert expected_document_url == stored[0]['url']
Example #10
0
def test_arxiv_author_list_logs_on_error(mock_untar):
    """Check that an invalid tarball is reported in the info log.

    The injected ``mock_untar`` raises ``InvalidTarball``; the task built by
    ``arxiv_author_list()`` must return ``None`` and the arXiv identifier
    must appear in the object's info log.
    """
    mock_untar.side_effect = InvalidTarball

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1605.07707'))

    # Synthetic data, not copied from an existing record.
    record = {
        'arxiv_eprints': [
            {'categories': ['hep-th'], 'value': '1605.07707'},
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    attached = MockFiles({
        '1605.07707.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()

    assert task(obj, eng) is None

    info_log = obj.log._info.getvalue()
    assert '1605.07707' in info_log
def test_prepare_files_annotates_files_from_arxiv():
    """Check that ``prepare_files`` adds an ``_fft`` entry for an arXiv PDF.

    With one attached ``foo.pdf`` and an arXiv eprint on the record, the
    task must add a schema-valid ``_fft`` entry typed ``arXiv``, leave
    ``arxiv_eprints`` unchanged and log a single info message.
    """
    properties = load_schema('hep')['properties']
    fft_schema = properties['_fft']
    eprints_schema = properties['arxiv_eprints']

    record = {
        'arxiv_eprints': [
            {'categories': ['hep-th'], 'value': 'hep-th/9711200'},
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    pdf_entry = AttrDict({
        'obj': AttrDict({
            'file': AttrDict({'uri': '/data/foo.pdf'}),
        }),
    })
    attached = MockFiles({'foo.pdf': pdf_entry})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    expected_fft = [
        {
            'path': '/data/foo.pdf',
            'type': 'arXiv',
            'filename': 'arxiv:foo',
            'format': '.pdf',
        },
    ]
    expected_arxiv_eprints = [
        {'categories': ['hep-th'], 'value': 'hep-th/9711200'},
    ]
    updated = obj.data

    assert validate(updated['_fft'], fft_schema) is None
    assert expected_fft == updated['_fft']

    assert validate(updated['arxiv_eprints'], eprints_schema) is None
    assert expected_arxiv_eprints == updated['arxiv_eprints']

    expected_log = 'Non-user PDF files added to FFT.'
    assert expected_log == obj.log._info.getvalue()
Example #12
0
def test_download_file_to_workflow_retries_on_protocol_error():
    """Check that ``download_file_to_workflow`` survives one protocol error.

    The mocked URL first raises urllib3's ``ProtocolError`` and then answers
    with status 200; the call must return a file object equal to
    ``MockFileObject(key='1605.03844.pdf')``.
    """
    pdf_url = 'http://export.arxiv.org/pdf/1605.03844'

    with requests_mock.Mocker() as requests_mocker:
        fixture_path = pkg_resources.resource_filename(
            __name__, os.path.join('fixtures', '1605.03844.pdf'))

        # Responses are consumed in order: failure first, then success.
        failing = {'exc': requests.packages.urllib3.exceptions.ProtocolError}
        succeeding = {'body': fixture_path, 'status_code': 200}
        requests_mocker.register_uri('GET', pdf_url, [failing, succeeding])

        obj = MockObj({}, {}, files=MockFiles({}))

        expected = MockFileObject(key='1605.03844.pdf')
        result = download_file_to_workflow(obj, '1605.03844.pdf', pdf_url)

        assert expected == result
Example #13
0
def test_arxiv_fulltext_download_logs_on_pdf_not_existing():
    """Check the info message logged when arXiv serves HTML instead of a PDF.

    The PDF URL is mocked to return the ``1707.02785.html`` fixture;
    ``arxiv_fulltext_download`` must return ``None`` and the info log must
    equal the "No PDF is available" message.
    """
    with requests_mock.Mocker() as requests_mocker:
        html_payload = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1707.02785.html'))
        requests_mocker.register_uri(
            'GET',
            'http://export.arxiv.org/pdf/1707.02785',
            content=html_payload,
        )

        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

        # literature/1458302
        record = {
            'arxiv_eprints': [
                {'categories': ['cs.CV'], 'value': '1707.02785'},
            ],
        }
        attached = MockFiles({})
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        obj = MockObj(record, {}, files=attached)
        eng = MockEng()

        assert arxiv_fulltext_download(obj, eng) is None

        expected_log = 'No PDF is available for 1707.02785'
        actual_log = obj.log._info.getvalue()
        assert expected_log == actual_log
Example #14
0
def test_arxiv_package_download_logs_on_error():
    """Check the error message logged when the tarball download fails.

    The e-print URL is mocked to answer with HTTP 500;
    ``arxiv_package_download`` must return ``None`` and the error log must
    equal the "Cannot retrieve tarball" message.
    """
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/e-print/1605.03951',
            status_code=500,
        )

        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

        # literature/1458254
        record = {
            'arxiv_eprints': [
                {'categories': ['astro-ph.HE'], 'value': '1605.03951'},
            ],
        }
        attached = MockFiles({})
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        obj = MockObj(record, {}, files=attached)
        eng = MockEng()

        assert arxiv_package_download(obj, eng) is None

        expected_log = 'Cannot retrieve tarball from arXiv for 1605.03951'
        actual_log = obj.log._error.getvalue()
        assert expected_log == actual_log
Example #15
0
def test_arxiv_fulltext_download_logs_on_success():
    """Check the info message logged after a successful PDF download.

    The PDF URL is mocked to serve the ``1605.03844.pdf`` fixture;
    ``arxiv_fulltext_download`` must return ``None`` and the info log must
    equal the "PDF retrieved" message.
    """
    with requests_mock.Mocker() as requests_mocker:
        pdf_bytes = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1605.03844.pdf'))
        requests_mocker.register_uri(
            'GET',
            'http://export.arxiv.org/pdf/1605.03844',
            content=pdf_bytes,
        )

        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

        # literature/1458302
        record = {
            'arxiv_eprints': [
                {'categories': ['physics.ins-det'], 'value': '1605.03844'},
            ],
        }
        attached = MockFiles({})
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        obj = MockObj(record, {}, files=attached)
        eng = MockEng()

        assert arxiv_fulltext_download(obj, eng) is None

        expected_log = 'PDF retrieved from arXiv for 1605.03844'
        actual_log = obj.log._info.getvalue()
        assert expected_log == actual_log
Example #16
0
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Check the authors extracted from a tarball with several author lists.

    The ``1703.09986.multiple_author_lists.tar.gz`` fixture is attached
    under the eprint's tarball key; after running the task built by
    ``arxiv_author_list()`` the record's ``authors`` must equal the two
    expected entries.
    """
    properties = load_schema('hep')['properties']

    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'))

    # record/1519995
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1703.09986'},
        ],
    }
    validate(record['arxiv_eprints'], properties['arxiv_eprints'])

    attached = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    task(obj, eng)

    yerevan = 'Yerevan Phys. Inst.'
    expected_authors = [
        {
            'affiliations': [{'value': yerevan}],
            'ids': [
                {'value': 'INSPIRE-00312131', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432142', 'schema': 'CERN'},
            ],
            'full_name': 'Sirunyan, Albert M',
        },
        {
            'affiliations': [{'value': yerevan}],
            'ids': [
                {'value': 'INSPIRE-00312132', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432143', 'schema': 'CERN'},
            ],
            'full_name': 'Weary, Jake',
        }
    ]
    # Guard against the expectation itself drifting from the schema.
    validate(expected_authors, properties['authors'])

    assert obj.data.get('authors') == expected_authors
Example #17
0
def test_arxiv_plot_extract_populates_files_with_plots(mock_os, tmpdir):
    """Check the figure metadata and log produced by ``arxiv_plot_extract``.

    Runs the task on the ``0804.1873.tar.gz`` fixture with
    ``mock_os.path.abspath`` pointing at a ``plots`` directory created under
    ``tmpdir``. The record must then hold one figure and the info log must
    read ``Added 1 plots.``.
    """
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '0804.1873.tar.gz'))

    # literature/783246
    record = {
        'arxiv_eprints': [
            {'categories': ['nucl-ex'], 'value': '0804.1873'},
        ],
    }
    attached = MockFiles({
        '0804.1873.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    plots_dir = tmpdir.mkdir('plots')
    mock_os.path.abspath.return_value = str(plots_dir)

    assert arxiv_plot_extract(obj, eng) is None

    expected_figures = [{
        'url': '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/figure1.png',
        'source': 'arxiv',
        'material': 'preprint',
        'key': 'figure1.png',
        'caption': 'Difference (in MeV) between the theoretical and experimental masses for the 2027 selected nuclei as a function of the mass number.'
    }]
    assert expected_figures == obj.data['figures']

    expected_log = 'Added 1 plots.'
    assert expected_log == obj.log._info.getvalue()
Example #18
0
def test_submission_fulltext_download_does_not_duplicate_documents():
    """Check that downloading the submission PDF twice keeps one document.

    ``submission_fulltext_download`` is called twice against a mocked PDF
    URL and must return a truthy value both times; the record must end up
    with a single ``fulltext.pdf`` document entry.
    """
    with requests_mock.Mocker() as requests_mocker:
        pdf_bytes = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1605.03844.pdf'))
        requests_mocker.register_uri(
            'GET',
            'http://export.arxiv.org/pdf/1605.03844',
            content=pdf_bytes,
        )

        source_schema = load_schema('hep')['properties']['acquisition_source']
        acquisition_source = {
            'datetime': '2017-11-30T16:38:43.352370',
            'email': '*****@*****.**',
            'internal_uid': 54252,
            'method': 'submitter',
            'orcid': '0000-0002-2174-4493',
            'source': 'submitter',
            'submission_number': '1'
        }
        record = {'acquisition_source': acquisition_source}
        assert validate(record['acquisition_source'], source_schema) is None

        submission = {
            'submission_pdf': 'http://export.arxiv.org/pdf/1605.03844',
        }
        obj = MockObj(record, submission, files=MockFiles({}))
        eng = MockEng()

        # Two runs on purpose: the second must not add another entry.
        assert submission_fulltext_download(obj, eng)
        assert submission_fulltext_download(obj, eng)

        expected_key = 'fulltext.pdf'
        bucket_id = obj.files[expected_key].bucket_id
        expected_documents = [{
            'fulltext': True,
            'key': expected_key,
            'original_url': 'http://export.arxiv.org/pdf/1605.03844',
            'source': 'submitter',
            'url': '/api/files/%s/%s' % (bucket_id, expected_key),
        }]

        assert expected_documents == obj.data['documents']
Example #19
0
def test_populate_arxiv_document_retries_on_error():
    """Check that ``populate_arxiv_document`` recovers from an HTTP 500.

    ``export.arxiv.org`` answers 500 and then 200 with the PDF fixture,
    while ``arxiv.org`` only answers 500. The task must return ``None`` and
    record one document pointing at the ``export.arxiv.org`` URL.
    """
    pdf_bytes = pkg_resources.resource_string(
        __name__, os.path.join('fixtures', '1605.03814.pdf'))
    response500 = {'content': '', 'status_code': 500}
    response200 = {'content': pdf_bytes, 'status_code': 200}

    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.get(
            'http://export.arxiv.org/pdf/1605.03814',
            (response500, response200),
        )
        requests_mocker.get(
            'http://arxiv.org/pdf/1605.03814',
            (response500,)
        )
        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

        # literature/1458270
        record = {
            'arxiv_eprints': [
                {'categories': ['hep-ex'], 'value': '1605.03814'},
            ],
        }
        attached = MockFiles({})
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        obj = MockObj(record, {}, files=attached)
        eng = MockEng()

        assert populate_arxiv_document(obj, eng) is None

        expected_url = 'http://export.arxiv.org/pdf/1605.03814'
        expected_documents = [
            {
                'key': '1605.03814.pdf',
                'fulltext': True,
                'hidden': True,
                'material': 'preprint',
                'original_url': expected_url,
                'url': expected_url,
                'source': 'arxiv',
            }
        ]
        assert expected_documents == obj.data['documents']
Example #20
0
def test_arxiv_plot_extract_populates_files_with_plots(mock_os):
    """Check the plot description and log produced by ``arxiv_plot_extract``.

    Runs the task on the ``0804.1873.tar.gz`` fixture and verifies that the
    ``figure1`` file carries the expected description and that the info log
    reads ``Added 1 plots.``.

    ``mock_os.path.abspath`` is made to return a fresh temporary directory,
    which is removed when the test ends.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '0804.1873.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'nucl-ex',
                ],
                'value': '0804.1873',
            },
        ],
    }  # literature/783246
    extra_data = {}
    files = MockFiles({
        '0804.1873.tar.gz':
        AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    # Create the directory before entering ``try``: if ``mkdtemp`` itself
    # fails, the ``finally`` clause must not hit an unbound name and mask
    # the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert arxiv_plot_extract(obj, eng) is None

        # ``expected`` is the literal, ``result`` the value under test (the
        # two names were swapped before, which made failures misleading).
        expected = ('00000 Difference (in MeV) between the theoretical and '
                    'experimental masses for the 2027 selected nuclei as a '
                    'function of the mass number.')
        result = obj.files['figure1']['description']

        assert expected == result

        expected = 'Added 1 plots.'
        result = obj.log._info.getvalue()

        assert expected == result
    finally:
        rmtree(temporary_dir)
Example #21
0
def test_get_document_in_workflow():
    """Check that ``get_document_in_workflow`` yields the fulltext file URI.

    The record lists one fulltext document keyed ``fulltext.xml`` and a file
    is registered under the same key; the context manager must yield that
    file's ``file.uri``.
    """
    record = {
        'documents': [
            {'key': 'fulltext.xml', 'fulltext': True},
        ],
    }
    attached = MockFiles({})
    attached['fulltext.xml'] = None
    obj = MockObj(record, {}, files=attached)

    with get_document_in_workflow(obj) as local_file:
        assert local_file == attached['fulltext.xml'].file.uri
def test_copy_file_to_workflow(mock_fsopen):
    """Check that ``copy_file_to_workflow`` opens the URL-decoded location.

    The URL passed in contains ``%3B``; the injected ``mock_fsopen`` must be
    called exactly once with the decoded ``;`` and ``mode='rb'``, and the
    returned file object must carry the requested key.
    """
    mock_fsopen.return_value = 'jessica jones'

    obj = MockObj({}, {}, files=MockFiles({}))

    file_key = 'jessicajones.defenders;1'
    expected = MockFileObject(key='jessicajones.defenders;1')
    result = copy_file_to_workflow(
        obj, file_key, 'file://jessicajones.defenders%3B1')

    assert expected == result
    mock_fsopen.assert_called_once_with(
        'file://jessicajones.defenders;1', mode='rb')
Example #23
0
def test_populate_arxiv_document_does_not_duplicate_files_if_called_multiple_times(
):
    """Check that running ``populate_arxiv_document`` twice keeps one entry.

    The PDF URL is mocked to serve the ``1605.03844.pdf`` fixture; both
    calls must return ``None`` and the record must hold exactly one
    document.
    """
    with requests_mock.Mocker() as requests_mocker:
        pdf_bytes = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1605.03844.pdf'))
        requests_mocker.register_uri(
            'GET',
            'http://export.arxiv.org/pdf/1605.03844',
            content=pdf_bytes,
        )

        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

        # literature/1458302
        record = {
            'arxiv_eprints': [
                {'categories': ['physics.ins-det'], 'value': '1605.03844'},
            ],
        }
        attached = MockFiles({})
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        obj = MockObj(record, {}, files=attached)
        eng = MockEng()

        # Two runs on purpose: the second must not add another entry.
        assert populate_arxiv_document(obj, eng) is None
        assert populate_arxiv_document(obj, eng) is None

        expected_documents = [
            {
                'key': '1605.03844.pdf',
                'fulltext': True,
                'hidden': True,
                'material': 'preprint',
                'original_url': 'http://export.arxiv.org/pdf/1605.03844',
                'url': 'http://export.arxiv.org/pdf/1605.03844',
                'source': 'arxiv',
            },
        ]
        assert expected_documents == obj.data['documents']
Example #24
0
def test_arxiv_author_list_does_not_produce_latex():
    """Check that extracted author names are plain Unicode, not LaTeX.

    Runs the task built by ``arxiv_author_list()`` on the
    ``1802.03388.tar.gz`` fixture and compares the resulting ``authors``
    with one expected entry whose surname starts with a non-ASCII letter.
    """
    schema = load_schema('hep')

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.03388.tar.gz'))

    eprints_subschema = schema['properties']['arxiv_eprints']
    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1802.03388',
            },
        ],
    }
    validate(data['arxiv_eprints'], eprints_subschema)

    extra_data = {}
    files = MockFiles({
        '1802.03388.tar.gz': AttrDict({'file': AttrDict({'uri': filename})})
    })

    authors_subschema = schema['properties']['authors']
    expected_authors = [
        {
            'affiliations': [{'value': 'Lund U.'}],
            'ids': [
                {
                    'value': 'INSPIRE-00061248',
                    'schema': 'INSPIRE ID'
                }
            ],
            # U+00C5 (A with ring above), written as an escape so the
            # literal cannot be corrupted by a source re-encoding again
            # (it had degraded into the two-character mojibake of its
            # UTF-8 bytes).
            'full_name': u'\xc5kesson, Torsten Paul Ake'
        },
    ]
    validate(expected_authors, authors_subschema)

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    assert default_arxiv_author_list(obj, eng) is None
    assert obj.data.get('authors') == expected_authors
Example #25
0
def test_arxiv_author_list_logs_on_error(mock_os, mock_untar):
    """Check the error message logged when the tarball cannot be unpacked.

    The injected ``mock_untar`` raises ``InvalidTarball``; the task built by
    ``arxiv_author_list()`` must return ``None`` and the error log must
    equal the "Invalid tarball" message naming the file URI and arXiv id.

    ``mock_os.path.abspath`` is made to return a fresh temporary directory,
    which is removed when the test ends.
    """
    mock_untar.side_effect = InvalidTarball

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '1605.07707',
            },
        ],
    }  # synthetic data
    extra_data = {}
    files = MockFiles({
        '1605.07707.tar.gz':
        AttrDict({
            'file':
            AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1605.07707',
            })
        })
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the directory before entering ``try``: if ``mkdtemp`` itself
    # fails, the ``finally`` clause must not hit an unbound name and mask
    # the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None

        expected = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
        result = obj.log._error.getvalue()

        assert expected == result
    finally:
        rmtree(temporary_dir)
Example #26
0
def test_populate_arxiv_document_logs_on_pdf_not_existing():
    """Check the info message logged when no PDF exists for the eprint.

    ``export.arxiv.org`` answers 200 with the ``1707.02785.html`` fixture
    and ``arxiv.org`` answers 500; ``populate_arxiv_document`` must return
    ``None`` and the info log must equal the "No PDF is available" message.
    """
    html_payload = pkg_resources.resource_string(
        __name__, os.path.join('fixtures', '1707.02785.html'))
    response500 = {'content': '', 'status_code': 500}
    response200 = {'content': html_payload, 'status_code': 200}

    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.get(
            'http://export.arxiv.org/pdf/1707.02785',
            (response200, ),
        )
        requests_mocker.get(
            'http://arxiv.org/pdf/1707.02785',
            (response500, ),
        )
        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

        # literature/1458302
        record = {
            'arxiv_eprints': [
                {'categories': ['cs.CV'], 'value': '1707.02785'},
            ],
        }
        attached = MockFiles({})
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        obj = MockObj(record, {}, files=attached)
        eng = MockEng()

        assert populate_arxiv_document(obj, eng) is None

        expected_log = 'No PDF is available for 1707.02785'
        actual_log = obj.log._info.getvalue()
        assert expected_log == actual_log
Example #27
0
def test_arxiv_fulltext_download_polulates_documents():
    """Check the document entry added by ``arxiv_fulltext_download``.

    The PDF URL is mocked to serve the ``1605.03844.pdf`` fixture; the task
    must return ``None`` and the record's ``documents`` must equal the
    single expected entry.
    """
    with requests_mock.Mocker() as requests_mocker:
        pdf_bytes = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1605.03844.pdf'))
        requests_mocker.register_uri(
            'GET',
            'http://export.arxiv.org/pdf/1605.03844',
            content=pdf_bytes,
        )

        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

        # literature/1458302
        record = {
            'arxiv_eprints': [
                {'categories': ['physics.ins-det'], 'value': '1605.03844'},
            ],
        }
        attached = MockFiles({})
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        obj = MockObj(record, {}, files=attached)
        eng = MockEng()

        assert arxiv_fulltext_download(obj, eng) is None

        expected_documents = [{
            'fulltext': True,
            'original_url': 'http://export.arxiv.org/pdf/1605.03844',
            'url': '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf',
            'material': 'preprint',
            'source': 'arxiv',
            'key': '1605.03844.pdf',
            'hidden': True
        }]
        assert expected_documents == obj.data['documents']
Example #28
0
def test_get_document_in_workflow_prefers_fulltext():
    """Check that the fulltext document wins over other documents.

    The record lists a non-fulltext ``table_of_contents.pdf`` before the
    fulltext ``fulltext.xml``; the context manager must yield the URI of the
    ``fulltext.xml`` file.
    """
    record = {
        'documents': [
            {'key': 'table_of_contents.pdf'},
            {'key': 'fulltext.xml', 'fulltext': True},
        ],
    }
    attached = MockFiles({})
    attached['fulltext.xml'] = None
    attached['table_of_contents.pdf'] = None
    obj = MockObj(record, {}, files=attached)

    with get_document_in_workflow(obj) as local_file:
        assert local_file == attached['fulltext.xml'].file.uri
def test_prepare_files_does_nothing_when_obj_has_no_files():
    """Check that ``prepare_files`` is a no-op without attached files.

    With an empty record and no files, the task must return ``None``, leave
    the record empty and log nothing at info level.
    """
    obj = MockObj({}, {}, files=MockFiles({}))
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    expected_record = {}
    assert expected_record == obj.data

    expected_log = ''
    assert expected_log == obj.log._info.getvalue()
def test_arxiv_plot_extract_handles_duplicate_plot_names(mock_os):
    """Check the figure and file counts for the ``1711.10662`` tarball.

    After ``arxiv_plot_extract`` runs on the ``1711.10662.tar.gz`` fixture
    the record must hold 66 figures and the object 67 file keys.

    ``mock_os.path.abspath`` is made to return a fresh temporary directory,
    which is removed when the test ends.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1711.10662.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'cs.CV',
                ],
                'value': '1711.10662',
            },
        ],
    }  # holdingpen/807096
    extra_data = {}
    files = MockFiles({
        '1711.10662.tar.gz':
        AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    # Create the directory before entering ``try``: if ``mkdtemp`` itself
    # fails, the ``finally`` clause must not hit an unbound name and mask
    # the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert arxiv_plot_extract(obj, eng) is None

        assert len(obj.data['figures']) == 66
        assert len(obj.files.keys) == 67
    finally:
        rmtree(temporary_dir)