Example 1
def erratum_open_access_record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    body = """
    <ArticleSet>
        <Article>
            <Journal>
                <PublisherName>Institute of Physics</PublisherName>
                <JournalTitle>J. Phys.: Conf. Ser.</JournalTitle>
                <Volume>143</Volume>
                <Issue>3</Issue>
            </Journal>
            <FirstPage LZero="save">336</FirstPage>
            <PublicationType>Published Erratum</PublicationType>
        </Article>
    </ArticleSet>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "Article", response)
    spider.pdf_files = get_test_suite_path(
        'responses',
        'iop',
        'pdf',
    )

    parsed_item = spider.parse_node(response, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
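A test consuming this helper could look like the following sketch; the record key asserted below is an assumption about the parsed HEP record, not something shown in the listing.

def test_erratum_open_access_record_journal_title():
    # Hypothetical consumer of the helper above; 'journal_title' is an
    # assumed field name in the parsed record.
    record = erratum_open_access_record()
    assert record['journal_title'] == 'J. Phys.: Conf. Ser.'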
Example 2
def get_parsed_from_file(filename):
    """A dictionary holding the parsed elements of the record."""
    path = get_test_suite_path('responses', 'crossref', filename)
    with open(path) as f:
        aps_dict = yaml.safe_load(f)

    return aps_dict
Example 3
def get_local_settings_for_broken():
    package_location = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        'broken',
        test_suite='functional',
    )
    os.mkdir(package_location)
    tmp_file = os.path.join(package_location, 'broken_record.xml')

    with open(tmp_file, 'w') as f:
        f.write("<?xml version='1.0' encoding='UTF-8'?>"
                "<collection>"
                "<record>"
                "<datafield tag='260' ind1=' ' ind2=' '>"
                "<subfield code='c'>BROKEN DATE</subfield>"
                "</datafield>"
                "</record>"
                "</collection>")

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_folder': package_location,
        }
    }
    shutil.rmtree(package_location)
Example 4
def get_parser_by_file(filename):
    """A CrossrefParser instanciated on an crossref API response."""
    path = get_test_suite_path('responses', 'crossref', filename)
    with open(path) as f:
        aps_crossref = json.load(f)

    return CrossrefParser(aps_crossref)
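A minimal usage sketch for this helper; the fixture file name is a placeholder and parse() is assumed to be the CrossrefParser entry point.

# Hypothetical usage; the file name and the parse() call are assumptions.
parser = get_parser_by_file('sample_crossref_record.json')
record = parser.parse()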
Example 5
def setup_s3():
    test_file_path = get_test_suite_path(
        "elsevier",
        "fixtures",
        "elsevier",
        test_suite="functional",
    )

    s3 = establish_s3_connection()
    packages_bucket = get_bucket(s3, CRAWLER_ARGS["packages_bucket_name"])
    articles_bucket = get_bucket(s3, CRAWLER_ARGS["files_bucket_name"])
    mock_elsevier_bucket = get_bucket(s3, "batch-feed")
    downloaded_files_bucket = get_bucket(s3, "downloaded")

    packages_bucket.create()
    articles_bucket.create()
    mock_elsevier_bucket.create()
    downloaded_files_bucket.create()

    mock_elsevier_bucket.upload_file(
        os.path.join(test_file_path, "test_zip_file.ZIP"),
        "test_zip_file.ZIP",
        ExtraArgs={'ACL': 'public-read'},
    )

    mock_elsevier_bucket.upload_file(
        os.path.join(test_file_path, "test_zip_file_replicated.ZIP"),
        "test_zip_file.ZIP",
        ExtraArgs={'ACL': 'public-read'},
    )

    mock_elsevier_bucket.upload_file(
        os.path.join(test_file_path, "wrong_articles.ZIP"),
        "wrong_articles.ZIP",
    )
Example 6
def get_parsed_from_file(filename):
    """A dictionary holding the parsed elements of the record."""
    path = get_test_suite_path('responses', 'elsevier', filename)
    with open(path) as f:
        elsevier_expected_dict = yaml.safe_load(f)

    return elsevier_expected_dict
Example 7
def get_parser_by_file(filename):
    """A ElsevierParser instanciated on an APS article."""
    path = get_test_suite_path('responses', 'elsevier', filename)
    with open(path) as f:
        aps_elsevier = f.read()

    return ElsevierParser(aps_elsevier)
Example 8
def get_parser_by_file(filename):
    """A JatsParser instanciated on an APS article."""
    path = get_test_suite_path('responses', 'aps', filename)
    with open(path) as f:
        aps_jats = f.read()

    return JatsParser(aps_jats)
Example 9
def get_local_settings_for_broken():
    package_location = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        'broken',
        test_suite='functional',
    )
    os.mkdir(package_location)
    tmp_file = os.path.join(package_location, 'broken_record.xml')

    with open(tmp_file, 'w') as f:
        f.write(
            "<?xml version='1.0' encoding='UTF-8'?>"
            "<collection>"
            "<record>"
            "<datafield tag='260' ind1=' ' ind2=' '>"
            "<subfield code='c'>BROKEN DATE</subfield>"
            "</datafield>"
            "</record>"
            "</collection>"
        )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_folder': package_location,
        }
    }
    shutil.rmtree(package_location)
Example 10
def erratum_open_access_record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    body = """
    <ArticleSet>
        <Article>
            <Journal>
                <PublisherName>Institute of Physics</PublisherName>
                <JournalTitle>J. Phys.: Conf. Ser.</JournalTitle>
                <Volume>143</Volume>
                <Issue>3</Issue>
            </Journal>
            <FirstPage LZero="save">336</FirstPage>
            <PublicationType>Published Erratum</PublicationType>
        </Article>
    </ArticleSet>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "Article", response)
    spider.pdf_files = get_test_suite_path(
        'responses',
        'iop',
        'pdf',
    )

    parsed_item = spider.parse_node(response, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example 11
def tarfile():
    """Return path to test tar.gz file."""
    return get_test_suite_path(
        'responses',
        'iop',
        'packages',
        'test.tar.gz',
    )
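Because this helper shares its name with the standard-library tarfile module, a consuming test (assuming the helper is registered as a pytest fixture, which the listing does not show) would typically import the module under an alias, as in this sketch:

import tarfile as tarfile_module  # aliased so the fixture name does not shadow it


def test_tarfile_is_readable(tarfile):
    # Hypothetical test: only checks that the archive opens and lists members.
    with tarfile_module.open(tarfile, 'r:gz') as archive:
        assert archive.getnames()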
Example 12
def tarfile():
    """Return path to test tar.gz file."""
    return get_test_suite_path(
        'responses',
        'iop',
        'packages',
        'test.tar.gz',
    )
Example 13
def load_file(file_name):
    path = get_test_suite_path(
        'responses',
        'tohep',
        file_name,
    )
    with open(path) as input_data:
        data = yaml.safe_load(input_data.read())

    return data
Example 14
def load_file(file_name):
    path = get_test_suite_path(
        'responses',
        'tohep',
        file_name,
    )
    with open(path) as input_data:
        data = yaml.safe_load(input_data.read())

    return data
Example 15
def get_file_name_from_documents(documents_field):
    file_path = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        'FFT',
        documents_field['key'],
        test_suite='functional',
    )
    return file_path
Example 16
def get_file_name_from_fft(fft_field):
    file_path = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        'FFT',
        fft_field['filename'] + fft_field['format'],
        test_suite='functional',
    )
    return file_path
Example 17
def get_local_settings():
    package_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        'WSP',
        test_suite='functional',
    )

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'local_package_dir': package_location,
        }
    }
Example 18
def get_local_settings():
    package_location = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        test_suite='functional',
    )

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_folder': package_location,
        }
    }
Example 19
def get_ftp_settings():
    netrc_location = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }
Example 20
def get_ftp_settings():
    netrc_location = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }
Example 21
def get_local_settings():
    package_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        'WSP',
        test_suite='functional',
    )

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'local_package_dir': package_location,
            'destination_folder': "/code/.tmp/WSP"
        }
    }
Example 22
def set_up_local_environment():
    package_location = get_test_suite_path(
        'cds',
        'fixtures',
        'oai_harvested',
        'cds_smoke_records.xml',
        test_suite='functional',
    )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()
Example 23
def set_up_local_environment():
    package_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        'WSP',
        test_suite='functional',
    )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'package_path': package_location,
        }
    }

    remove_generated_files(package_location)
Example 24
def set_up_local_environment():
    package_location = get_test_suite_path(
        'cds',
        'fixtures',
        'oai_harvested',
        'cds_smoke_records.xml',
        test_suite='functional',
    )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()
Example 25
def set_up_local_environment():
    package_location = get_test_suite_path(
        'arxiv',
        'fixtures',
        'oai_harvested',
        'arxiv_smoke_record.xml',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 5 seconds).
    sleep(5)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }
Example 26
def get_configuration():
    package_location = get_test_suite_path(
        'pos',
        'fixtures',
        'oai_harvested',
        'pos_record.xml',
        test_suite='functional',
    )

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
            'base_conference_paper_url':
                'https://http-server.local/contribution?id=',
            'base_proceedings_url':
                'https://http-server.local/cgi-bin/reader/conf.cgi?confid=',
        }
    }
Example 27
def get_expected_parser_responses_for_new_articles_in_s3():
    test_file_path = get_test_suite_path(
        "elsevier",
        "fixtures",
        "elsevier",
        "parsed_records",
        test_suite="functional",
    )

    files = [
        "j.geomphys.2020.103898.yml",
        "j.geomphys.2020.103921.yml",
        "j.geomphys.2020.103925.yml",
        "j.geomphys.2020.103892.yml",
    ]
    responses = []
    for file in files:
        responses.append(
            get_parser_response_from_file(os.path.join(test_file_path, file)))

    return responses
Example 28
def get_ftp_settings():
    netrc_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }
Example 29
def get_ftp_settings():
    netrc_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }
Example 30
def set_up_ftp_environment():
    netrc_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 10 seconds).
    sleep(10)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }

    clean_dir(path='/tmp/WSP/')
Example 31
def setup_s3_files(s3_key,
                   s3_secret,
                   s3_server,
                   buckets=[],
                   files_to_upload=[],
                   files_path=None,
                   *args,
                   **kwargs):
    s3 = s3_connection(s3_key, s3_secret, s3_server)
    buckets_map = {}
    for bucket_name in buckets:
        bucket = s3.Bucket(bucket_name)
        bucket.create()
        buckets_map[bucket_name] = bucket

    test_files_path = get_test_suite_path(*files_path, test_suite='functional')
    transfer_config = TransferConfig(use_threads=False)
    for bucket_name, file_name in files_to_upload:
        buckets_map[bucket_name].upload_file(
            Filename=os.path.join(test_files_path, file_name),
            Key=file_name,
            Config=transfer_config,
        )
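A hypothetical invocation of this helper; the credentials, server URL, bucket name, and file names below are illustrative placeholders rather than values from the real test suite.

# Hypothetical call; every literal value here is a placeholder.
setup_s3_files(
    s3_key='key',
    s3_secret='secret',
    s3_server='http://s3-server:4566',
    buckets=['incoming-packages'],
    files_to_upload=[('incoming-packages', 'test_zip_file.ZIP')],
    files_path=['elsevier', 'fixtures', 'elsevier'],
)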
Example 32
def test_not_published_record():
    """Not-published paper should result in nothing."""
    spider = iop_spider.IOPSpider()
    body = """
    <ArticleSet>
        <Article>
            <Journal>
                <PubDate PubStatus="aheadofprint">
                    <Year>2015</Year>
                    <Month>03</Month>
                </PubDate>
            </Journal>
        </Article>
    </ArticleSet>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "Article", response)
    spider.pdf_files = get_test_suite_path(
        'responses',
        'iop',
        'pdf',
    )
    records = spider.parse_node(response, node)
    assert records is None
Example 33
def test_not_published_record():
    """Not-published paper should result in nothing."""
    spider = iop_spider.IOPSpider()
    body = """
    <ArticleSet>
        <Article>
            <Journal>
                <PubDate PubStatus="aheadofprint">
                    <Year>2015</Year>
                    <Month>03</Month>
                </PubDate>
            </Journal>
        </Article>
    </ArticleSet>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "Article", response)
    spider.pdf_files = get_test_suite_path(
        'responses',
        'iop',
        'pdf',
    )
    records = spider.parse_node(response, node)
    assert records is None
Example 34
import six

import pytest

from hepcrawl.spiders import iop_spider

from hepcrawl.testlib.fixtures import (
    fake_response_from_file,
    fake_response_from_string,
    get_node,
    get_test_suite_path,
)

TEST_PDF_DIR = get_test_suite_path(
    'responses',
    'iop',
    'pdf',
)


@pytest.fixture
def record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    response = fake_response_from_file('iop/xml/test_standard.xml')
    node = get_node(spider, "Article", response)
    spider.pdf_files = TEST_PDF_DIR

    parsed_item = spider.parse_node(response, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Example 35
import six

import pytest

from hepcrawl.spiders import iop_spider

from hepcrawl.testlib.fixtures import (
    fake_response_from_file,
    fake_response_from_string,
    get_node,
    get_test_suite_path,
)

TEST_PDF_DIR = get_test_suite_path(
    'responses',
    'iop',
    'pdf',
)


@pytest.fixture
def record():
    """Return results generator from the WSP spider."""
    spider = iop_spider.IOPSpider()
    response = fake_response_from_file('iop/xml/test_standard.xml')
    node = get_node(spider, "Article", response)
    spider.pdf_files = TEST_PDF_DIR

    parsed_item = spider.parse_node(response, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
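A test using this fixture might look like the sketch below; the asserted key is an assumption about the parsed record rather than something shown in the listing.

def test_record_has_title(record):
    # Hypothetical test; 'title' is an assumed key in the parsed record.
    assert 'title' in record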