Example #1
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipeline payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
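
All of the fixtures on this page tear down by calling clean_dir, a helper from the project's test library that wipes a scratch directory left over from the crawl. As a minimal sketch only (the default path and exact behaviour are assumptions, not the project's actual implementation), such a helper can be as small as:

import shutil


def clean_dir(path='/tmp/WSP/'):
    # Hypothetical sketch: remove the whole scratch tree; ignore_errors
    # keeps teardown quiet when the crawl never created the directory.
    shutil.rmtree(path, ignore_errors=True)
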
Example #2
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record.xml',
                response_type=TextResponse,
            )))

    assert parsed_items
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #3
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipeline payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
Example #4
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #5
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield

    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/DESY')
Example #6
def remove_generated_files(package_location):
    clean_dir(path='/tmp/WSP/')

    _, dirs, files = next(os.walk(package_location))
    for dir_name in dirs:
        clean_dir(os.path.join(package_location, dir_name))
    for file_name in files:
        if not file_name.endswith('.zip'):
            os.unlink(os.path.join(package_location, file_name))
Example #7
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield
    s3 = s3_connection(**S3_CONFIG)
    for bucket in s3.buckets.all():
        for key in bucket.objects.all():
            key.delete()
        bucket.delete()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
Example #8
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    settings = get_settings()['CRAWLER_SETTINGS']
    settings['buckets'] = ['downloaded']
    setup_s3_buckets(**settings)
    time.sleep(10)
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_buckets(**settings)
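
Examples #7 and #8 rely on project helpers (s3_connection, setup_s3_buckets, clean_buckets) whose definitions are not shown on this page. Purely as an illustration of what such helpers could look like, the sketch below uses boto3 against an S3-compatible endpoint; the parameter names pulled from the settings dict are assumptions:

import boto3


def s3_connection(s3_host, s3_key, s3_secret, **extra):
    # Hypothetical helper: open a boto3 S3 resource against the
    # S3-compatible service started by the docker environment.
    return boto3.resource(
        's3',
        endpoint_url=s3_host,
        aws_access_key_id=s3_key,
        aws_secret_access_key=s3_secret,
    )


def setup_s3_buckets(buckets=(), **settings):
    # Hypothetical helper: create every bucket the crawl will write to.
    s3 = s3_connection(**settings)
    for name in buckets:
        s3.create_bucket(Bucket=name)


def clean_buckets(buckets=(), **settings):
    # Hypothetical helper: empty and delete all buckets after the test.
    s3 = s3_connection(**settings)
    for bucket in s3.buckets.all():
        bucket.objects.all().delete()
        bucket.delete()
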
Example #9
def json_spider_record(tmpdir):
    from scrapy.http import TextResponse
    spider = arxiv_spider.ArxivSpider()
    items = spider.parse(
        fake_response_from_file(
            'arxiv/sample_arxiv_record10.xml',
            response_type=TextResponse,
        ),
    )
    parsed_record = next(items)
    assert parsed_record
    yield spider, parsed_record

    clean_dir()
Example #10
def json_spider_record(tmpdir):
    from scrapy.http import TextResponse
    spider = arxiv_spider.ArxivSpider()
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record10.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    items = (spider.parse_record(sel) for sel in test_selectors)
    parsed_record = next(items)
    assert parsed_record
    yield spider, parsed_record

    clean_dir()
Example #11
def set_up_local_environment():
    package_location = get_test_suite_path(
        'cds',
        'fixtures',
        'oai_harvested',
        'cds_smoke_records.xml',
        test_suite='functional',
    )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()
Example #12
def record(scrape_pos_page_body):
    """Return results generator from the PoS spider."""
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file('pos/sample_pos_record.xml')))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        encoding='utf-8',
    )
    assert response
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response)
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
Example #13
def set_up_local_environment():
    package_location = get_test_suite_path(
        'arxiv',
        'fixtures',
        'oai_harvested',
        'arxiv_smoke_record.xml',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 5 seconds).
    sleep(5)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()
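
The set_up_* fixtures only yield configuration; the functional tests pass it on to the project's own crawl helpers. As a hedged illustration only (not the project's actual helper), the yielded dict could drive a crawl directly through scrapyd's schedule.json endpoint, which takes the project, the spider name and any spider arguments as form fields; the spider name used here is an assumption:

import requests


def schedule_crawl(config, spider='arXiv'):
    # Illustrative only: submit a job to the scrapyd instance named in the
    # fixture's CRAWLER_HOST_URL, forwarding CRAWLER_ARGUMENTS as spider
    # arguments. Returns the scrapyd job id.
    payload = {
        'project': config['CRAWLER_PROJECT'],
        'spider': spider,
    }
    payload.update(config['CRAWLER_ARGUMENTS'])
    response = requests.post(
        config['CRAWLER_HOST_URL'] + '/schedule.json',
        data=payload,
    )
    response.raise_for_status()
    return response.json()['jobid']
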
Example #14
def set_up_ftp_environment():
    netrc_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 10 seconds).
    sleep(10)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }

    clean_dir(path='/tmp/WSP/')
Example #15
def record():
    """Return results generator from the crossref spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
    spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
    fake_response = fake_response_from_file(
        'crossref/sample_crossref_record.json',
        response_type=TextResponse,
    )
    parsed_item = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield _get_record_from_processed_item(parsed_item, spider)

    clean_dir()
Example #16
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/DESY')
Example #17
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [_get_record_from_processed_item(parsed_item, spider) for parsed_item in parsed_items]

    clean_dir()
Example #18
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #19
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        return crawl_result['record']

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #20
def remove_generated_files(package_location):
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))

    _, dirs, files = next(os.walk(package_location))
    for dir_name in dirs:
        clean_dir(os.path.join(package_location, dir_name))
    for file_name in files:
        if not file_name.endswith('.zip'):
            os.unlink(os.path.join(package_location, file_name))
Example #21
def cleanup():
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/WSP')
Example #22
def cleanup():
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    yield
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
Example #23
def cleanup():
    yield
    clean_dir()
Example #24
def cleanup():
    yield
    clean_dir('/tmp/last_runs/')
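
Every fixture on this page follows the same pytest pattern: everything before the yield is setup, the yielded value is what the test function receives, and everything after the yield runs as teardown once the test finishes (which is why the clean_dir calls sit after the yield). A minimal, self-contained sketch of that pattern, with made-up record content:

import pytest


@pytest.fixture
def generated_record():
    # Setup: build whatever the test needs (illustrative data only).
    record = {'titles': [{'title': 'Illustrative record'}]}

    # The test body runs while the fixture is suspended here.
    yield record

    # Teardown: runs after the test, mirroring the clean_dir() calls above.
    record.clear()


def test_record_has_a_title(generated_record):
    assert generated_record['titles'][0]['title'] == 'Illustrative record'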