Example #1
# Imports assumed from Scrapy and the hepcrawl test helpers; the original
# snippets omit them. Later examples additionally use TextResponse,
# arxiv_spider, crossref_spider, and validate from inspire_schemas.utils.
import os

from scrapy.crawler import Crawler
from scrapy.http import HtmlResponse

from hepcrawl.pipelines import InspireCeleryPushPipeline
from hepcrawl.spiders import pos_spider
from hepcrawl.testlib.fixtures import clean_dir, fake_response_from_file


def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Yield a single processed record from the PoS spider."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
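These snippets read like pytest fixtures from the hepcrawl test suite. As a
minimal usage sketch, assuming the function above is registered with
@pytest.fixture, a test could consume the yielded record like this (the test
name and the checked field are illustrative assumptions):

def test_generated_conference_paper(generated_conference_paper):
    # pytest injects the record yielded by the fixture above; 'titles' is a
    # standard top-level field of INSPIRE 'hep' records, used here only as
    # an assumed sanity check.
    assert 'titles' in generated_conference_paper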
Example #2
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
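The validate(record, 'hep') call above presumably comes from
inspire_schemas.utils; it raises a jsonschema ValidationError on schema
violations and returns None on success, which is why the fixtures call it for
its side effect only. A small sketch of that behavior (import paths assumed):

from inspire_schemas.utils import validate
from jsonschema import ValidationError


def is_valid_hep_record(record):
    # validate() returns None when the record matches the 'hep' schema and
    # raises ValidationError otherwise.
    try:
        validate(record, 'hep')
    except ValidationError:
        return False
    return True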
Example #3
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(url=request.url,
                            request=request,
                            body=scrape_pos_conference_paper_page_body,
                            encoding='utf-8')
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
Example #4
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record.xml',
                response_type=TextResponse,
            )))

    assert parsed_items
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #5
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record0.xml',
                response_type=TextResponse,
            )))

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]
Example #6
def many_results(spider):
    """Return a list of processed results from the arxiv spider. Tricky
    fields, many records.
    """
    from scrapy.http import TextResponse

    records = list(
        spider.parse(
            fake_response_from_file('arxiv/sample_arxiv_record.xml',
                                    response_type=TextResponse)))
    assert records
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    return [pipeline.process_item(record, spider) for record in records]
Example #7
def record(scrape_pos_page_body):
    """Return a single processed record from the PoS spider."""
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file('pos/sample_pos_record.xml')))
    response = HtmlResponse(url=request.url,
                            request=request,
                            body=scrape_pos_page_body,
                            encoding='utf-8')
    assert response
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    record = request.callback(response)
    return pipeline.process_item(record, spider)
Example #8
def get_records(response_file_name):
    """Return all results generator from the WSP spider via pipelines."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(file_name=response_file_name,
                                response_type=TextResponse))

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (pipeline.process_item(record, spider).record for record in records)
Example #9
def get_records(response_file_name):
    """Return a generator of all results from the WSP spider via the pipelines."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(record, spider)['record']
        for record in records
    )
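The two get_records variants above differ only in how they unwrap the
pipeline output: the first reads the .record attribute of the object returned
by process_item, the second indexes into it with ['record']. Both return a
lazy generator, so a consumer has to materialize it. A hedged usage sketch
(the response file name and the checked field are assumptions following the
naming pattern of the other examples):

def test_wsp_records():
    # Materialize the generator returned by the fixture helper.
    records = list(get_records('wsp/sample_wsp_record.xml'))
    assert records
    for record in records:
        # 'titles' is a standard field of INSPIRE 'hep' records.
        assert record.get('titles')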
Example #10
def record():
    """Return results generator from the crossref spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        assert crawl_result
        validate(crawl_result['record'], 'hep')
        return crawl_result['record']

    crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
    spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
    fake_response = fake_response_from_file(
        'crossref/sample_crossref_record.json',
        response_type=TextResponse,
    )
    parsed_item = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield _get_record_from_processed_item(parsed_item, spider)

    clean_dir()
Example #11
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #12
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        return crawl_result['record']

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #13
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        assert crawl_result
        validate(crawl_result['record'], 'hep')
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_record_from_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()