Example #1
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(file_name='pos/sample_pos_record.xml')))
    response = HtmlResponse(url=request.url,
                            request=request,
                            body=scrape_pos_conference_paper_page_body,
                            encoding='utf-8')
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
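
These snippets are shown without their imports. A plausible import block, assuming the usual hepcrawl test-suite layout (the hepcrawl module paths are an assumption; the Scrapy ones are standard), would be:

import os

from scrapy.crawler import Crawler
from scrapy.http import HtmlResponse, TextResponse

# The hepcrawl paths below are assumed from the project's test-suite layout;
# adjust them to wherever these helpers live in your tree.
from hepcrawl.pipelines import InspireCeleryPushPipeline
from hepcrawl.spiders import pos_spider
from hepcrawl.testlib.fixtures import clean_dir, fake_response_from_file
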
Example #2
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
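
Every example on this page builds its input with fake_response_from_file, so the spiders can be driven offline. The helper below is only a sketch of what such a function typically does, not hepcrawl's actual implementation; the responses directory name is an assumption.

import os

from scrapy.http import Request, TextResponse


def fake_response_from_file(file_name, url='http://www.example.com',
                            response_type=TextResponse):
    # Sketch only: read a recorded response body from a local test-data
    # directory and wrap it in a Scrapy response, so spider.parse() can be
    # exercised without any network access.
    responses_dir = os.path.join(os.path.dirname(__file__), 'responses')
    with open(os.path.join(responses_dir, file_name), 'rb') as fd:
        body = fd.read()
    request = Request(url=url)
    return response_type(url=url, request=request, body=body, encoding='utf-8')
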
Example #3
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    from scrapy.http import TextResponse

    records = list(
        spider.parse(
            fake_response_from_file('arxiv/sample_arxiv_record.xml',
                                    response_type=TextResponse)))
    assert records
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    return [pipeline.process_item(record, spider) for record in records]
Example #4
def record(scrape_pos_page_body):
    """Return results generator from the PoS spider."""
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file('pos/sample_pos_record.xml')))
    response = HtmlResponse(url=request.url,
                            request=request,
                            body=scrape_pos_page_body,
                            encoding='utf-8')
    assert response
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response)
    return pipeline.process_item(parsed_item, spider)
Example #5
def get_records(response_file_name):
    """Return all results generator from the WSP spider via pipelines."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(file_name=response_file_name,
                                response_type=TextResponse))

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (pipeline.process_item(record, spider).record for record in records)
Example #6
def get_records(response_file_name):
    """Return all results generator from the WSP spider via pipelines."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(record, spider)['record']
        for record in records
    )
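
get_records returns a lazy generator, so a test usually materializes it before asserting. A minimal consumer might look like the sketch below; the sample file name and the asserted key are placeholders, not taken from the examples above.

def test_wsp_records_are_built():
    # Hypothetical consumer: 'wsp/sample_wsp_record.xml' and the 'titles' key
    # are placeholders for a recorded WSP response and the expected schema.
    records = list(get_records('wsp/sample_wsp_record.xml'))
    assert records
    for record in records:
        assert record.get('titles')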