def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider.

    Yields the record produced by pushing a parsed PoS conference-paper
    page through ``InspireCeleryPushPipeline``, then cleans up the
    working directory after the consuming test finishes.
    """
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    # next(gen) instead of gen.next(): the .next() method is Python 2
    # only, while the next() builtin works on both Python 2 and 3.
    request = next(
        spider.parse(
            fake_response_from_file(
                file_name=str('pos/sample_pos_record.xml'),
            )
        )
    )
    # Plain keyword argument instead of the needlessly obfuscated
    # **{'encoding': 'utf-8'} dict expansion.
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider.

    Yields the ``'record'`` entry of the crawl result produced by
    pushing a parsed PoS conference-paper page through
    ``InspireCeleryPushPipeline``, then cleans up the working directory
    after the consuming test finishes.
    """
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    # next(gen) instead of gen.next(): the .next() method is Python 2
    # only, while the next() builtin works on both Python 2 and 3.
    request = next(
        spider.parse(
            fake_response_from_file(
                file_name=str('pos/sample_pos_record.xml'),
            )
        )
    )
    # Plain keyword argument instead of the needlessly obfuscated
    # **{'encoding': 'utf-8'} dict expansion.
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
def many_results(spider):
    """Return results generator from the arxiv spider.

    Tricky fields, many records.
    """
    from scrapy.http import TextResponse

    # Parse the sample OAI-PMH file with many tricky records and
    # materialize everything the spider yields.
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )
    records = list(spider.parse(fake_response))
    assert records

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    # Run every parsed record through the pipeline and collect the results.
    processed = []
    for parsed_record in records:
        processed.append(pipeline.process_item(parsed_record, spider))
    return processed
def record(scrape_pos_page_body):
    """Return results generator from the PoS spider.

    Parses the sample PoS record, feeds the scraped conference page back
    into the request callback, and returns the pipeline-processed result.
    """
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    # next(gen) instead of gen.next(): the .next() method is Python 2
    # only, while the next() builtin works on both Python 2 and 3.
    request = next(
        spider.parse(fake_response_from_file('pos/sample_pos_record.xml'))
    )
    # Plain keyword argument instead of the needlessly obfuscated
    # **{'encoding': 'utf-8'} dict expansion.
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    # Renamed from `record` so the local no longer shadows this fixture.
    # NOTE(review): unlike the sibling fixtures, the callback result is
    # not advanced with next() here — presumably this spider version
    # returns the item directly; verify against the callback signature.
    parsed_record = request.callback(response)
    return pipeline.process_item(parsed_record, spider)
def get_records(response_file_name):
    """Return all results generator from the WSP spider via pipelines."""
    # environmental variables needed for the pipelines payload
    scrapy_env = (
        ('SCRAPY_JOB', 'scrapy_job'),
        ('SCRAPY_FEED_URI', 'scrapy_feed_uri'),
        ('SCRAPY_LOG_FILE', 'scrapy_log_file'),
    )
    for env_key, env_value in scrapy_env:
        os.environ[env_key] = env_value

    spider = create_spider()
    fake_response = fake_response_from_file(
        file_name=response_file_name,
        response_type=TextResponse,
    )
    parsed_items = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    # Lazily push each parsed item through the pipeline, exposing the
    # `.record` attribute of each crawl result.
    return (
        pipeline.process_item(item, spider).record
        for item in parsed_items
    )
def get_records(response_file_name):
    """Return all results generator from the WSP spider via pipelines."""
    # environmental variables needed for the pipelines payload
    scrapy_env = (
        ('SCRAPY_JOB', 'scrapy_job'),
        ('SCRAPY_FEED_URI', 'scrapy_feed_uri'),
        ('SCRAPY_LOG_FILE', 'scrapy_log_file'),
    )
    for env_key, env_value in scrapy_env:
        os.environ[env_key] = env_value

    spider = create_spider()
    fake_response = fake_response_from_file(
        file_name=response_file_name,
        response_type=TextResponse,
    )
    parsed_items = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    # Lazily push each parsed item through the pipeline, exposing the
    # 'record' entry of each crawl result.
    return (
        pipeline.process_item(item, spider)['record']
        for item in parsed_items
    )