def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(
        fake_response_from_file(
            file_name=str('pos/sample_pos_record.xml'),
        )
    ).next()
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response).next()
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()

def many_results(spider):
    """Return results generator from the arxiv spider.

    Tricky fields, many records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record.xml',
                response_type=TextResponse,
            )
        )
    )
    assert parsed_items

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(
        fake_response_from_file(
            file_name=str('pos/sample_pos_record.xml'),
        )
    ).next()
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response).next()
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()

def results():
    """Return results generator from the arxiv spider.

    All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

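# Usage sketch (not part of the original suite): assuming `results` is
# registered as a pytest fixture, a test consumes the list of processed
# records it yields. The test name and the 'titles' key are illustrative
# assumptions about the HEP record schema, not taken from the fixture above.
def test_results_have_titles(results):
    for record in results:
        assert 'titles' in record
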
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield

    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/DESY')

def remove_generated_files(package_location):
    clean_dir(path='/tmp/WSP/')

    _, dirs, files = next(os.walk(package_location))
    for dir_name in dirs:
        clean_dir(os.path.join(package_location, dir_name))
    for file_name in files:
        if not file_name.endswith('.zip'):
            os.unlink(os.path.join(package_location, file_name))

def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield

    s3 = s3_connection(**S3_CONFIG)
    for bucket in s3.buckets.all():
        for key in bucket.objects.all():
            key.delete()
        bucket.delete()

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))

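# Minimal sketch of what the `s3_connection` helper used above might look
# like, assuming it wraps boto3 and that S3_CONFIG carries endpoint and
# credential keys; the real helper lives elsewhere in the test suite and its
# parameter names may differ.
import boto3


def s3_connection(s3_host, aws_access_key_id, aws_secret_access_key, **kwargs):
    # Return a boto3 S3 resource pointed at the (possibly local) S3 endpoint,
    # which is what the cleanup above iterates over via `.buckets.all()`.
    return boto3.resource(
        's3',
        endpoint_url=s3_host,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
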
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    settings = get_settings()['CRAWLER_SETTINGS']
    settings['buckets'] = ['downloaded']
    setup_s3_buckets(**settings)
    time.sleep(10)
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_buckets(**settings)

def json_spider_record(tmpdir):
    from scrapy.http import TextResponse

    spider = arxiv_spider.ArxivSpider()
    items = spider.parse(
        fake_response_from_file(
            'arxiv/sample_arxiv_record10.xml',
            response_type=TextResponse,
        ),
    )
    parsed_record = items.next()
    assert parsed_record

    yield spider, parsed_record

    clean_dir()

def json_spider_record(tmpdir):
    from scrapy.http import TextResponse

    spider = arxiv_spider.ArxivSpider()
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record10.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    items = (spider.parse_record(sel) for sel in test_selectors)
    parsed_record = items.next()
    assert parsed_record

    yield spider, parsed_record

    clean_dir()

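# Portability note (an observation, not a change to the fixtures): the
# `.next()` calls above are Python 2 generator syntax. Under Python 3 the
# equivalent step would use the builtin `next()`, e.g.:
#
#     parsed_record = next(items)
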
def set_up_local_environment():
    package_location = get_test_suite_path(
        'cds',
        'fixtures',
        'oai_harvested',
        'cds_smoke_records.xml',
        test_suite='functional',
    )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()

def record(scrape_pos_page_body):
    """Return results generator from the PoS spider."""
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(
        fake_response_from_file('pos/sample_pos_record.xml')
    ).next()
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response)
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()

def set_up_local_environment():
    package_location = get_test_suite_path(
        'arxiv',
        'fixtures',
        'oai_harvested',
        'arxiv_smoke_record.xml',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 5
    # seconds).
    sleep(5)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()

def set_up_ftp_environment():
    netrc_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }

    clean_dir(path='/tmp/WSP/')

def record():
    """Return results generator from the crossref spider.

    All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
    spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
    fake_response = fake_response_from_file(
        'crossref/sample_crossref_record.json',
        response_type=TextResponse,
    )
    parsed_items = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield _get_record_from_processed_item(parsed_items, spider)

    clean_dir()

def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/DESY')

def results():
    """Return results generator from the arxiv spider.

    All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_record_from_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

def many_results(spider):
    """Return results generator from the arxiv spider.

    Tricky fields, many records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

def many_results(spider):
    """Return results generator from the arxiv spider.

    Tricky fields, many records.
    """
    def _get_processed_record(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        return crawl_result['record']

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

def remove_generated_files(package_location):
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))

    _, dirs, files = next(os.walk(package_location))
    for dir_name in dirs:
        clean_dir(os.path.join(package_location, dir_name))
    for file_name in files:
        if not file_name.endswith('.zip'):
            os.unlink(os.path.join(package_location, file_name))

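# Minimal sketch of the `clean_dir` helper these fixtures rely on, assuming it
# simply removes a directory tree and tolerates a missing path; the actual
# helper is defined in the shared test utilities and may differ (e.g. the
# default path and whether the directory is recreated afterwards are
# assumptions here).
import shutil


def clean_dir(path='/tmp/WSP/'):
    # Remove the directory tree if it exists; ignore_errors avoids failures
    # when a previous cleanup already deleted it.
    shutil.rmtree(path, ignore_errors=True)
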
def cleanup():
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/WSP')

def cleanup():
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    yield
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))

def cleanup():
    yield
    clean_dir()

def cleanup():
    yield
    clean_dir('/tmp/last_runs/')

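# The functions above follow the pytest yield-fixture pattern: setup runs
# before the `yield`, teardown after it. A sketch of how one of them would be
# registered, assuming standard pytest decorators; the fixture name, scope,
# and the in-scope `clean_dir` helper are illustrative assumptions, and the
# real suite may register these differently.
import pytest


@pytest.fixture(scope='function')
def cleanup_example():
    # No setup needed; teardown removes the directory written during the test.
    yield
    clean_dir('/tmp/last_runs/')
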