def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(
        fake_response_from_file(
            file_name=str('pos/sample_pos_record.xml'),
        )
    ).next()
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response).next()
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()

def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(
        fake_response_from_file(
            file_name=str('pos/sample_pos_record.xml'),
        )
    ).next()
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response).next()
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()

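# A minimal sketch of how a test might consume the `generated_conference_paper`
# fixture above; the test name and the asserted `titles` field are hypothetical
# illustrations, not assertions taken from the project's test suite.
def test_generated_conference_paper_has_titles(generated_conference_paper):
    # the fixture yields the record built by the pipeline for the sample PoS paper
    assert generated_conference_paper
    assert 'titles' in generated_conference_paper
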
def authors():
    """Returns get_authors() output."""
    spider = phil_spider.PhilSpider()
    response = fake_response_from_file('phil/test_thesis.json')
    jsonrecord = json.loads(response.body_as_unicode())
    response.meta["jsonrecord"] = jsonrecord[0]
    return spider.get_authors(jsonrecord[0]['authors'])

def many_results(spider):
    """Return results generator from the arxiv spider.

    Tricky fields, many records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record.xml',
                response_type=TextResponse,
            )
        )
    )
    assert parsed_items

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

def results():
    """Return results generator from the arxiv spider.

    All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record0.xml',
                response_type=TextResponse,
            )
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

def urls():
    """Return the URLs found in the first record node of the BASE test response."""
    spider = base_spider.BaseSpider()
    response = fake_response_from_file('base/test_1.xml')
    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    return spider.get_urls_in_record(nodes[0])

def record(scrape_pos_page_body):
    """Return the results of the spider."""
    spider = dnb_spider.DNBSpider()
    with requests_mock.Mocker() as mock:
        mock.head(
            'http://nbn-resolving.de/urn:nbn:de:hebis:30:3-386257',
            headers={
                'Content-Type': 'text/html',
            },
        )
        mock.head(
            'http://d-nb.info/1079912991/34',
            headers={
                'Content-Type': 'application/pdf;charset=base64',
            },
        )
        mock.head(
            'http://publikationen.ub.uni-frankfurt.de/frontdoor/index/index/docId/38625',
            headers={
                'Content-Type': 'text/html',
            },
        )
        request = spider.parse(
            fake_response_from_file('dnb/test_1.xml')
        ).next()

    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        **{'encoding': 'utf-8'}
    )

    parsed_item = request.callback(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def results():
    """Return results generator from the arxiv spider.

    All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

def parse_requests():
    """Returns a fake request to the record file.

    With links.
    """
    spider = phil_spider.PhilSpider()
    response = fake_response_from_file('phil/test_thesis.json')
    return spider.parse(response)

def test_results_jats_parser_handle_date_absence():
    from scrapy.http import XmlResponse

    spider = aps_spider.APSSpider(aps_token="secret")
    fake_response = fake_response_from_file(
        'aps/PhysRevD.96.095036_no_date_nodes.xml',
        response_type=XmlResponse,
    )

    record = spider._parse_jats(fake_response).record

    assert validate(record, 'hep') is None

def record():
    """Return scraping results from the MIT spider."""
    spider = mit_spider.MITSpider()
    response = fake_response_from_file('mit/test_splash.html')

    parsed_item = spider.build_item(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def record():
    """Return scraping results from the INFN spider."""
    spider = infn_spider.InfnSpider()
    response = fake_response_from_file('infn/test_splash.html')

    parsed_item = spider.scrape_splash(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def results():
    """Return results generator from the Alpha spider."""
    spider = alpha_spider.AlphaSpider()
    parsed_items = list(
        spider.parse(fake_response_from_file('alpha/test_1.htm'))
    )

    records = [parsed_item.record for parsed_item in parsed_items]
    assert records

    return records

def json_spider_record(tmpdir):
    from scrapy.http import TextResponse

    spider = arxiv_spider.ArxivSpider()
    items = spider.parse(
        fake_response_from_file(
            'arxiv/sample_arxiv_record10.xml',
            response_type=TextResponse,
        ),
    )

    parsed_record = items.next()
    assert parsed_record

    return spider, parsed_record

def test_results_from_jats():
    """Get and validate results from mocking a JATS response."""
    from scrapy.http import XmlResponse

    spider = aps_spider.APSSpider()
    fake_response = fake_response_from_file(
        'aps/PhysRevD.96.095036.xml',
        response_type=XmlResponse,
    )

    record = spider._parse_jats(fake_response).record

    assert validate(record, 'hep') is None

def parsed_node():
    """Call parse_node and return its request call."""
    spider = mit_spider.MITSpider()
    response = fake_response_from_file('mit/test_list.html')
    tag = spider.itertag
    node = get_node(spider, tag, response, rtype="html")

    parsed_item = spider.parse_node(response, node).next()
    assert parsed_item

    return parsed_item

def record():
    """Return results from the MAGIC spider.

    First parse node, then scrape, and finally build the record.
    """
    spider = magic_spider.MagicSpider()
    response = fake_response_from_file('magic/test_1.html')
    selector = Selector(response, type='html')
    node = selector.xpath('//%s' % spider.itertag)[0]
    spider.domain = "file:///tests/responses/magic/"
    parsed_node = spider.parse_node(response, node)

    splash_response = fake_response_from_file('magic/test_splash.html')
    splash_response.meta["date"] = parsed_node.meta["date"]
    splash_response.meta["urls"] = parsed_node.meta["urls"]

    parsed_item = spider.scrape_for_pdf(splash_response).next()
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def test_results_from_jats():
    """Get and validate results from mocking a JATS response."""
    from scrapy.http import XmlResponse

    spider = aps_spider.APSSpider(aps_token="secret")
    fake_response = fake_response_from_file(
        'aps/PhysRevD.96.095036.xml',
        response_type=XmlResponse,
    )

    record = spider._parse_jats(fake_response).record

    assert validate(record, 'hep') is None

def record():
    """Return the results from the Hindawi spider."""
    spider = hindawi_spider.HindawiSpider()
    response = fake_response_from_file("hindawi/test_1.xml")
    nodes = get_node(spider, "//marc:record", response)

    parsed_item = spider.parse_node(response, nodes[0])
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def test_error_handler(crawler):
    """Test ErrorHandler extension."""
    handler = ErrorHandler.from_crawler(crawler)
    response = crawler.spider.parse(
        fake_response_from_file('world_scientific/sample_ws_record.xml')
    )

    assert 'errors' not in crawler.spider.state

    handler.spider_error("Some failure", response, crawler.spider)

    assert 'errors' in crawler.spider.state
    assert crawler.spider.state['errors'][0]["exception"] == "Some failure"
    assert crawler.spider.state['errors'][0]["sender"] == response

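# The `crawler` argument of test_error_handler is a fixture defined elsewhere.
# A minimal sketch of what it is assumed to provide: a Scrapy Crawler whose
# spider exposes a `state` dict for ErrorHandler to write into. The spider
# class (wsp_spider.WorldScientificSpider) is an assumption based on the
# World Scientific sample response used above; the project's actual fixture
# may differ.
def crawler():
    crawler = Crawler(spidercls=wsp_spider.WorldScientificSpider)
    crawler.spider = wsp_spider.WorldScientificSpider.from_crawler(crawler)
    crawler.spider.state = {}  # ErrorHandler appends error payloads here
    return crawler
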
def record():
    """Return results from the T2K spider."""
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    spider.domain = "file:///tests/responses/t2k/"
    parsed_node = spider.parse_node(response, nodes[0])

    splash_response = fake_response_from_file('t2k/001.html')
    splash_response.meta["date"] = parsed_node.meta["date"]
    splash_response.meta["title"] = parsed_node.meta["title"]
    splash_response.meta["urls"] = parsed_node.meta["urls"]
    splash_response.meta["authors"] = parsed_node.meta["authors"]

    parsed_item = spider.scrape_for_pdf(splash_response).next()
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def record():
    """Return results from the PHENIX spider."""
    spider = phenix_spider.PhenixSpider()
    response = fake_response_from_file('phenix/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)

    parsed_item = spider.parse_node(response, nodes[0])
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def record():
    """Return results from the IOP spider."""
    spider = iop_spider.IOPSpider()
    response = fake_response_from_file('iop/xml/test_standard.xml')
    node = get_node(spider, "Article", response)
    spider.pdf_files = TEST_PDF_DIR

    parsed_item = spider.parse_node(response, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def test_parse_node_nolink():
    """Test parse_node function.

    This time there is no splash page link. The result should be a
    HEPRecord with minimal data.
    """
    spider = infn_spider.InfnSpider()
    response = fake_response_from_file('infn/test_1_nolink.html')
    selector = Selector(response, type='html')
    node = selector.xpath('//%s' % spider.itertag)[0]

    parsed_item = spider.parse_node(response, node).next()
    assert parsed_item
    assert parsed_item.record
    assert isinstance(parsed_item.record, hepcrawl.items.HEPRecord)

def record():
    """Return results from the Brown spider."""
    spider = brown_spider.BrownSpider()
    with requests_mock.Mocker() as mock:
        mock.head(
            'http://www.example.com/studio/item/bdr:11303/PDF/',
            headers={
                'Content-Type': 'text/html',
            },
        )
        response = fake_response_from_file('brown/test_1.json')
        jsonresponse = json.loads(response.body_as_unicode())
        jsonrecord = jsonresponse["items"]["docs"][0]
        jsonrecord["uri"] = "brown/test_splash.html"

        splash_response = fake_response_from_file('brown/test_splash.html')
        splash_response.meta["jsonrecord"] = jsonrecord

        parsed_item = spider.scrape_splash(splash_response)
        assert parsed_item
        assert parsed_item.record

        return parsed_item.record

def splash():
    """Call web scraper function, return final HEPRecord."""
    spider = base_spider.BaseSpider()
    splash_response = fake_response_from_file('base/test_1_splash.htm')
    response = fake_response_from_file('base/test_1.xml')
    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    splash_response.meta["record"] = nodes[0].extract()

    with requests_mock.Mocker() as mock:
        mock.head(
            'http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf',
            headers={
                'Content-Type': 'text/html',
            },
        )
        parsed_item = spider.scrape_for_pdf(splash_response)
        assert parsed_item
        assert parsed_item.record

        return parsed_item.record

def non_url():
    """Parse the node without any links.

    Should go straight to `build_item` and build the HEPRecord.
    """
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1_nourl.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)

    parsed_item = spider.parse_node(response, nodes[0]).next()
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def json_spider_record(tmpdir):
    from scrapy.http import TextResponse

    spider = arxiv_spider.ArxivSpider()
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record10.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    items = (spider.parse_record(sel) for sel in test_selectors)

    parsed_record = items.next()
    assert parsed_record

    yield spider, parsed_record

    clean_dir()

def journal():
    """Return results generator from the Phil spider.

    Journal specific.
    """
    spider = phil_spider.PhilSpider()
    response = fake_response_from_file('phil/test_journal.json')
    jsonrecord = json.loads(response.body_as_unicode())
    response.meta["jsonrecord"] = jsonrecord[0]

    parsed_item = spider.build_item(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def record_rich(package_rich):
    """Return results from the EDP spider with 'rich' format.

    This is not an open access journal, so no splash scraping.
    """
    spider = edp_spider.EDPSpider()
    xml_path = package_rich.url.replace("file://", "")
    fake_resp = fake_response_from_file(xml_path)
    fake_resp.meta["rich"] = True
    node = get_node(spider, "//EDPSArticle", fake_resp)[0]

    parsed_item = spider.parse_node(fake_resp, node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def get_records(response_file_name):
    """Return a generator of all records from the WSP spider via the pipelines."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse,
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(record, spider).record
        for record in records
    )

def record():
    """Return built HEPRecord from the BASE spider."""
    spider = base_spider.BaseSpider()
    response = fake_response_from_file('base/test_1.xml')
    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    response.meta["record"] = nodes[0].extract()
    response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]

    parsed_item = spider.build_item(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def record_jats(package_jats, scrape_pos_page_body):
    """Return results from the EDP spider with JATS format.

    This is an open access journal, so we can scrape the splash page.
    """
    spider = edp_spider.EDPSpider()
    xml_path = package_jats.url.replace("file://", "")
    fake_resp = fake_response_from_file(xml_path)
    node = get_node(spider, "//article", fake_resp)[0]
    request = spider.parse_node(fake_resp, node)
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        **{'encoding': 'utf-8'}
    )

    parsed_item = request.callback(response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

def get_records(response_file_name):
    """Return a generator of all records from the WSP spider via the pipelines."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse,
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(record, spider)['record']
        for record in records
    )

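# A minimal sketch of how the get_records helper above might be used; the
# sample file name is taken from the ErrorHandler test in this listing, and
# the asserted `titles` field is a hypothetical illustration.
def test_wsp_records_have_titles():
    records = get_records('world_scientific/sample_ws_record.xml')
    for record in records:
        assert 'titles' in record
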
def results_from_json():
    """Return results by parsing a JSON file."""
    from scrapy.http import TextResponse

    spider = aps_spider.APSSpider()
    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'aps/aps_single_response.json',
                response_type=TextResponse,
            )
        )
    )

    class MockFailure:
        """Mock twisted.python.failure.Failure, failure on JATS request."""
        def __init__(self):
            self.request = parsed_items[0]

    records = [spider._parse_json_on_failure(MockFailure()).record]
    assert records

    return records

def many_results(spider):
    """Return results generator from the arxiv spider.

    Tricky fields, many records.
    """
    def _get_processed_record(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        return crawl_result['record']

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

def results():
    """Return results generator from the arxiv spider.

    All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_record_from_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()