def non_url():
    """Return the first item produced for a T2K node without any links.

    Loads ``t2k/test_1_nourl.html``, selects the nodes matching the
    spider's ``itertag`` and feeds the first one to ``parse_node``.
    With no links to follow, parsing should go straight to
    ``build_item`` and build the HEPRecord.
    """
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1_nourl.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    items = spider.parse_node(response, nodes[0])
    # Builtin next() instead of the Python-2-only ``.next()`` method, so
    # the helper runs unchanged on both Python 2.6+ and Python 3.
    return next(items)
def record():
    """Return results from the T2K spider.

    Parses the first node of ``t2k/test_1.html``, then imitates the
    follow-up splash page: ``t2k/001.html`` is loaded as a fake response
    and the ``date``, ``title``, ``urls`` and ``authors`` entries are
    copied into its ``meta`` from the ``meta`` of the object returned by
    ``parse_node``. The first item yielded by ``scrape_for_pdf`` for that
    response is returned.
    """
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    # Point the spider at the local test responses before parsing.
    spider.domain = "file:///tests/responses/t2k/"
    parsed_node = spider.parse_node(response, nodes[0])

    splash_response = fake_response_from_file('t2k/001.html')
    # Carry over the metadata gathered while parsing the listing node.
    for key in ("date", "title", "urls", "authors"):
        splash_response.meta[key] = parsed_node.meta[key]

    # Builtin next() instead of the Python-2-only ``.next()`` method, so
    # the helper runs unchanged on both Python 2.6+ and Python 3.
    return next(spider.scrape_for_pdf(splash_response))