def test_no_spash_page():
    """Build the record from listing data when the linked page is unreachable.

    The listing row contains a link, so `parse_node` is called first and the
    title and urls are read from the `meta` of whatever it returns. The
    response handed to `scrape_for_pdf` is then marked with status 404 and
    carries only that title and those urls. The resulting record must still
    be a HEPRecord exposing both values.
    """
    spider = magic_spider.MagicSpider()
    # Adjacent literals keep the markup readable without altering its content.
    body = (
        ' <html> <body id="f1d"> '
        '<table class="list" style="margin-left: 20px; width: 920px;"> '
        '<tr class="odd"> <td> '
        '<a href="http://non_reachable_url/">Limits to the violation of...</a> '
        '</td> </tr> </table> </body> </html> '
    )
    response = fake_response_from_string(body)
    node = get_node(spider, spider.itertag, text=body)
    parsed_node = spider.parse_node(response, node)

    # Reuse the listing response as the failed follow-up response: flag it
    # as "not found" and forward the metadata collected from the listing.
    response.status = 404
    for key in ("title", "urls"):
        response.meta[key] = parsed_node.meta[key]

    # Builtin next() works on Python 2.6+ and 3; the `.next()` method is
    # Python 2 only.
    record = next(spider.scrape_for_pdf(response))

    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "urls" in record
    assert "title" in record
    assert record["urls"][0]["value"] == "http://non_reachable_url/"
    assert record["title"] == "Limits to the violation of..."
def test_no_author_no_date_no_url():
    """Parse a listing row that has no author, no date and no url.

    The anchor in the row has no `href`, so there is nothing to follow.
    `parse_node` is expected to yield an item directly (the original test
    notes this goes straight to `build_item`); that item's `record` must be
    a HEPRecord without "date" or "authors" fields.
    """
    spider = magic_spider.MagicSpider()
    # Adjacent literals keep the markup readable without altering its content.
    body = (
        ' <html> <body id="f1d"> '
        '<table class="list" style="margin-left: 20px; width: 920px;"> '
        '<tr class="odd"> '
        '<td><a>Limits to the violation of...</a></td> '
        '</tr> </table> </body> </html> '
    )
    response = fake_response_from_string(body)
    node = get_node(spider, spider.itertag, text=body)

    # Builtin next() works on Python 2.6+ and 3; the `.next()` method is
    # Python 2 only.
    parsed_item = next(spider.parse_node(response, node))
    assert parsed_item
    assert parsed_item.record

    record = parsed_item.record
    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "date" not in record
    assert "authors" not in record
def record():
    """Return the first result of scraping the MAGIC test pages.

    Steps:
      1. Parse the first `itertag` node of the listing page
         ('magic/test_1.html').
      2. Load the splash page ('magic/test_splash.html') and copy the
         "date" and "urls" metadata obtained from the listing onto it.
      3. Return the first item produced by `scrape_for_pdf` for that
         splash response.
    """
    spider = magic_spider.MagicSpider()

    listing_response = fake_response_from_file('magic/test_1.html')
    listing_selector = Selector(listing_response, type='html')
    first_node = listing_selector.xpath('//%s' % spider.itertag)[0]

    # Point the spider at the local test responses before parsing the node.
    spider.domain = "file:///tests/responses/magic/"
    parsed_node = spider.parse_node(listing_response, first_node)

    splash_response = fake_response_from_file('magic/test_splash.html')
    for key in ("date", "urls"):
        splash_response.meta[key] = parsed_node.meta[key]

    # Builtin next() works on Python 2.6+ and 3; the `.next()` method is
    # Python 2 only.
    return next(spider.scrape_for_pdf(splash_response))
def test_no_aff():
    """Check `scrape_for_pdf` on a page that has no author affiliation.

    The page provides only a title and one author paragraph. The result
    must be a HEPRecord with no "date" field and no "affiliations" entry
    among its authors.
    """
    spider = magic_spider.MagicSpider()
    # Adjacent literals keep the markup readable without altering its content.
    body = (
        ' <html> <div id="content"> '
        '<h3 class="pub_title">Limits to the violation of Lorentz...</h3> '
        '<p class="author">Daniel Garrido Terrats</p> '
        '</div> </html> '
    )
    response = fake_response_from_string(body)

    # Builtin next() works on Python 2.6+ and 3; the `.next()` method is
    # Python 2 only.
    record = next(spider.scrape_for_pdf(response))

    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "date" not in record
    # NOTE(review): if record["authors"] is a list of per-author dicts, this
    # membership test compares the string against whole dicts and can never
    # fail. Confirm the item's shape and consider checking each author.
    assert "affiliations" not in record["authors"]