Ejemplo n.º 1
0
def non_url():
    """Return the first result of parsing a T2K node that has no links.

    Loads the canned response ``t2k/test_1_nourl.html``, selects the nodes
    matching the spider's ``itertag`` and feeds the first one to
    ``parse_node``. With no links in the node the spider should go straight
    to ``build_item`` and build the HEPRecord.

    Returns:
        The first element produced by the iterator that ``parse_node``
        returns for this node.
    """
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1_nourl.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)

    # Use the builtin next() rather than the ``.next()`` method: the method
    # only exists on Python 2 iterators, the builtin works on 2.6+ and 3.
    return next(spider.parse_node(response, nodes[0]))
Ejemplo n.º 2
0
def record():
    """Return the first result of the T2K spider for a node with links.

    Parses the first ``itertag`` node of ``t2k/test_1.html`` with the
    spider's domain pointed at the local test responses, then builds a fake
    splash-page response from ``t2k/001.html`` carrying the same metadata
    and passes it to ``scrape_for_pdf``.

    Returns:
        The first element produced by the iterator that ``scrape_for_pdf``
        returns for the splash response.
    """
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    spider.domain = "file:///tests/responses/t2k/"
    parsed_node = spider.parse_node(response, nodes[0])

    splash_response = fake_response_from_file('t2k/001.html')
    # Carry over the metadata that parse_node attached to its result so the
    # splash response looks like the follow-up request's response.
    for key in ("date", "title", "urls", "authors"):
        splash_response.meta[key] = parsed_node.meta[key]

    # Use the builtin next() rather than the ``.next()`` method: the method
    # only exists on Python 2 iterators, the builtin works on 2.6+ and 3.
    return next(spider.scrape_for_pdf(splash_response))