Example #1
0
def test_no_spash_page():
    """Test that when url was found but could not be reached, build the
    record with the available data.

    The listing row is parsed first; the same response is then flagged as a
    404 and handed to `scrape_for_pdf` together with the title and urls taken
    from the parsed node's meta. The record must still carry both values.
    """
    spider = magic_spider.MagicSpider()
    body = """
    <html>
        <body id="f1d">
            <table class="list" style="margin-left: 20px; width: 920px;">
                <tr class="odd">
                    <td>
                    <a href="http://non_reachable_url/">Limits to the violation of...</a>
                    </td>
                </tr>
            </table>
        </body>
    </html>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, spider.itertag, text=body)
    parsed_node = spider.parse_node(response, node)

    # Simulate the unreachable splash page: mark the response as "not found"
    # and pass along the metadata collected from the listing.
    response.status = 404
    response.meta["title"] = parsed_node.meta["title"]
    response.meta["urls"] = parsed_node.meta["urls"]
    # Builtin next() works on Python 2.6+ and 3; `.next()` is Python 2 only.
    record = next(spider.scrape_for_pdf(response))

    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "urls" in record
    assert "title" in record
    assert record["urls"][0]["value"] == "http://non_reachable_url/"
    assert record["title"] == "Limits to the violation of..."
Example #2
0
def test_no_author_no_date_no_url():
    """Parse the node in the listing without author, date, or url. Should
    take straight to `build_item` and build the HEPRecord.

    The anchor in the listing row has no `href`, and the row has no author
    or date cells, so the first item yielded by `parse_node` is expected to
    already wrap a record lacking the "date" and "authors" fields.
    """
    spider = magic_spider.MagicSpider()
    body = """
    <html>
        <body id="f1d">
            <table class="list" style="margin-left: 20px; width: 920px;">
                <tr class="odd">
                    <td><a>Limits to the violation of...</a></td>
                </tr>
            </table>
        </body>
    </html>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, spider.itertag, text=body)
    # Builtin next() works on Python 2.6+ and 3; `.next()` is Python 2 only.
    parsed_item = next(spider.parse_node(response, node))
    assert parsed_item
    assert parsed_item.record
    record = parsed_item.record

    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "date" not in record
    assert "authors" not in record
Example #3
0
def record():
    """Return results from the MAGIC spider. First parse node, then scrape,
    and finally build the record.

    Steps:
      1. Load the listing fixture and parse its first `spider.itertag` node.
      2. Load the splash-page fixture and copy the "date" and "urls" meta
         values from the parsed node onto it.
      3. Return the first item produced by `scrape_for_pdf` for that splash
         response.
    """
    spider = magic_spider.MagicSpider()
    response = fake_response_from_file('magic/test_1.html')
    selector = Selector(response, type='html')
    node = selector.xpath('//%s' % spider.itertag)[0]
    # NOTE(review): presumably points the spider at the local fixtures so
    # links resolve to files instead of the live site -- confirm.
    spider.domain = "file:///tests/responses/magic/"
    parsed_node = spider.parse_node(response, node)

    splash_response = fake_response_from_file('magic/test_splash.html')
    splash_response.meta["date"] = parsed_node.meta["date"]
    splash_response.meta["urls"] = parsed_node.meta["urls"]

    # Builtin next() works on Python 2.6+ and 3; `.next()` is Python 2 only.
    return next(spider.scrape_for_pdf(splash_response))
Example #4
0
def test_no_aff():
    """Test the result of calling `scrape_for_pdf` without author
    affiliation. Should be a HEPRecord.

    The splash page contains only a title and an author paragraph, so the
    record is expected to have no "date" and no affiliation on any author.
    """
    spider = magic_spider.MagicSpider()
    body = """
    <html>
    <div id="content">
        <h3 class="pub_title">Limits to the violation of Lorentz...</h3>
        <p class="author">Daniel Garrido Terrats</p>
    </div>
    </html>
    """
    response = fake_response_from_string(body)
    # Builtin next() works on Python 2.6+ and 3; `.next()` is Python 2 only.
    record = next(spider.scrape_for_pdf(response))

    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "date" not in record
    # Check every author entry rather than the container: a plain
    # `"affiliations" not in record["authors"]` is vacuously true when
    # "authors" is a list of per-author dicts.
    # NOTE(review): assumes "authors" is such a list -- confirm against
    # HEPRecord.
    assert all(
        "affiliations" not in author for author in record["authors"]
    )