コード例 #1
0
    def test_value_scraper(self):
        """Check that one ``.test`` ValueScraper returns the expected text on two pages."""
        first_page = Page('<html><body><p class="test">test</p><p>bla</p></body></html>')
        second_page = Page('<html><body><div></div><p class="test">hallo</p></body></html>')

        # The same scraper instance is reused for both pages.
        scraper = ValueScraper(CssRuleSelector(".test"), TextValueExtractor())

        for page, expected_text in ((first_page, "test"), (second_page, "hallo")):
            assert scraper.get(page) == expected_text
コード例 #2
0
def test_make_matcher_for_samples():
    """Check that the matcher built from two samples selects via ``p.test`` or ``.test``."""
    html_and_value = [
        ('<html><body><p class="test">test</p><p>bla</p></body></html>', "test"),
        ('<html><body><div></div><p class="test">hallo</p></body></html>', "hallo"),
    ]
    samples = [Sample(Page(html), value) for html, value in html_and_value]

    matcher = make_matcher_for_samples(samples)
    # Either rule is accepted by this test.
    accepted_rules = ["p.test", ".test"]
    assert matcher.selector.css_rule in accepted_rules
コード例 #3
0
def test_generate_selector_for_nodes():
    """Check the selectors generated for the root nodes of each sample's first match."""
    html_and_value = [
        ('<html><body><p class="test">test</p><p>bla</p></body></html>', "test"),
        ('<html><body><div></div><p class="test">hallo</p></body></html>', "hallo"),
    ]
    samples = [Sample(Page(html), value) for html, value in html_and_value]

    # Collect the root node of the first match of every sample.
    root_nodes = []
    for sample in samples:
        first_match = sample.get_matches()[0]
        root_nodes.append(first_match.get_root())

    selectors = generate_selector_for_nodes(root_nodes, None)
    # todo .test is also possible
    generated_rules = [selector.css_rule for selector in selectors]
    assert ["p.test"] == generated_rules
コード例 #4
0
    def test_get_matches_list_of_dicts(self):
        """Check that a list-of-dicts item yields a single ListMatch holding two DictMatch entries."""
        expected_items = [
            {"title": "Herr", "name": "Lorey"},
            {"title": "Frau", "name": "Müller"},
        ]
        markup = (
            "<html><body>"
            '<div><p class="title">Herr</p><p class="name">Lorey</p></div> '
            '<div><p class="title">Frau</p><p class="name">Müller</p></div> '
            "</body></html>")
        found = Sample(Page(markup), expected_items).get_matches()

        # exactly one candidate list match is expected
        assert len(found) == 1

        # the single candidate is a list match whose two entries are dict matches
        list_match = found[0]
        assert isinstance(list_match, ListMatch)
        assert len(list_match.matches) == 2
        for entry in list_match.matches:
            assert isinstance(entry, DictMatch)

        # the results are only printed, not asserted
        print(list_match.get_root())
        print(list_match.get_span())
コード例 #5
0
    def test_get_matches_list_basic(self):
        """Check that every match for a flat list item (with a duplicated value) is a ListMatch."""
        values = ["1", "2", "2", "4"]
        list_items = "".join(f"<li>{value}</li>" for value in values)
        page = Page(f"<html><body><ul>{list_items}</ul></body></html>")

        found = Sample(page, values).get_matches()

        # todo check duplicate generation
        # assert len(matches) == 2
        for candidate in found:
            assert isinstance(candidate, ListMatch)
コード例 #6
0
    def test_scrape_matches(self):
        """Check that a DictScraper with one ValueScraper per key reproduces the source item."""
        expected = {"h": "no 1", "t": "the first one"}

        # Render the item into a small page: <h1> carries "h", <p> carries "t".
        fragment = "<div><h1>%(h)s</h1><p>%(t)s</p></div>" % expected
        page = Page(f"<html><body>{fragment}</body></html>")

        extractor = TextValueExtractor()
        scraper_per_key = {
            "h": ValueScraper(CssRuleSelector("h1"), extractor),
            "t": ValueScraper(CssRuleSelector("p"), extractor),
        }
        dict_scraper = DictScraper(scraper_per_key=scraper_per_key)

        assert dict_scraper.get(page) == expected
コード例 #7
0
def stackoverflow_training_set():
    """Build a training set from the stored page ``tests/static/so.html`` and three expected rows.

    Returns whatever ``make_training_set`` produces for the single page/item pair.
    """
    with open("tests/static/so.html") as handle:
        html = handle.read()
    page = Page(html)

    # (user, upvotes, when) for each expected row, in page order.
    rows = [
        ("******", "20", "2011-06-16 19:45:11Z"),
        ("******", "16", "2017-09-06 15:27:16Z"),
        ("******", "0", "2021-01-06 10:50:04Z"),
    ]
    item = [
        {"user": user, "upvotes": upvotes, "when": when}
        for user, upvotes, when in rows
    ]
    return make_training_set([page], [item])
コード例 #8
0
def stackoverflow_samples():
    """Return a one-element list holding a Sample of ``tests/static/so.html`` with three expected rows."""
    with open("tests/static/so.html") as handle:
        html = handle.read()
    page = Page(html)

    # (user, upvotes, when) for each expected row, in page order.
    rows = [
        ("******", "20", "2011-06-16 19:45:11Z"),
        ("******", "16", "2017-09-06 15:27:16Z"),
        ("******", "0", "2021-01-06 10:50:04Z"),
    ]
    item = [
        {"user": user, "upvotes": upvotes, "when": when}
        for user, upvotes, when in rows
    ]
    return [Sample(page, item)]
コード例 #9
0
def main():
    """Train a scraper on the stored page ``tests/static/so.html`` and print it."""
    with open("tests/static/so.html") as handle:
        html = handle.read()
    page = Page(html)

    # (user, upvotes, when) for each expected row, in page order.
    rows = [
        ("******", "20", "2011-06-16 19:45:11Z"),
        ("******", "16", "2017-09-06 15:27:16Z"),
        ("******", "0", "2021-01-06 10:50:04Z"),
    ]
    item = [
        {"user": user, "upvotes": upvotes, "when": when}
        for user, upvotes, when in rows
    ]

    training_set = make_training_set([page], [item])
    trained = train_scraper(training_set.item)
    print(trained)
コード例 #10
0
 def select_one(self, page: Page):
     """Return the first result of ``page.select`` for this object's ``css_rule``.

     The result is indexed with ``[0]`` without a prior emptiness check, so an
     empty result propagates the indexing error (``IndexError`` for a list).
     """
     selected = page.select(self.css_rule)
     return selected[0]
コード例 #11
0
def test_generate_css_selectors_for_samples():
    """Check that the first generated matcher for the vote counts ends with ``.js-vote-count``."""
    with open("tests/static/so.html") as handle:
        html = handle.read()
    samples = [Sample(Page(html), ["20", "16", "0"])]

    matchers = generate_matchers_for_samples(samples=samples)
    first = next(matchers)
    # NOTE(review): ``endswith`` is called on the yielded object itself, so this
    # only works if the generator yields strings (or objects exposing
    # ``endswith``) -- confirm against generate_matchers_for_samples.
    assert first.endswith(".js-vote-count")
コード例 #12
0
 def test_make_training_set(self):
     """Smoke test: building a training set from two empty pages and two flat dicts must not raise."""
     pages = [Page("") for _ in range(2)]
     first_item = {"a": "1", "b": "2"}
     second_item = {"a": "3", "b": "4"}
     make_training_set(pages, [first_item, second_item])
コード例 #13
0
 def test_get_matches_dict_basic(self):
     """Check that a dict item whose "2010" value occurs in two nodes yields two DictMatch candidates."""
     markup = "<html><body><h1>test</h1><p>2010</p><div class='footer'>2010</div></body></html>"
     item = {"h": "test", "year": "2010"}
     sample = Sample(Page(markup), item)

     found = sample.get_matches()
     assert len(found) == 2
     for candidate in found:
         assert isinstance(candidate, DictMatch)
コード例 #14
0
 def test_make_training_set_error(self):
     """Check that items with differing structure make ``make_training_set`` raise ItemStructureException."""
     pages = [Page("") for _ in range(2)]
     string_valued = {"a": "1", "b": "2"}
     # "b" is a list here but a string in the first item.
     list_valued = {"a": "3", "b": []}
     items = [string_valued, list_valued]
     with pytest.raises(ItemStructureException):
         make_training_set(pages, items)
コード例 #15
0
 def test_find_all(self):
     """Check that ``find_all`` returns a non-empty result for a user path in the stored page."""
     with open("tests/static/so.html") as handle:
         html = handle.read()
     found = Page(html).find_all("/users/624900/jterrace")
     assert found
コード例 #16
0
 def test_something(self):
     """Check that ``.answer .js-vote-count`` selects the three vote counts in page order."""
     with open("tests/static/so.html") as handle:
         html = handle.read()
     vote_nodes = Page(html).select(".answer .js-vote-count")
     vote_texts = [node.text for node in vote_nodes]
     assert vote_texts == ["20", "16", "0"]