Esempio n. 1
0
def test_simple_candidate_set():
    """Score two sibling divs and verify the ranking of the candidates.

    After sorting the scored candidates by ``content_score`` (highest
    first), the first four are expected to be: the "content" div, <body>,
    <html> and finally the "footer" div.
    """
    markup = """
        <html>
        <body>
            <div class="content">
                <p>This is a great amount of info</p>
                <p>And more content <a href="/index">Home</a>
            </div>
            <div class="footer">
                <p>This is a footer</p>
                <p>And more content <a href="/index">Home</a>
            </div>
        </body>
        </html>
    """
    document = document_fromstring(markup)
    scored = score_candidates(document.findall(".//div"))

    # Rank the candidates from the highest content score to the lowest.
    ranked = list(scored.values())
    ranked.sort(key=attrgetter("content_score"), reverse=True)

    winner = ranked[0].node
    assert winner.tag == "div"
    assert winner.attrib["class"] == "content"

    assert ranked[1].node.tag == "body"
    assert ranked[2].node.tag == "html"

    fourth = ranked[3].node
    assert fourth.tag == "div"
    assert fourth.attrib["class"] == "footer"
Esempio n. 2
0
    def test_simple_candidate_set(self):
        """Score two sibling divs and verify the ranking of the candidates.

        After sorting the scored candidates by ``content_score`` (highest
        first), the first four are expected to be: the "content" div,
        <body>, <html> and finally the "footer" div.
        """
        markup = """
            <html>
            <body>
                <div class="content">
                    <p>This is a great amount of info</p>
                    <p>And more content <a href="/index">Home</a>
                </div>
                <div class="footer">
                    <p>This is a footer</p>
                    <p>And more content <a href="/index">Home</a>
                </div>
            </body>
            </html>
        """
        document = document_fromstring(markup)
        scored = score_candidates(document.findall(".//div"))

        # Rank the candidates from the highest content score to the lowest.
        ranked = list(scored.values())
        ranked.sort(key=attrgetter("content_score"), reverse=True)

        winner = ranked[0].node
        self.assertEqual(winner.tag, "div")
        self.assertEqual(winner.attrib["class"], "content")

        self.assertEqual(ranked[1].node.tag, "body")
        self.assertEqual(ranked[2].node.tag, "html")

        fourth = ranked[3].node
        self.assertEqual(fourth.tag, "div")
        self.assertEqual(fourth.attrib["class"], "footer")
Esempio n. 3
0
    def test_simple_candidate_set(self):
        """Score two sibling divs and check which candidates rank highest.

        Both ``<div>`` elements are passed to ``score_candidates``. Once the
        returned candidates are sorted by ``content_score`` (highest first),
        ``<body>`` is expected on top, followed by ``<html>``.
        """
        doc = """
            <html>
            <body>
                <div class="content">
                    <p>This is a great amount of info</p>
                    <p>And more content <a href="/index">Home</a>
                </div>
                <div class="footer">
                    <p>This is a footer</p>
                    <p>And more content <a href="/index">Home</a>
                </div>
            </body>
            </html>
        """
        d_elem = document_fromstring(doc)
        divs = d_elem.findall(".//div")
        f_elem = divs[0]
        s_elem = divs[1]

        res = score_candidates([f_elem, s_elem])
        # sorted() accepts any iterable, so no intermediate list is needed.
        ordered = sorted(res.values(),
                         key=attrgetter('content_score'),
                         reverse=True)

        # assertEqual (instead of assertTrue on a comparison) reports the
        # actual tag in the failure message.

        # the body element should have the highest score
        self.assertEqual(ordered[0].node.tag, 'body')

        # the html element, being the outermost one, should come in second
        self.assertEqual(ordered[1].node.tag, 'html')
Esempio n. 4
0
    def test_simple_candidate_set(self):
        """Score two sibling divs and check which candidates rank highest.

        Both ``<div>`` elements are passed to ``score_candidates``. Once the
        returned candidates are sorted by ``content_score`` (highest first),
        ``<body>`` is expected on top, followed by ``<html>``.
        """
        doc = """
            <html>
            <body>
                <div class="content">
                    <p>This is a great amount of info</p>
                    <p>And more content <a href="/index">Home</a>
                </div>
                <div class="footer">
                    <p>This is a footer</p>
                    <p>And more content <a href="/index">Home</a>
                </div>
            </body>
            </html>
        """
        d_elem = document_fromstring(doc)
        divs = d_elem.findall(".//div")
        f_elem = divs[0]
        s_elem = divs[1]

        res = score_candidates([f_elem, s_elem])
        # sorted() accepts any iterable, so no intermediate list is needed.
        ordered = sorted(res.values(),
                         key=attrgetter('content_score'),
                         reverse=True)

        # assertEqual (instead of assertTrue on a comparison) reports the
        # actual tag in the failure message.

        # the body element should have the highest score
        self.assertEqual(ordered[0].node.tag, 'body')

        # the html element, being the outermost one, should come in second
        self.assertEqual(ordered[1].node.tag, 'html')
Esempio n. 5
0
def find_candidates(doc):
    """Find candidate nodes for the readable version of the article.

    Every node yielded by ``doc.iter()`` is sorted into one of two buckets:
    nodes flagged by ``is_unlikely_node`` are collected for removal, and the
    remaining nodes whose tag is in ``SCORABLE_TAGS`` are collected for
    scoring. This function only collects the unlikely nodes; it does not
    detach them from ``doc`` itself.

    Returns a 2-tuple ``(candidates, should_remove)``: ``candidates`` is the
    result of ``score_candidates`` on the scorable nodes, and
    ``should_remove`` is the list of nodes flagged as unlikely.

    """
    nodes_to_score = []
    should_remove = []

    for node in doc.iter():
        if is_unlikely_node(node):
            # Pass the node as a logging argument so it is only rendered to
            # text when DEBUG output is actually enabled.
            LOG.debug("We should drop unlikely: %s", node)
            should_remove.append(node)
        elif node.tag in SCORABLE_TAGS and node not in nodes_to_score:
            # The membership check keeps a node from being scored twice.
            nodes_to_score.append(node)

    return score_candidates(nodes_to_score), should_remove
Esempio n. 6
0
def find_candidates(doc):
    """Find candidate nodes for the readable version of the article.

    Every node yielded by ``doc.iter()`` is sorted into one of two buckets:
    nodes flagged by ``is_unlikely_node``, and ``<a>`` nodes flagged by
    ``is_bad_link``, are collected for removal; the remaining nodes whose tag
    is in ``SCORABLE_TAGS`` are collected for scoring. This function only
    collects the nodes to drop; it does not detach them from ``doc`` itself.

    Returns a 2-tuple ``(candidates, should_remove)``: ``candidates`` is the
    result of ``score_candidates`` on the scorable nodes, and
    ``should_remove`` is the list of nodes flagged for removal.

    """
    nodes_to_score = []
    should_remove = []

    for node in doc.iter():
        # The node is passed as a logging argument so it is only rendered to
        # text when DEBUG output is actually enabled.
        if is_unlikely_node(node):
            LOG.debug('We should drop unlikely: %s', node)
            should_remove.append(node)
        elif node.tag == 'a' and is_bad_link(node):
            LOG.debug('We should drop bad link: %s', node)
            should_remove.append(node)
        elif node.tag in SCORABLE_TAGS and node not in nodes_to_score:
            # The membership check keeps a node from being scored twice.
            nodes_to_score.append(node)

    return score_candidates(nodes_to_score), should_remove