Beispiel #1
0
    def test_bonus_score_per_100_chars_in_p(self):
        """Nodes get 1pt per 100 characters up to 3 max points.

        Builds a small document holding a single <p> of a given length,
        scores its paragraph nodes and compares the best candidate score
        obtained for 50, 100 and 400 characters.
        """
        def build_doc(length):
            """Return the <p> nodes of a document whose paragraph has `length` chars."""
            div = '<div id="content" class=""><p>{0}</p></div>'
            document_str = '<html><body>{0}</body></html>'
            test_div = div.format('c' * length)
            doc = document_fromstring(document_str.format(test_div))
            # iter() filters by tag in document order and replaces the
            # deprecated getiterator() + manual tag check.
            return list(doc.iter('p'))

        def top_score(length):
            """Return the highest content_score among the scored candidates."""
            candidates = score_candidates(build_doc(length))
            return max(c.content_score for c in candidates.values())

        pscore_400 = top_score(400)
        pscore_100 = top_score(100)
        pscore_50 = top_score(50)

        self.assertEqual(pscore_100, pscore_50 + 1)
        self.assertEqual(pscore_400, pscore_50 + 3)
Beispiel #2
0
def test_bonus_score_per_100_chars_in_p():
    """A paragraph earns 1 point per 100 characters, up to max. 3 points."""
    def top_score(length):
        """Score a lone <p> of `length` characters; return the best content score."""
        paragraph = fragment_fromstring("<p>%s</p>" % ("c" * length))
        scored = score_candidates([paragraph])
        return max(item.content_score for item in scored.values())

    # Evaluated in the same order as the lengths are listed.
    pscore_50, pscore_100, pscore_300, pscore_400 = [
        top_score(size) for size in (50, 100, 300, 400)
    ]

    # Half a point between 50 and 100 chars, two more points up to 300,
    # and nothing extra beyond that.
    assert pscore_100 == pscore_50 + 0.5
    assert pscore_300 == pscore_100 + 2.0
    assert pscore_400 == pscore_300
Beispiel #3
0
def test_bonus_score_per_100_chars_in_p():
    """A paragraph earns 1 point per 100 characters, up to max. 3 points."""
    def top_score(length):
        """Score a lone <p> of `length` characters; return the best content score."""
        paragraph = fragment_fromstring("<p>%s</p>" % ("c" * length))
        scored = score_candidates([paragraph])
        return max(item.content_score for item in scored.values())

    # Evaluated in the same order as the lengths are listed.
    pscore_50, pscore_100, pscore_300, pscore_400 = [
        top_score(size) for size in (50, 100, 300, 400)
    ]

    # Half a point between 50 and 100 chars, two more points up to 300,
    # and nothing extra beyond that.
    assert pscore_100 == pscore_50 + 0.5
    assert pscore_300 == pscore_100 + 2.0
    assert pscore_400 == pscore_300
Beispiel #4
0
    def test_bonus_score_per_100_chars_in_p(self):
        """Nodes get 1pt per 100 characters up to 3 max points.

        Builds a small document holding a single <p> of a given length,
        scores its paragraph nodes and compares the best candidate score
        obtained for 50, 100 and 400 characters.
        """
        def build_doc(length):
            """Return the <p> nodes of a document whose paragraph has `length` chars."""
            div = '<div id="content" class=""><p>{0}</p></div>'
            document_str = '<html><body>{0}</body></html>'
            test_div = div.format('c' * length)
            doc = document_fromstring(document_str.format(test_div))
            # iter() filters by tag in document order and replaces the
            # deprecated getiterator() + manual tag check.
            return list(doc.iter('p'))

        def top_score(length):
            """Return the highest content_score among the scored candidates."""
            candidates = score_candidates(build_doc(length))
            return max(c.content_score for c in candidates.values())

        pscore_400 = top_score(400)
        pscore_100 = top_score(100)
        pscore_50 = top_score(50)

        self.assertEqual(pscore_100, pscore_50 + 1)
        self.assertEqual(pscore_400, pscore_50 + 3)
Beispiel #5
0
def test_we_get_candidates():
    """Scoring the nodes of a real article should hand back candidates to try."""
    article = document_fromstring(load_article("ars.001.html"))
    scored = score_candidates(tuple(article.iter("p", "td", "pre")))

    # The exact count is tied to the current algorithm; if it moves,
    # whatever was tweaked deserves a second look.
    assert len(scored) == 37

    # At least one candidate is expected to score well.
    top_score = max(item.content_score for item in scored.values())
    assert top_score > 100
Beispiel #6
0
def test_we_get_candidates():
    """Scoring the nodes of a real article should hand back candidates to try."""
    article = document_fromstring(load_article("ars.001.html"))
    scored = score_candidates(tuple(article.iter("p", "td", "pre")))

    # The exact count is tied to the current algorithm; if it moves,
    # whatever was tweaked deserves a second look.
    assert len(scored) == 37

    # At least one candidate is expected to score well.
    top_score = max(item.content_score for item in scored.values())
    assert top_score > 100
Beispiel #7
0
    def test_we_get_candidates(self):
        """Processing candidates should get us a list of nodes to try out."""
        # we'll start out using our first real test document
        doc = document_fromstring(load_article('ars/ars.001.html'))
        # iter() with several tags yields the matching elements in document
        # order, replacing the deprecated getiterator() walk and manual filter.
        test_nodes = list(doc.iter('p', 'td', 'pre'))

        candidates = score_candidates(test_nodes)

        # this might change as we tweak our algorithm, but if it does change,
        # it signifies we need to look at what we changed.
        self.assertEqual(len(candidates), 6)

        # one of these should have a decent score
        best_score = max(c.content_score for c in candidates.values())
        self.assertTrue(best_score > 100)
Beispiel #8
0
    def test_we_get_candidates(self):
        """Processing candidates should get us a list of nodes to try out."""
        # we'll start out using our first real test document
        doc = document_fromstring(load_article('ars/ars.001.html'))
        # iter() with several tags yields the matching elements in document
        # order, replacing the deprecated getiterator() walk and manual filter.
        test_nodes = list(doc.iter('p', 'td', 'pre'))

        candidates = score_candidates(test_nodes)

        # this might change as we tweak our algorithm, but if it does change,
        # it signifies we need to look at what we changed.
        self.assertEqual(len(candidates), 6)

        # one of these should have a decent score
        best_score = max(c.content_score for c in candidates.values())
        self.assertTrue(best_score > 100)