def test_bonus_score_per_100_chars_in_p(self):
    """Nodes get 1pt per 100 characters up to 3 max points.

    A 100-character paragraph must outscore a 50-character one by exactly 1,
    and a 400-character paragraph must outscore it by exactly 3.
    """

    def build_doc(length):
        """Return the <p> nodes of a document whose paragraph has `length` chars."""
        div = '<div id="content" class=""><p>{0}</p></div>'
        document_str = '<html><body>{0}</body></html>'
        content = 'c' * length
        test_div = div.format(content)
        doc = document_fromstring(document_str.format(test_div))
        # iter() with a tag name replaces the deprecated getiterator() call
        # and the manual `node.tag == 'p'` filtering loop.
        return list(doc.iter('p'))

    def best_score(length):
        """Score the generated paragraph nodes and return the highest score."""
        candidates = score_candidates(build_doc(length))
        return max(c.content_score for c in candidates.values())

    # Same evaluation order as before: 400, then 100, then 50 characters.
    pscore_400 = best_score(400)
    pscore_100 = best_score(100)
    pscore_50 = best_score(50)

    self.assertEqual(pscore_100, pscore_50 + 1)
    self.assertEqual(pscore_400, pscore_50 + 3)
def test_bonus_score_per_100_chars_in_p():
    """Paragraph score grows with text length and stops growing past 300 chars."""

    def top_score(char_count):
        """Score a lone <p> holding `char_count` characters; return the best score."""
        paragraph = fragment_fromstring("<p>%s</p>" % ("c" * char_count))
        scored = score_candidates([paragraph])
        return max(entry.content_score for entry in scored.values())

    # One scoring pass per length, evaluated in ascending order of length.
    best = {size: top_score(size) for size in (50, 100, 300, 400)}

    # 50 -> 100 chars is worth half a point, 100 -> 300 chars two more points.
    assert best[50] + 0.5 == best[100]
    assert best[100] + 2.0 == best[300]
    # Anything beyond 300 chars earns no further bonus.
    assert best[300] == best[400]
def test_we_get_candidates():
    """Scoring the nodes of a real article should yield candidates to try out."""
    article_html = load_article("ars.001.html")
    article = document_fromstring(article_html)
    scorable_nodes = tuple(article.iter("p", "td", "pre"))

    scored = score_candidates(scorable_nodes)

    # The expected count may shift as the algorithm gets tweaked; when it
    # does, that is the cue to review whatever was changed.
    candidate_count = len(scored.keys())
    assert candidate_count == 37

    # At least one candidate is expected to reach a decent score.
    ordered_scores = sorted(entry.content_score for entry in scored.values())
    highest_score = ordered_scores[-1]
    assert highest_score > 100
def test_we_get_candidates(self):
    """Processing candidates should get us a list of nodes to try out."""
    # we'll start out using our first real test document
    doc = document_fromstring(load_article('ars/ars.001.html'))
    # iter() with tag names replaces the deprecated getiterator() call and
    # the manual `node.tag in [...]` filtering loop.
    test_nodes = list(doc.iter('p', 'td', 'pre'))

    candidates = score_candidates(test_nodes)

    # this might change as we tweak our algorithm, but if it does change,
    # it signifies we need to look at what we changed.
    self.assertEqual(len(candidates.keys()), 6)

    # one of these should have a decent score; assertGreater reports the
    # actual top score on failure instead of a bare "False is not true".
    scores = sorted(c.content_score for c in candidates.values())
    self.assertGreater(scores[-1], 100)