def test_full_align(self):
    """Align three records without passing a seed and check the mappings.

    Every mapping returned by ``RecordAligner.align`` must map each of the
    seed's ``root``/``a``/``b``/``c`` elements to an element with the same tag.
    """
    aligner = RecordAligner()
    seed, mappings = aligner.align([Record(t1), Record(t2), Record(t3)])
    self.assertEqual(3, len(mappings))

    # Look up each tag in the seed, then collect the tag it is mapped to in
    # every one of the three mappings.
    for tag in ['root', 'a', 'b', 'c']:
        seed_element = seed[0].xpath('//%s' % tag)[0]
        matched_tags = [
            mapping[seed_element].tag
            for _record, mapping in mappings.items()
        ]
        self.assertEqual([tag] * 3, matched_tags)
def test_extract_with_seed2(self):
    """Extract records from ``fragment1`` using a seed taken from ``htmlpage1``.

    The seed record is built from two elements of the first candidate. The
    test checks the shape of the returned seed copy, the number of extracted
    records, and that the extracted dates/descriptions come back in the
    expected order.
    """
    mdr = MDR()
    page1 = get_page('htmlpage1')
    candidates, doc = mdr.list_candidates(page1, 'utf8')

    seed_record = Record(candidates[0][1], candidates[0][2])
    fragment1 = fragment_fromstring(get_page('fragment1'))
    seed_record_copy, mappings = mdr.extract(fragment1, seed_record)

    # The seed record spans two elements; the second one is the review block.
    self.assertEqual(2, len(seed_record_copy))
    self.assertEqual('hreview', seed_record_copy[1].attrib.get('class'))

    # 27 items (records)
    self.assertEqual(27, len(mappings))

    extracted_dates = []
    extracted_texts = []
    for record, mapping in mappings.items():
        for k, v in mapping.items():
            # k is the seed-side element, v the element it is mapped to.
            css_class = k.attrib.get('class')
            if css_class == 'dtreviewed':
                extracted_dates.append(v.text)
            elif css_class == 'description':
                extracted_texts.append(v.text)

    # Extracted items are expected to be sorted in their original order.
    self.assertEqual(extracted_dates[0], '27-05-2014')
    self.assertEqual(extracted_dates[-1], '07-07-2013')
    self.assertEqual(extracted_texts[0], 'Kwaliteit van het eten matig')
    self.assertEqual(
        extracted_texts[-1],
        'Paviljoen Strand 90 te Domburg is een uiterst sfeervol restaurant. De inrichting is smaakvol met mooie kleuren. De bediening is vriendelijk en behulpzaam. Het eten was lekker. Kortom, we zullen er zeker terug komen.'
    )
def test_extract_with_seed(self):
    """Extract records from ``fragment0`` using a seed taken from ``htmlpage0``.

    Checks the size of the returned seed copy, the number of extracted
    records, and the first and last extracted ``datePublished`` values.
    """
    mdr = MDR()
    page = get_page('htmlpage0')
    candidates, doc = mdr.list_candidates(page, 'utf8')

    # We know the first element can be used as the seed.
    seed_record = Record(candidates[0][0])
    fragment = fragment_fromstring(get_page('fragment0'))
    seed_record_copy, mappings = mdr.extract(fragment, seed_record)

    # The record only has 1 <li> element.
    self.assertEqual(1, len(seed_record_copy))

    # 40 items (records)
    self.assertEqual(40, len(mappings))

    extracted_dates = []
    for record, mapping in mappings.items():
        for k, v in mapping.items():
            # k is the seed-side element, v the element it is mapped to;
            # the date is read from the mapped element's ``content`` attribute.
            if k.attrib.get('itemprop') == 'datePublished':
                extracted_dates.append(v.attrib.get('content'))

    self.assertEqual(extracted_dates[0], '2014-07-02')
    self.assertEqual(extracted_dates[-1], '2014-05-18')
def test_align_with_record(self):
    """Align three records against an explicitly supplied seed record.

    The returned mappings are expected to hold four entries. Skipping the
    entry keyed by the seed record itself, each remaining mapping must map
    the seed's ``root``/``a``/``b``/``c`` elements to same-tag elements.
    """
    aligner = RecordAligner()
    seed_record = Record(t4)
    seed, mappings = aligner.align(
        [Record(t1), Record(t2), Record(t3)], seed_record)
    self.assertEqual(4, len(mappings))

    # All the elements from the seed should be matched in the other 3 trees.
    for tag in ['root', 'a', 'b', 'c']:
        seed_element = seed[0].xpath('//%s' % tag)[0]
        matched_tags = [
            mapping[seed_element].tag
            for record, mapping in mappings.items()
            if not (seed_record == record)  # leave out the seed's own entry
        ]
        self.assertEqual([tag] * 3, matched_tags)