def test_selected_data(self):
    # Selecting with best_match('text to annotate') is expected to return
    # indexes whose selected_data() is the markup of the two paragraphs below.
    maker = TemplateMaker(self.PAGE)
    expected = [
        u'<p>Some text to annotate here</p>',
        u'<p>Another text to annotate there</p>',
    ]
    selected = []
    for index in maker.select(best_match('text to annotate')):
        selected.append(maker.selected_data(index))
    self.assertEqual(selected, expected)
def test_annotations(self):
    # After annotating with best_match=False, annotations() is expected to
    # report two entries whose first element names 'field1' as the content.
    maker = TemplateMaker(self.PAGE)
    maker.annotate('field1', best_match('text to annotate'), best_match=False)
    found = []
    for entry in maker.annotations():
        # Only the first element of each entry is compared.
        found.append(entry[0])
    expected = [
        {u'annotations': {u'content': u'field1'}},
        {u'annotations': {u'content': u'field1'}},
    ]
    self.assertEqual(found, expected)
def test_annotate_multiple(self):
    # A template annotated with best_match=False is expected to extract both
    # matching texts into a single 'field1' list.
    maker = TemplateMaker(self.PAGE)
    maker.annotate('field1', best_match('text to annotate'), best_match=False)
    template = maker.get_template()
    # The extractor is built from (template, None) pairs.
    extractor = InstanceBasedLearningExtractor([(template, None)])
    extracted = extractor.extract(self.PAGE)[0]
    expected = [{
        u'field1': [
            u'Some text to annotate here',
            u'Another text to annotate there',
        ],
    }]
    self.assertEqual(extracted, expected)
def test_annotate_ignore_unpaired(self):
    # The extracted 'field1' value is expected to keep the unpaired <img />
    # tag embedded in the annotated text.
    maker = TemplateMaker(self.PAGE)
    maker.annotate('field1', best_match("and that's"), best_match=False)
    template = maker.get_template()
    extractor = InstanceBasedLearningExtractor([(template, None)])
    extracted = extractor.extract(self.PAGE)[0]
    expected = [
        {u'field1': [u"More text with unpaired tag <img />and that's it"]},
    ]
    self.assertEqual(extracted, expected)
def test_annotations(self):
    # With best_match=False the page is expected to end up with two
    # annotations, each carrying "field1" as its content.
    template_maker = TemplateMaker(self.PAGE)
    template_maker.annotate(
        "field1", best_match("text to annotate"), best_match=False
    )
    first_items = []
    for item in template_maker.annotations():
        # Each annotations() item is indexable; only element 0 is checked.
        first_items.append(item[0])
    self.assertEqual(
        first_items,
        [
            {u"annotations": {u"content": u"field1"}},
            {u"annotations": {u"content": u"field1"}},
        ],
    )
def test_annotate_multiple(self):
    # Extraction with a template annotated via best_match=False is expected
    # to return both matching texts under "field1".
    template_maker = TemplateMaker(self.PAGE)
    template_maker.annotate(
        "field1", best_match("text to annotate"), best_match=False
    )
    # This variant hands the bare template (not a tuple) to the extractor.
    templates = [template_maker.get_template()]
    extractor = InstanceBasedLearningExtractor(templates)
    first_result = extractor.extract(self.PAGE)[0]
    wanted_texts = [
        u"Some text to annotate here",
        u"Another text to annotate there",
    ]
    self.assertEqual(first_result, [{u"field1": wanted_texts}])
def do_t(self, line):
    """ts <template> <text> - test selection text"""
    # NOTE(review): the help text above says "ts" while the method is named
    # do_t -- confirm which command name is intended.
    # The first space separates the template id from the selection criteria;
    # a line without a space makes this unpacking raise ValueError.
    template_id, raw_criteria = line.split(' ', 1)
    template = self._load_template(template_id)
    parsed = parse_criteria(raw_criteria)
    maker = TemplateMaker(template)
    matches = apply_criteria(parsed, maker)
    for position, fragment_index in enumerate(matches):
        fragment = remove_annotation(maker.selected_data(fragment_index))
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and the print() function.
        print("[%d] %r" % (position, fragment))
def do_al(self, template_id):
    """al <template> - list annotations"""
    # A truthy result from assert_or_print aborts the command.
    if assert_or_print(template_id, "missing template id"):
        return
    template = self._load_template(template_id)
    maker = TemplateMaker(template)
    # Each annotations() item unpacks into (annotation, fragment index).
    for position, (annotation, fragment_index) in enumerate(maker.annotations()):
        field_name = annotation['annotations']['content']
        fragment = remove_annotation(maker.selected_data(fragment_index))
        # Single-argument print(...) is equivalent to the print statement
        # under Python 2.
        print("[%s-%d] (%s) %r" % (template_id, position, field_name, fragment))
def do_t(self, line):
    """t <template> <text> - test selection text"""
    # Everything before the first space is the template id, the rest is the
    # criteria text; a line with no space raises ValueError on unpacking.
    template_id, criteria_text = line.split(' ', 1)
    loaded = self._load_template(template_id)
    parsed_criteria = self._parse_criteria(criteria_text)
    template_maker = TemplateMaker(loaded)
    hits = apply_criteria(parsed_criteria, template_maker)
    for ordinal, hit in enumerate(hits):
        shown = remove_annotation(template_maker.selected_data(hit))
        # Parenthesised single-argument print: same output on Python 2.
        print("[%d] %r" % (ordinal, shown))
def train_from_htmlpage(self, htmlpage, data):
    """Annotate ``htmlpage`` with the values in ``data`` and add the template.

    ``data`` maps a field name to either one value or an iterable of values.
    Each value is converted with ``str_to_unicode`` using the page encoding
    and annotated via ``best_match``; the resulting template is passed to
    ``self.add_template``.

    Raises AssertionError when ``data`` is falsy.
    """
    assert data, "Cannot train with empty data"
    maker = TemplateMaker(htmlpage)
    for field, values in data.items():
        # bytes/str values and anything without __iter__ count as one value.
        is_single = (
            isinstance(values, (bytes, str))
            or not hasattr(values, '__iter__')
        )
        candidates = [values] if is_single else values
        for raw_value in candidates:
            text = str_to_unicode(raw_value, htmlpage.encoding)
            maker.annotate(field, best_match(text))
    self.add_template(maker.get_template())
def train_from_htmlpage(self, htmlpage, data):
    """Annotate ``htmlpage`` with the values in ``data`` and add the template.

    ``data`` maps a field name to a value or an iterable of values. ``str``
    values are decoded with ``htmlpage.encoding`` (falling back to 'utf-8')
    before being annotated via ``best_match``. The finished template is
    handed to ``self.add_template``.

    Raises AssertionError when ``data`` is falsy.
    """
    assert data, "Cannot train with empty data"
    maker = TemplateMaker(htmlpage)
    for field, values in data.items():
        # Objects without __iter__ are wrapped so the inner loop sees a list.
        # NOTE(review): str.decode below suggests Python 2, where str has no
        # __iter__ -- confirm before running this on Python 3.
        candidates = values if hasattr(values, '__iter__') else [values]
        for candidate in candidates:
            if isinstance(candidate, str):
                candidate = candidate.decode(htmlpage.encoding or 'utf-8')
            maker.annotate(field, best_match(candidate))
    self.add_template(maker.get_template())
def train(self, url=None, data=None, html=None, encoding='utf-8'):
    """Annotate a page with the values in ``data`` and store the template.

    The page comes from ``self._get_page(url, encoding, html)``. ``data``
    maps a field name to a value or an iterable of values; ``str`` values
    are decoded with ``encoding`` before being annotated via ``best_match``.
    The finished template is appended to ``self.templates``.

    Raises AssertionError when ``data`` is falsy.
    """
    assert data, "Cannot train with empty data"
    page = self._get_page(url, encoding, html)
    maker = TemplateMaker(page)
    for field, values in data.items():
        # Objects without __iter__ are treated as a single value.
        candidates = values if hasattr(values, '__iter__') else [values]
        for candidate in candidates:
            if isinstance(candidate, str):
                candidate = candidate.decode(encoding)
            maker.annotate(field, best_match(candidate))
    self.templates.append(maker.get_template())
def do_annotate(self, line):
    """annotate <template_id> <data> [-n number] [-f field]- add or test annotation (aliases: a, t)

    Add a new annotation (if -f is passed) or test what would be annotated
    otherwise
    """
    # The docstring above is printed as usage text below, so its wording is
    # kept as-is.
    if ' ' not in line:
        print(
            "You must provide a valid template identifier (check output of ls_templates)"
        )
        print(IblTool.do_annotate.__doc__)
        return
    template_id, criteria = line.split(' ', 1)
    t = self._load_template(template_id)
    if not t:
        # A falsy template ends the command silently.
        return
    criteria = self._parse_criteria(criteria)
    tm = TemplateMaker(t)
    selection = apply_criteria(criteria, tm)

    if not criteria.field:
        # Test mode: list every fragment the criteria select.
        for n, i in enumerate(selection):
            print("[%d] %r" % (n, remove_annotation(tm.selected_data(i))))
        return

    if not selection:
        # Nothing matched, so there is nothing to annotate.
        return

    # The previous loop reset its variable to selection[0] on every pass, so
    # it only ever targeted the first match but repeated the annotate, save
    # and print once per match. Annotate the first match exactly once.
    index = selection[0]
    tm.annotate_fragment(index, criteria.field)
    self._save_template(template_id, tm.get_template())
    print("[new] (%s) %r" % (criteria.field, remove_annotation(tm.selected_data(index))))
def do_annotate(self, line):
    """annotate <template_id> <data> [-n number] [-f field]- add or test annotation (aliases: a, t)

    Add a new annotation (if -f is passed) or test what would be annotated
    otherwise
    """
    # This docstring doubles as the usage text printed below; keep its
    # wording unchanged.
    if line.find(' ') < 0:
        print("You must provide a valid template identifier (check output of ls_templates)")
        print(IblTool.do_annotate.__doc__)
        return
    template_id, criteria = line.split(' ', 1)
    t = self._load_template(template_id)
    if not t:
        # A falsy template ends the command without output.
        return
    criteria = self._parse_criteria(criteria)
    tm = TemplateMaker(t)
    selection = apply_criteria(criteria, tm)

    if criteria.field:
        # The original loop overwrote its loop variable with selection[0] on
        # each iteration, repeating the annotate/save/print on the same first
        # fragment once per match. Handle that first fragment a single time;
        # an empty selection still does nothing.
        if selection:
            index = selection[0]
            tm.annotate_fragment(index, criteria.field)
            self._save_template(template_id, tm.get_template())
            print("[new] (%s) %r" % (criteria.field, remove_annotation(tm.selected_data(index))))
    else:
        # Without a field this is a dry run: show every selected fragment.
        for n, i in enumerate(selection):
            print("[%d] %r" % (n, remove_annotation(tm.selected_data(i))))
def annotate(url, site_id, items):
    """Interactively annotate the page at ``url`` and save the templates.

    ``items`` is iterated as (attribute name, text) pairs. For each pair the
    text is matched against the newest TemplateMaker and the candidates are
    printed. A single candidate is taken directly; otherwise the row is read
    from the prompt, falling back to the first candidate when the answer is
    not an integer. The row is annotated on the first TemplateMaker that
    accepts it, or on a fresh one when none does.

    The templates are written with ``save_templates('scraper.json', ...)``
    and returned as a list.
    """
    page = url_to_page(url)
    makers = [TemplateMaker(page)]
    for name, text in items:
        newest = makers[-1]
        candidates = newest.select(best_match(text))
        # Single-argument print(...) matches the Python 2 print statement.
        print('ATTRIBUTE: %s' % name)
        for candidate in candidates:
            print(u'[%d] %s' % (candidate, newest.selected_data(candidate)))

        if len(candidates) == 1:
            row = candidates[0]
        else:
            answer = raw_input('? ')
            try:
                row = int(answer)
            except ValueError:
                # Non-numeric answer: fall back to the first candidate.
                row = candidates[0]
        print('SELECTED: %d' % row)
        print('')

        for maker in makers:
            try:
                if maker.annotate_fragment(row, name):
                    break
            except FragmentAlreadyAnnotated:
                # This maker already holds an annotation there; try the next.
                continue
        else:
            # No existing maker accepted the annotation: start a new template.
            makers.append(TemplateMaker(page))
            makers[-1].annotate_fragment(row, name)

    save_templates('scraper.json', site_id,
                   (maker.get_template() for maker in makers))
    return [maker.get_template() for maker in makers]
def do_a(self, line):
    """a <template> <data> [-n number] [-f field]- add or test annotation

    Add a new annotation (if -f is passed) or test what would be annotated
    otherwise
    """
    # The first space separates the template id from the criteria text; a
    # line without a space raises ValueError on unpacking.
    template_id, criteria = line.split(' ', 1)
    t = self._load_template(template_id)
    criteria = self._parse_criteria(criteria)
    tm = TemplateMaker(t)
    selection = apply_criteria(criteria, tm)

    if not criteria.field:
        # Test mode: list every fragment the criteria select.
        for n, i in enumerate(selection):
            print("[%d] %r" % (n, remove_annotation(tm.selected_data(i))))
        return

    if not selection:
        # Nothing matched, so there is nothing to annotate.
        return

    # The previous loop reset its variable to selection[0] on every pass, so
    # it only ever targeted the first match but repeated the annotate, save
    # and print once per match. Annotate the first match exactly once.
    index = selection[0]
    tm.annotate_fragment(index, criteria.field)
    self._save_template(template_id, tm.get_template())
    # Single-argument print(...) matches the Python 2 print statement.
    print("[new] (%s) %r" % (
        criteria.field, remove_annotation(tm.selected_data(index))))
def do_a(self, line):
    """a <template> <data> [-n number] [-f field]- add or test annotation

    Add a new annotation (if -f is passed) or test what would be annotated
    otherwise
    """
    # Text before the first space is the template id, the rest the criteria;
    # a line without a space raises ValueError on unpacking.
    template_id, criteria = line.split(' ', 1)
    t = self._load_template(template_id)
    criteria = parse_criteria(criteria)
    tm = TemplateMaker(t)
    selection = apply_criteria(criteria, tm)

    if criteria.field:
        # The original loop overwrote its loop variable with selection[0] on
        # each iteration, repeating the annotate/save/print on the same first
        # fragment once per match. Handle that first fragment a single time;
        # an empty selection still does nothing.
        if selection:
            index = selection[0]
            tm.annotate_fragment(index, criteria.field)
            self._save_template(template_id, tm.get_template())
            # Single-argument print(...) matches the Python 2 statement form.
            print("[new] (%s) %r" % (criteria.field, remove_annotation(tm.selected_data(index))))
    else:
        # Without a field this is a dry run: show every selected fragment.
        for n, i in enumerate(selection):
            print("[%d] %r" % (n, remove_annotation(tm.selected_data(i))))
def test_annotate_fragment_not_found(self):
    # Annotating 'field1' with the text "missing text" is expected to raise
    # FragmentNotFound.
    maker = TemplateMaker(self.PAGE)
    scorer = best_match("missing text")
    self.assertRaises(FragmentNotFound, maker.annotate, 'field1', scorer)
def _load_annotations(self, template_id):
    """Return the first element of every annotation entry of a template.

    The template is obtained from ``self._load_template(template_id)`` and
    wrapped in a TemplateMaker, whose ``annotations()`` are read.
    """
    maker = TemplateMaker(self._load_template(template_id))
    first_elements = []
    for entry in maker.annotations():
        first_elements.append(entry[0])
    return first_elements
def _load_annotations(self, template_id):
    """Return the first element of every annotation entry of a template.

    Returns None when ``self._load_template(template_id)`` gives a falsy
    result; otherwise the template is wrapped in a TemplateMaker and its
    ``annotations()`` are read.
    """
    template = self._load_template(template_id)
    if template:
        maker = TemplateMaker(template)
        return [entry[0] for entry in maker.annotations()]
    # Falsy template: nothing to report.
    return None
def test_annotate_fragment_already_annotated(self):
    # Repeating the same annotation on one TemplateMaker is expected to raise
    # FragmentAlreadyAnnotated.
    maker = TemplateMaker(self.PAGE)
    first_scorer = best_match('text to annotate')
    maker.annotate('field1', first_scorer)
    # A separate scorer is built for the second attempt, as before.
    second_scorer = best_match("text to annotate")
    self.assertRaises(
        FragmentAlreadyAnnotated,
        maker.annotate,
        'field1',
        second_scorer,
    )
def test_selected_data(self):
    # selected_data() for every index returned by select() is expected to be
    # the markup of the two matching paragraphs.
    template_maker = TemplateMaker(self.PAGE)
    hits = template_maker.select(best_match("text to annotate"))
    fragments = []
    for hit in hits:
        fragments.append(template_maker.selected_data(hit))
    self.assertEqual(
        fragments,
        [
            u"<p>Some text to annotate here</p>",
            u"<p>Another text to annotate there</p>",
        ],
    )
def test_annotate_fragment_already_annotated(self):
    # A second annotate() call with the same field and text is expected to
    # raise FragmentAlreadyAnnotated.
    template_maker = TemplateMaker(self.PAGE)
    template_maker.annotate("field1", best_match("text to annotate"))
    # Build the arguments for the repeated call; a new scorer is created
    # rather than reusing the first one.
    repeat_args = ("field1", best_match("text to annotate"))
    self.assertRaises(
        FragmentAlreadyAnnotated, template_maker.annotate, *repeat_args
    )