def demo2(train_test, out='demo'): train_urls, test_url = download(train_test) train = pe.PageSequence([hp.url_to_page(url) for url in train_urls]) test = pe.PageSequence([hp.url_to_page(test_url)]) fit_result = pe.fit_model(train) for fr in fit_result: write_table(fr.items, '{0}-train-{1}.html'.format(out, fr.model.W)) write_table(pe.fit_result_extract(fr.model, fr.code_book, fr.extractors, test), '{0}-test-{1}.html'.format(out, fr.model.W)) annotate(fr, train, '{0}-train-annotated-{1}.html'.format(out, fr.model.W)) annotate(fr, test, '{0}-test-annotated-{1}.html'.format(out, fr.model.W)) print fr.model.W, fr.logP, pe.items_score(fr.items), fr.model.motif_entropy return train, fit_result
def do_ta(self, line): """ta <url> [--encoding ENCODING] - add template""" opts, (url,) = parse_at(line) t = url_to_page(url, opts.encoding) templates = self._load_templates() templates.append(t) self._save_templates(templates) print "[%d] %s" % (len(templates) - 1, t.url)
def do_ta(self, line): """ta <url> [--encoding ENCODING] - add template""" opts, (url, ) = parse_at(line) t = url_to_page(url, opts.encoding) templates = self._load_templates() templates.append(t) self._save_templates(templates) print "[%d] %s" % (len(templates) - 1, t.url)
def test_load_page_from_url(self):
    """Loading a sample via a file:// URL matches the stored JSON parse."""
    filepath = os.path.join(BASE_PATH, 'samples/samples_htmlpage_0')
    url = 'file://{}.{}'.format(filepath, 'html')
    page = url_to_page(url)
    # Use a context manager so the fixture file is closed deterministically;
    # json.load(open(...)) leaves the handle open until GC.
    with open('{}.{}'.format(filepath, 'json')) as f:
        parsed = json.load(f)
    parsed = [_decode_element(d) for d in parsed]
    self.assertEqual(page.url, url)
    self._test_sample(page.body, parsed, 1)
def test_load_page_from_url(self):
    """url_to_page on a file:// URL yields the same parse as the JSON fixture."""
    filepath = os.path.join(BASE_PATH, 'samples/samples_htmlpage_0')
    url = 'file://{}.{}'.format(filepath, 'html')
    page = url_to_page(url)
    # Close the fixture file deterministically instead of leaking the
    # handle from json.load(open(...)).
    with open('{}.{}'.format(filepath, 'json')) as fixture:
        parsed = json.load(fixture)
    parsed = [_decode_element(d) for d in parsed]
    self.assertEqual(page.url, url)
    self._test_sample(page.body, parsed, 1)
def do_s(self, url):
    """s <url> - scrape url"""
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    # No encoding given on the command line, so fall back to the first
    # template's encoding when decoding the page.
    page = url_to_page(url, default_encoding=templates[0].encoding)
    extractor = InstanceBasedLearningExtractor(
        (template, None) for template in templates)
    items = extractor.extract(page)[0]
    pprint.pprint(items)
def demo2(): page = hp.url_to_page('https://news.ycombinator.com/') meme = MEME() s = HTMLSequence(page) meme.fit(s, 4, 50) for x in meme.find_motif(s, 0): print 80*'-' print page.body[ s.tags[x.index].start:s.tags[x.index + meme.motif_width(0)].end]
def do_scrape(self, url):
    """scrape <url> - scrape url (alias: s)"""
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    # fall back to the template encoding if none is specified
    page = url_to_page(url, default_encoding=templates[0].encoding)
    pairs = ((template, None) for template in templates)
    extractor = InstanceBasedLearningExtractor(pairs)
    extracted = extractor.extract(page)
    pprint.pprint(extracted[0])
def do_ta(self, line): """ta <url> [--encoding ENCODING --useragent 'User-Agent'] - add template""" opts, (url,) = parse_at_s(line) headers = { 'User-Agent' : opts.useragent or self.user_agent } url = urllib2.Request(url, headers=headers) t = url_to_page(url, opts.encoding) templates = self._load_templates() templates.append(t) self._save_templates(templates) print "[%d] %s" % (len(templates) - 1, t.url)
def do_add_template(self, line):
    """add_template <url> [--encoding ENCODING] - (alias: ta)"""
    # An URL argument is mandatory; show usage and bail out otherwise.
    if not line:
        print("You must provide an URL")
        print(IblTool.do_add_template.__doc__)
        return
    opts, (url, ) = parse_at(line)
    template = url_to_page(self.fix_url(url), opts.encoding)
    templates = self._load_templates()
    templates.append(template)
    self._save_templates(templates)
    print("[%d] %s" % (len(templates) - 1, template.url))
def do_add_template(self, line):
    """add_template <url> [--encoding ENCODING] - (alias: ta)"""
    if not line:
        # no argument given: print usage taken from this command's help
        print("You must provide an URL")
        print(IblTool.do_add_template.__doc__)
        return
    opts, (url,) = parse_at(line)
    page = url_to_page(self.fix_url(url), opts.encoding)
    saved = self._load_templates()
    saved.append(page)
    self._save_templates(saved)
    print("[%d] %s" % (len(saved) - 1, page.url))
def do_s(self, line):
    """s <url> [--encoding ENCODING --useragent 'User-Agent'] - scrape url"""
    templates = self._load_templates()
    if assert_or_print(templates, "no templates available"):
        return
    opts, (url,) = parse_at_s(line)
    # Send a custom User-Agent header with the request.
    headers = {'User-Agent': opts.useragent or self.user_agent}
    request = urllib2.Request(url, headers=headers)
    # fall back to the template encoding if none is specified
    page = url_to_page(request, opts.encoding, templates[0].encoding)
    extractor = InstanceBasedLearningExtractor(
        (template, None) for template in templates)
    pprint.pprint(extractor.extract(page)[0])
def annotate(url, site_id, items): t = url_to_page(url) tms = [TemplateMaker(t)] for n, s in items: func = best_match(s) sel = tms[-1].select(func) print 'ATTRIBUTE: %s' % n for i in sel: print u'[%d] %s' % (i, tms[-1].selected_data(i)) if len(sel) == 1: row = sel[0] else: row = raw_input('? ') try: row = int(row) except ValueError: row = sel[0] #row = int(raw_input('? ')) #rows.pop(0) print 'SELECTED: %d' % row print '' annotated = False for tm in tms: try: if tm.annotate_fragment(row, n): annotated = True break except FragmentAlreadyAnnotated: pass if not annotated: tms.append(TemplateMaker(t)) tms[-1].annotate_fragment(row, n) save_templates('scraper.json', site_id, (tm.get_template() for tm in tms)) return [tm.get_template() for tm in tms]
def annotate(url, site_id, items): t = url_to_page(url) tms = [TemplateMaker(t)] for n, s in items: func = best_match(s) sel = tms[-1].select(func) print 'ATTRIBUTE: %s' % n for i in sel: print u'[%d] %s' % (i, tms[-1].selected_data(i)) if len(sel) == 1: row = sel[0] else: row = raw_input('? ') try: row = int(row) except ValueError: row = sel[0] #row = int(raw_input('? ')) #rows.pop(0) print 'SELECTED: %d' % row print '' annotated = False for tm in tms: try: if tm.annotate_fragment(row, n): annotated = True break except FragmentAlreadyAnnotated: pass if not annotated: tms.append(TemplateMaker(t)) tms[-1].annotate_fragment(row, n) save_templates('scraper.json', site_id, (tm.get_template() for tm in tms)) return [tm.get_template() for tm in tms]
def scrape(self, url, encoding=None):
    """Download *url* (optionally decoded with *encoding*) and scrape it."""
    return self.scrape_page(url_to_page(url, encoding))
def train(self, url, data, encoding=None):
    """Download *url* (optionally decoded with *encoding*) and train on it with *data*."""
    self.train_from_htmlpage(url_to_page(url, encoding), data)
'http://www.rc-chem.eu/produkty/3-mmc-crystal',
'http://www.rc-chem.eu/produkty/4-fa-crystal',
'http://www.rc-chem.eu/produkty/dimethylone',
'http://www.rc-chem.eu/produkty/ethylphenidate',
'http://www.rc-chem.eu/produkty/mpa',
'http://www.rc-chem.eu/produkty/neb',
'http://www.rc-chem.eu/produkty/pentedrone-velky-crystal',
'http://www.rc-chem.eu/produkty/thio-crystal',
'http://www.rc-chem.eu/produkty/thio-velky-crystal',
'http://mefedronprodej.webnode.cz/produkty-1/',
)
# Scrape every product URL with the extractor `ex` (defined outside this
# fragment) and flatten the extracted attributes into a `values` dict.
for u in urls: #('file:///home/pborky/tmp/test/test1.html', 'file:///home/pborky/tmp/test/test2.html', 'file:///home/pborky/tmp/test/test3.html', 'file:///home/pborky/tmp/test/test4.html'):
    page = url_to_page(u)
    extract = {}
    e = ex.extract(page)
    # e[0] holds the extracted item dicts; it is None when nothing matched
    if e[0] is None:
        print 'FAILED to extract from %s.' %u
        continue
    for ee in e[0]:
        extract.update(ee)
    values = { 'URL_PROD': u, }
    # each extracted value is indexed with [0] -- presumably a list of
    # matches; only the first match is kept (TODO confirm)
    for k,v in extract.iteritems():
        values[k] = v[0]
    name = values.get('NAME_PROD')
    chem = values.get('CHEM_NAME_PROD')
'http://www.rc-chem.eu/produkty/2-fma',
'http://www.rc-chem.eu/produkty/3-fmc',
'http://www.rc-chem.eu/produkty/3-mmc-crystal',
'http://www.rc-chem.eu/produkty/4-fa-crystal',
'http://www.rc-chem.eu/produkty/dimethylone',
'http://www.rc-chem.eu/produkty/ethylphenidate',
'http://www.rc-chem.eu/produkty/mpa',
'http://www.rc-chem.eu/produkty/neb',
'http://www.rc-chem.eu/produkty/pentedrone-velky-crystal',
'http://www.rc-chem.eu/produkty/thio-crystal',
'http://www.rc-chem.eu/produkty/thio-velky-crystal',
'http://mefedronprodej.webnode.cz/produkty-1/',
)
# Iterate over the product pages, run the extractor `ex` (defined outside
# this fragment) and merge all extracted item dicts per URL.
for u in urls: #('file:///home/pborky/tmp/test/test1.html', 'file:///home/pborky/tmp/test/test2.html', 'file:///home/pborky/tmp/test/test3.html', 'file:///home/pborky/tmp/test/test4.html'):
    page = url_to_page(u)
    extract = {}
    e = ex.extract(page)
    # extraction result is None when no template matched this page
    if e[0] is None:
        print 'FAILED to extract from %s.' % u
        continue
    for ee in e[0]:
        extract.update(ee)
    values = { 'URL_PROD': u, }
    # values are indexed with [0] -- presumably lists of matches; only the
    # first one is retained (TODO confirm)
    for k, v in extract.iteritems():
        values[k] = v[0]
    name = values.get('NAME_PROD')
    chem = values.get('CHEM_NAME_PROD')
def scrape(self, url, encoding=None):
    """Fetch the page at *url* and run the scraper over it."""
    page = url_to_page(url, encoding)
    result = self.scrape_page(page)
    return result
def train(self, url, data, encoding=None):
    """Fetch the page at *url* and train the extractor on it using *data*."""
    page = url_to_page(url, encoding)
    self.train_from_htmlpage(page, data)
# NOTE(review): fragment of an HTML-annotation routine -- `write`, `label`,
# `fragment`, `page` and `out` are bound by enclosing code not visible here.
# Emits "<label>|<escaped fragment text>" lines, then closes the document.
write(u'{0:3d}|{1}'.format(
    label,
    cgi.escape(page.body[fragment.start:fragment.end].strip())))
out.write('</span>\n')
out.write(""" </pre> </body> </html>""")

if __name__ == '__main__':
    url = sys.argv[1]
    # trailing comma: Python 2 print stays on the same line until 'done'
    print 'Downloading URL...',
    t1 = time.clock()
    page = hp.url_to_page(url)
    print 'done ({0}s)'.format(time.clock() - t1)
    print 'Extracting items...',
    t1 = time.clock()
    ie = aile.kernel.ItemExtract(aile.ptree.PageTree(page))
    print 'done ({0}s)'.format(time.clock() - t1)
    print 'Annotating HTML'
    # start with every parsed-body element unlabeled (-1), then mark the
    # fragments of the first extracted table with their column index
    labels = np.repeat(-1, len(ie.page_tree.page.parsed_body))
    items, cells = ie.table_fragments[0]
    for i in range(cells.shape[0]):
        for j in range(cells.shape[1]):
            labels[cells[i, j]] = j
    annotate(ie.page_tree.page, labels)
# NOTE(review): fragment of a tree-drawing routine -- `ptree`, `T`, `root`,
# `labels` and `color_map` are bound by enclosing code not visible here.
# Attach every node to its parent (or to the synthetic root), color nodes
# by their cluster label, and display the tree with ete2.
for i, p in enumerate(ptree.parents):
    if p > 0:
        T[p].add_child(T[i])
    else:
        # nodes whose parent index is not positive hang off the root
        root.add_child(T[i])
cmap = color_map(max(labels) + 2)
for t, l in zip(T, labels):
    ns = ete2.NodeStyle()
    ns['bgcolor'] = cmap[l]
    t.set_style(ns)
    if not t.is_leaf():
        # show the node's name above the branch for internal nodes
        t.add_face(ete2.TextFace(t.name), column=0, position='branch-top')
root.show()

if __name__ == '__main__':
    url = sys.argv[1]
    # trailing comma: Python 2 print stays on the same line until 'done'
    print 'Downloading URL...',
    t1 = time.clock()
    page = hp.url_to_page(url)
    print 'done ({0}s)'.format(time.clock() - t1)
    print 'Extracting items...',
    t1 = time.clock()
    ie = aile.kernel.ItemExtract(aile.ptree.PageTree(page))
    print 'done ({0}s)'.format(time.clock() - t1)
    print 'Drawing HTML tree'
    draw_tree(ie.page_tree, ie.labels)