Beispiel #1
0
def demo2(train_test, out='demo'):
    train_urls, test_url = download(train_test)
    train = pe.PageSequence([hp.url_to_page(url) for url in train_urls])
    test = pe.PageSequence([hp.url_to_page(test_url)])
    fit_result = pe.fit_model(train)
    for fr in fit_result:
        write_table(fr.items, '{0}-train-{1}.html'.format(out, fr.model.W))
        write_table(pe.fit_result_extract(fr.model, fr.code_book, fr.extractors, test),
                    '{0}-test-{1}.html'.format(out, fr.model.W))
        annotate(fr, train, '{0}-train-annotated-{1}.html'.format(out, fr.model.W))
        annotate(fr, test, '{0}-test-annotated-{1}.html'.format(out, fr.model.W))
        print fr.model.W, fr.logP, pe.items_score(fr.items), fr.model.motif_entropy

    return train, fit_result
Beispiel #2
0
 def do_ta(self, line):
     """ta <url> [--encoding ENCODING] - add template"""
     opts, (url,) = parse_at(line)
     t = url_to_page(url, opts.encoding)
     templates = self._load_templates()
     templates.append(t)
     self._save_templates(templates)
     print "[%d] %s" % (len(templates) - 1, t.url)
Beispiel #3
0
 def do_ta(self, line):
     """ta <url> [--encoding ENCODING] - add template"""
     opts, (url, ) = parse_at(line)
     t = url_to_page(url, opts.encoding)
     templates = self._load_templates()
     templates.append(t)
     self._save_templates(templates)
     print "[%d] %s" % (len(templates) - 1, t.url)
Beispiel #4
0
 def test_load_page_from_url(self):
     """Loading a sample page through a ``file://`` URL must yield the
     expected parsed elements and preserve the URL on the page object."""
     filepath = os.path.join(BASE_PATH, 'samples/samples_htmlpage_0')
     url = 'file://{}.{}'.format(filepath, 'html')
     page = url_to_page(url)
     # Close the JSON fixture deterministically instead of leaking the
     # file handle until garbage collection.
     with open('{}.{}'.format(filepath, 'json')) as fp:
         parsed = json.load(fp)
     parsed = [_decode_element(d) for d in parsed]
     self.assertEqual(page.url, url)
     self._test_sample(page.body, parsed, 1)
Beispiel #5
0
 def test_load_page_from_url(self):
     """Round-trip check: a sample HTML file loaded via ``file://`` URL
     parses into the expected element list and keeps its URL."""
     filepath = os.path.join(BASE_PATH, 'samples/samples_htmlpage_0')
     url = 'file://{}.{}'.format(filepath, 'html')
     page = url_to_page(url)
     # Use a context manager so the fixture file is closed promptly
     # rather than left to the garbage collector.
     with open('{}.{}'.format(filepath, 'json')) as fp:
         parsed = json.load(fp)
     parsed = [_decode_element(d) for d in parsed]
     self.assertEqual(page.url, url)
     self._test_sample(page.body, parsed, 1)
Beispiel #6
0
 def do_s(self, url):
     """s <url> - scrape url"""
     available = self._load_templates()
     if assert_or_print(available, "no templates available"):
         return
     # If no encoding was specified, fall back to the first template's.
     page = url_to_page(url, default_encoding=available[0].encoding)
     extractor = InstanceBasedLearningExtractor((t, None) for t in available)
     pprint.pprint(extractor.extract(page)[0])
Beispiel #7
0
def demo2():
    page = hp.url_to_page('https://news.ycombinator.com/')
    meme = MEME()
    s = HTMLSequence(page)
    meme.fit(s, 4, 50)
    for x in meme.find_motif(s, 0):
        print 80*'-'
        print page.body[
            s.tags[x.index].start:s.tags[x.index + meme.motif_width(0)].end]
Beispiel #8
0
 def do_scrape(self, url):
     """scrape <url> - scrape url (alias: s)"""
     known = self._load_templates()
     if assert_or_print(known, "no templates available"):
         return
     # Default to the first template's encoding when none is specified.
     page = url_to_page(url, default_encoding=known[0].encoding)
     ibl = InstanceBasedLearningExtractor((t, None) for t in known)
     pprint.pprint(ibl.extract(page)[0])
Beispiel #9
0
 def do_ta(self, line):
     """ta <url> [--encoding ENCODING --useragent 'User-Agent'] - add template"""
     opts, (url,) = parse_at_s(line)
     headers = { 'User-Agent' : opts.useragent or self.user_agent }
     url = urllib2.Request(url, headers=headers)
     t = url_to_page(url, opts.encoding)
     templates = self._load_templates()
     templates.append(t)
     self._save_templates(templates)
     print "[%d] %s" % (len(templates) - 1, t.url)
Beispiel #10
0
 def do_add_template(self, line):
     """add_template <url> [--encoding ENCODING] - (alias: ta)"""
     # Require an argument; print usage otherwise.
     if not line:
         print("You must provide an URL")
         print(IblTool.do_add_template.__doc__)
         return
     options, (raw_url,) = parse_at(line)
     template = url_to_page(self.fix_url(raw_url), options.encoding)
     known = self._load_templates()
     known.append(template)
     self._save_templates(known)
     print("[%d] %s" % (len(known) - 1, template.url))
Beispiel #11
0
 def do_add_template(self, line):
     """add_template <url> [--encoding ENCODING] - (alias: ta)"""
     if not line:
         # No argument given: show the expected usage and bail out.
         print("You must provide an URL")
         print(IblTool.do_add_template.__doc__)
         return
     parsed, (addr,) = parse_at(line)
     page = url_to_page(self.fix_url(addr), parsed.encoding)
     collection = self._load_templates()
     collection.append(page)
     self._save_templates(collection)
     print("[%d] %s" % (len(collection) - 1, page.url))
Beispiel #12
0
 def do_s(self, line):
     """s <url> [--encoding ENCODING --useragent 'User-Agent'] - scrape url"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     options, (target,) = parse_at_s(line)
     # Send a custom User-Agent with the request.
     agent = options.useragent or self.user_agent
     request = urllib2.Request(target, headers={'User-Agent': agent})
     # Fall back to the first template's encoding when none is given.
     page = url_to_page(request, options.encoding, templates[0].encoding)
     extractor = InstanceBasedLearningExtractor((t, None) for t in templates)
     pprint.pprint(extractor.extract(page)[0])
Beispiel #13
0
def annotate(url, site_id, items):
    """Interactively build annotated templates for *url*.

    For every ``(attribute_name, sample_value)`` pair in *items*, the best
    matching page fragments are listed; when there is more than one
    candidate the user is asked to pick one.  A new TemplateMaker is
    started whenever the chosen fragment is already annotated in every
    existing template.  The resulting templates are saved under *site_id*
    and also returned as a list.
    """
    t = url_to_page(url)
    tms = [TemplateMaker(t)]

    for n, s in items:

        # Candidate fragment indices best matching the sample value.
        func = best_match(s)
        sel = tms[-1].select(func)
        print 'ATTRIBUTE: %s' % n
        for i in sel:
            print u'[%d] %s' % (i, tms[-1].selected_data(i))
        if len(sel) == 1:
            row = sel[0]
        else:
            # Ask the user to disambiguate; fall back to the first
            # candidate on non-numeric input.
            row = raw_input('? ')
            try:
                row = int(row)
            except ValueError:
                row = sel[0]
                #row = int(raw_input('? ')) #rows.pop(0)
        print 'SELECTED: %d' % row
        print ''
        annotated = False
        # Try to annotate the fragment in an existing template; if it is
        # already annotated everywhere, start a fresh template for it.
        for tm in tms:
            try:
                if tm.annotate_fragment(row, n):
                    annotated = True
                    break
            except FragmentAlreadyAnnotated:
                pass
        if not annotated:
            tms.append(TemplateMaker(t))
            tms[-1].annotate_fragment(row, n)

    save_templates('scraper.json', site_id, (tm.get_template() for tm in tms))
    return [tm.get_template() for tm in tms]
Beispiel #14
0
def annotate(url, site_id, items):
    """Interactively annotate the page at *url* with the given *items*.

    Each entry of *items* is an ``(attribute_name, sample_value)`` pair.
    The user resolves ambiguous matches from the console; fragments that
    are already annotated in all current templates trigger creation of an
    additional TemplateMaker.  Templates are persisted to
    ``scraper.json`` under *site_id* and returned.
    """
    t = url_to_page(url)
    tms = [TemplateMaker(t)]

    for n, s in items:

        # Select the fragments that best match this sample value.
        func = best_match(s)
        sel = tms[-1].select(func)
        print 'ATTRIBUTE: %s' % n
        for i in sel:
            print u'[%d] %s' % (i, tms[-1].selected_data(i))
        if len(sel) == 1:
            row = sel[0]
        else:
            # More than one candidate: let the user choose, defaulting to
            # the first match when the input is not an integer.
            row = raw_input('? ')
            try:
                row = int(row)
            except ValueError:
                row = sel[0]
                #row = int(raw_input('? ')) #rows.pop(0)
        print 'SELECTED: %d' % row
        print ''
        annotated = False
        # Annotate in the first template that still has this fragment
        # free; otherwise open a new template below.
        for tm in tms:
            try:
                if tm.annotate_fragment(row, n):
                    annotated = True
                    break
            except FragmentAlreadyAnnotated:
                pass
        if not annotated:
            tms.append(TemplateMaker(t))
            tms[-1].annotate_fragment(row, n)

    save_templates('scraper.json', site_id, (tm.get_template() for tm in tms))
    return [tm.get_template() for tm in tms]
Beispiel #15
0
 def scrape(self, url, encoding=None):
     """Download *url* (optionally decoded with *encoding*) and scrape it."""
     return self.scrape_page(url_to_page(url, encoding))
Beispiel #16
0
 def train(self, url, data, encoding=None):
     """Download *url* and train on the resulting page with *data*."""
     htmlpage = url_to_page(url, encoding)
     self.train_from_htmlpage(htmlpage, data)
Beispiel #17
0
    # NOTE(review): tail of a `urls` tuple whose opening parenthesis lies
    # above this excerpt — product pages to scrape.
    'http://www.rc-chem.eu/produkty/3-mmc-crystal',
    'http://www.rc-chem.eu/produkty/4-fa-crystal',
    'http://www.rc-chem.eu/produkty/dimethylone',
    'http://www.rc-chem.eu/produkty/ethylphenidate',
    'http://www.rc-chem.eu/produkty/mpa',
    'http://www.rc-chem.eu/produkty/neb',
    'http://www.rc-chem.eu/produkty/pentedrone-velky-crystal',
    'http://www.rc-chem.eu/produkty/thio-crystal',
    'http://www.rc-chem.eu/produkty/thio-velky-crystal',
    'http://mefedronprodej.webnode.cz/produkty-1/',
)



# Scrape every product page with the extractor `ex` built earlier
# (definition not visible in this excerpt).
for u in urls: #('file:///home/pborky/tmp/test/test1.html', 'file:///home/pborky/tmp/test/test2.html', 'file:///home/pborky/tmp/test/test3.html', 'file:///home/pborky/tmp/test/test4.html'):
    page = url_to_page(u)
    extract = {}  # attribute name -> list of extracted values
    e = ex.extract(page)
    if e[0] is None:
        print 'FAILED to extract from %s.' %u
        continue
    # Merge all extracted item dicts into one mapping.
    for ee in e[0]:
        extract.update(ee)
    values = {
        'URL_PROD': u,
    }
    # Keep only the first extracted value for each attribute.
    for k,v in extract.iteritems():
        values[k] = v[0]

    name = values.get('NAME_PROD')
    chem = values.get('CHEM_NAME_PROD')
Beispiel #18
0
    # NOTE(review): tail of a `urls` tuple opened above this excerpt —
    # the product pages to be scraped.
    'http://www.rc-chem.eu/produkty/2-fma',
    'http://www.rc-chem.eu/produkty/3-fmc',
    'http://www.rc-chem.eu/produkty/3-mmc-crystal',
    'http://www.rc-chem.eu/produkty/4-fa-crystal',
    'http://www.rc-chem.eu/produkty/dimethylone',
    'http://www.rc-chem.eu/produkty/ethylphenidate',
    'http://www.rc-chem.eu/produkty/mpa',
    'http://www.rc-chem.eu/produkty/neb',
    'http://www.rc-chem.eu/produkty/pentedrone-velky-crystal',
    'http://www.rc-chem.eu/produkty/thio-crystal',
    'http://www.rc-chem.eu/produkty/thio-velky-crystal',
    'http://mefedronprodej.webnode.cz/produkty-1/',
)

# Run the previously-trained extractor `ex` over each product page.
for u in urls:  #('file:///home/pborky/tmp/test/test1.html', 'file:///home/pborky/tmp/test/test2.html', 'file:///home/pborky/tmp/test/test3.html', 'file:///home/pborky/tmp/test/test4.html'):
    page = url_to_page(u)
    extract = {}  # attribute name -> list of extracted values
    e = ex.extract(page)
    if e[0] is None:
        print 'FAILED to extract from %s.' % u
        continue
    # Collapse the list of extracted item dicts into a single mapping.
    for ee in e[0]:
        extract.update(ee)
    values = {
        'URL_PROD': u,
    }
    # Only the first value per attribute is kept.
    for k, v in extract.iteritems():
        values[k] = v[0]

    name = values.get('NAME_PROD')
    chem = values.get('CHEM_NAME_PROD')
Beispiel #19
0
 def scrape(self, url, encoding=None):
     """Fetch *url* and extract items from the resulting page."""
     downloaded = url_to_page(url, encoding)
     return self.scrape_page(downloaded)
Beispiel #20
0
 def train(self, url, data, encoding=None):
     """Fetch *url* and use it together with *data* as training input."""
     self.train_from_htmlpage(url_to_page(url, encoding), data)
Beispiel #21
0
Datei: demo1.py Projekt: 01-/aile
                # NOTE(review): fragment of an HTML-annotation helper whose
                # definition starts above this excerpt.
                write(u'{0:3d}|{1}'.format(
                    label,
                    cgi.escape(page.body[fragment.start:fragment.end].strip())))
            out.write('</span>\n')
        out.write("""
</pre>
</body>
</html>""")


# Command-line entry point: download a page, extract its items and emit
# an annotated rendering of the first table found.
if __name__ == '__main__':
    url = sys.argv[1]

    print 'Downloading URL...',
    t1 = time.clock()
    page = hp.url_to_page(url)
    print 'done ({0}s)'.format(time.clock() - t1)

    print 'Extracting items...',
    t1 = time.clock()
    ie = aile.kernel.ItemExtract(aile.ptree.PageTree(page))
    print 'done ({0}s)'.format(time.clock() - t1)

    print 'Annotating HTML'
    # Label every parsed fragment with its column index inside the first
    # extracted table; -1 marks unlabeled fragments.
    labels = np.repeat(-1, len(ie.page_tree.page.parsed_body))
    items, cells = ie.table_fragments[0]
    for i in range(cells.shape[0]):
        for j in range(cells.shape[1]):
            labels[cells[i, j]] = j
    annotate(ie.page_tree.page, labels)
Beispiel #22
0
    # NOTE(review): tail of a tree-drawing helper whose definition starts
    # above this excerpt. Attach each node to its parent (or to the
    # synthetic root when it has none).
    for i, p in enumerate(ptree.parents):
        if p > 0:
            T[p].add_child(T[i])
        else:
            root.add_child(T[i])
    # Color the nodes by cluster label and show names on internal nodes.
    cmap = color_map(max(labels) + 2)
    for t, l in zip(T, labels):
        ns = ete2.NodeStyle()
        ns['bgcolor'] = cmap[l]
        t.set_style(ns)
        if not t.is_leaf():
            t.add_face(ete2.TextFace(t.name), column=0, position='branch-top')
    root.show()


# Command-line entry point: download a page, extract its items and show
# the colored HTML tree in an interactive window.
if __name__ == '__main__':
    url = sys.argv[1]

    print 'Downloading URL...',
    t1 = time.clock()
    page = hp.url_to_page(url)
    print 'done ({0}s)'.format(time.clock() - t1)

    print 'Extracting items...',
    t1 = time.clock()
    ie = aile.kernel.ItemExtract(aile.ptree.PageTree(page))
    print 'done ({0}s)'.format(time.clock() - t1)

    print 'Drawing HTML tree'
    draw_tree(ie.page_tree, ie.labels)