Example #1
    def test_replace_wrong_encoding(self):
        """Test invalid chars are replaced properly"""
        encoding, body_unicode = html_to_unicode(ct('utf-8'),
                'PREFIX\xe3\xabSUFFIX')
        # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char (u'\ufffd')
        assert u'\ufffd' in body_unicode, repr(body_unicode)
        assert u'PREFIX' in body_unicode, repr(body_unicode)
        assert u'SUFFIX' in body_unicode, repr(body_unicode)

        # Do not destroy html tags due to encoding bugs
        encoding, body_unicode = html_to_unicode(ct('utf-8'),
            '\xf0<span>value</span>')
        assert u'<span>value</span>' in body_unicode, repr(body_unicode)
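
For context on the calls above: html_to_unicode takes a Content-Type header value and the raw response body and returns an (encoding, unicode_body) pair; the ct() used in these tests is presumably a small helper that builds such a header string. A minimal standalone sketch, assuming only that w3lib is installed (the sample body below is illustrative, not taken from the tests):

    from w3lib.encoding import html_to_unicode

    def ct(charset):
        # Rough stand-in for the tests' ct() helper: build a Content-Type value.
        return 'Content-Type: text/html; charset=%s' % charset

    encoding, body_unicode = html_to_unicode(ct('utf-8'), b'<p>caf\xc3\xa9</p>')
    print(encoding)      # expected: 'utf-8'
    print(body_unicode)  # expected: '<p>café</p>'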
Example #2
 def test_gunzip_illegal_eof(self):
     with open(join(SAMPLEDIR, "unexpected-eof.gz"), "rb") as f:
         text = html_to_unicode("charset=cp1252", gunzip(f.read()))[1]
         with open(join(SAMPLEDIR, "unexpected-eof-output.txt"), "rb") as o:
             expected_text = o.read().decode("utf-8")
             self.assertEqual(len(text), len(expected_text))
             self.assertEqual(text, expected_text)
Example #3
 def _assert_encoding(self, content_type, body, expected_encoding,
             expected_unicode):
     encoding, body_unicode = html_to_unicode(ct(content_type), body)
     self.assertTrue(isinstance(body_unicode, unicode))
     self.assertEqual(norm_encoding(encoding),
             norm_encoding(expected_encoding))
     self.assertEqual(body_unicode, expected_unicode)
Example #4
 def test_unicode_body(self):
     unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
     original_string = unicode_string.encode('cp1251')
     encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
     # check body_as_unicode
     self.assertTrue(isinstance(body_unicode, unicode))
     self.assertEqual(body_unicode, unicode_string)
Example #5
    def extract(self, html='', **kwargs):
        """
        Extract data fields from raw HTML or from a URL.
        """
        if not html and 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            _, html = html_to_unicode(info.headers.get('content_type'), info.read())

        builder = DomTreeBuilder(html)
        root = builder.build()

        region_finder = MiningDataRegion(root, self.k, self.threshold)
        regions = region_finder.find_regions(root)

        record_finder = MiningDataRecord(self.threshold)
        field_finder = MiningDataField()

        for region in regions:
            records = record_finder.find_records(region)
            items, _ = field_finder.align_records(records)
            region.items = items
            if 'verbose' in kwargs:
                print region
                for record in records:
                    print '\t', record

        return regions
Example #6
def url_to_page(url, encoding=None, default_encoding='utf-8'):
    """Fetch a URL, using python urllib2, and return an HtmlPage object.

    The `url` may be a string, or a `urllib2.Request` object. The `encoding`
    argument can be used to force the interpretation of the page encoding.

    Redirects are followed, and the `url` property of the returned HtmlPage
    object is the URL of the final page after redirects.

    If the encoding of the page is known, it can be passed as a keyword argument. If
    unspecified, the encoding is guessed using `w3lib.encoding.html_to_unicode`.
    `default_encoding` is used if the encoding cannot be determined.
    """
    fh = urlopen(url)
    info = fh.info()
    body_str = fh.read()
    # guess content encoding if not specified
    if encoding is None:
        try:
            # Python 3.x
            content_type_header = fh.getheader("content-type")
        except AttributeError:
            # Python 2.x
            content_type_header = info.getheader("content-type")
        encoding, body = html_to_unicode(content_type_header, body_str,
                default_encoding=default_encoding)
    else:
        body = body_str.decode(encoding)
    return HtmlPage(fh.geturl(), headers=dict(info.items()), body=body, encoding=encoding)
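
A quick usage sketch for the function above, assuming the scrapely-style HtmlPage it returns exposes the url, encoding and body attributes passed to its constructor (the URL is a placeholder):

    page = url_to_page('http://example.com/')
    print(page.encoding)    # encoding guessed by html_to_unicode (or the forced one)
    print(page.body[:80])   # body has already been decoded to unicode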
Example #7
    def factory(self, data, parser_cls, url):
        charset = 'charset=%s' % 'utf-8'
        data = html_to_unicode(charset, data)[1]
        body = data.encode('utf8') or '<html/>'


        parser = parser_cls(recover=True, encoding='utf8')
        return etree.fromstring(body, parser=parser, base_url=url)
Example #8
 def body_as_unicode(self):
     """Return body as unicode"""
     # check for self.encoding before _cached_ubody just in case
     # _body_inferred_encoding is called
     benc = self.encoding
     if self._cached_ubody is None:
         charset = 'charset=%s' % benc
         self._cached_ubody = html_to_unicode(charset, self.body)[1]
     return self._cached_ubody
Example #9
 def text(self):
     """ Body as unicode """
     # access self.encoding before _cached_ubody to make sure
     # _body_inferred_encoding is called
     benc = self.encoding
     if self._cached_ubody is None:
         charset = 'charset=%s' % benc
         self._cached_ubody = html_to_unicode(charset, self.body)[1]
     return self._cached_ubody
Example #10
 def _body_inferred_encoding(self):
     if self._cached_benc is None:
         content_type = to_native_str(self.headers.get(b'Content-Type', b''))
         benc, ubody = html_to_unicode(content_type, self.body,
                 auto_detect_fun=self._auto_detect_fun,
                 default_encoding=self._DEFAULT_ENCODING)
         self._cached_benc = benc
         self._cached_ubody = ubody
     return self._cached_benc
Example #11
 def body_as_unicode(self):
     """Return body as unicode"""
     from w3lib.encoding import (
         html_to_unicode, resolve_encoding,
         html_body_declared_encoding, http_content_type_encoding)
     # check for self.encoding before _cached_ubody just in case
     # _body_inferred_encoding is called
     benc = self.encoding
     charset = 'charset=%s' % benc
     self._cached_ubody = html_to_unicode(charset, self.content)[1]
     return self._cached_ubody
Example #12
def response2unicode(resp):
    """
    Convert requests.Response body to unicode.
    Unlike ``response.text`` it handles <meta> tags in response content.
    """
    enc, html = html_to_unicode(
        content_type_header=resp.headers.get("Content-Type"),
        html_body_str=resp.content,
        auto_detect_fun=_autodetect_encoding,
    )
    return html
Example #13
    def encoding(self) -> str:
        """The encoding string to be used, extracted from the HTML and
        :class:`HTMLResponse <HTMLResponse>` headers.
        """
        if self._encoding:
            return self._encoding

        # Scan meta tags for charset.
        if self._html:
            self._encoding = html_to_unicode(self.default_encoding, self._html)[0]

        return self._encoding if self._encoding else self.default_encoding
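
The lookup above appears to rely on html_to_unicode only searching its first argument for a charset= substring: a bare encoding name such as 'utf-8' matches nothing there, so the charset declared in the page's <meta> tags wins. A minimal sketch with illustrative HTML:

    from w3lib.encoding import html_to_unicode

    html = b'<html><head><meta charset="cp1251"></head><body>\xf2\xe5\xea\xf1\xf2</body></html>'
    # No charset= in the first argument, so the <meta> declaration decides.
    encoding, text = html_to_unicode('utf-8', html)
    print(encoding)  # expected: 'cp1251'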
Example #14
    def infer(self, html='', **kwargs):
        """
        Extract data using a seed region and the data you expect to scrape from it.
        """
        if 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            _, html = html_to_unicode(info.headers.get('content_type'), info.read())

        builder = DomTreeBuilder(html)
        doc = builder.build()
        page = HtmlPage(body=tostring(doc, encoding=unicode, method='html'))

        return self.scraper.scrape_page(page)
Example #15
    def _assert_encoding(self, content_type, body, expected_encoding,
                expected_unicode):
        encoding, body_unicode = html_to_unicode(ct(content_type), body)
        self.assertTrue(isinstance(body_unicode, unicode))
        self.assertEqual(norm_encoding(encoding),
                norm_encoding(expected_encoding))

        if isinstance(expected_unicode, basestring):
            self.assertEqual(body_unicode, expected_unicode)
        else:
            self.assertTrue(
                body_unicode in expected_unicode,
                "%s is not in %s" % (body_unicode, expected_unicode)
            )
Example #16
    def extract(self, html="", **kwargs):
        """
        Extract data regions from raw HTML or from a URL.
        """
        if "url" in kwargs:
            info = urlopen(kwargs.pop("url"))
            _, html = encoding.html_to_unicode(info.headers.get("content_type"), info.read())
        builder = DomTreeBuilder(html)
        root = builder.build()

        mining_region = MiningDataRegion(root, self.k, self.threshold)
        regions = mining_region.find_regions(root)

        mining_record = MiningDataRecord()
        mining_field = MiningDataField()

        region_records = {}
        all_items = []
        for i, region in enumerate(regions):
            records = mining_record.find_records(region)
            items, _ = mining_field.align_records(records)
            all_items.extend(items)
            assert len(items) == len(records)
            region_records.update({region: records})

            if "verbose" in kwargs:
                print region
                for record in records:
                    print "\t", record

        # always annotate last to avoid modifying the DOM tree
        if "annotate" in kwargs:
            for i, region in enumerate(regions):
                for j, record in enumerate(region_records.get(region)):
                    self.annotate(i, j, record.elements)

            with open(kwargs.pop("annotate"), "w") as f:
                print >> f, tostring(root, pretty_print=True)

        return all_items
Example #17
def _decode_bytes(body, content_type='', default_encoding='utf-8'):
    encoding, uni_string = html_to_unicode(content_type_header=content_type,
                                           html_body_str=body,
                                           default_encoding=default_encoding,
                                           auto_detect_fun=_detect_encoding)
    return (encoding, uni_string)
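
The _detect_encoding callable passed as auto_detect_fun above is not shown; w3lib calls it with the raw body only when neither the header nor the HTML declares a charset, and expects an encoding name or None in return. A minimal sketch of such a detector, assuming the chardet package is available:

    import chardet  # assumption: chardet is installed; any callable with this shape works

    def _detect_encoding(body_bytes):
        # Guess the encoding from the raw bytes; returning None lets
        # html_to_unicode fall back to default_encoding.
        result = chardet.detect(body_bytes)
        return result.get('encoding')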
Example #18
            with open(kwargs.pop("annotate"), "w") as f:
                print >> f, tostring(root, pretty_print=True)

        return all_items

    def annotate(self, region, record, elements):
        """
        annotate the HTML elements with PyQuery.
        """
        colors = ["#ffff42", "#ff0000", "#00ff00", "#ff00ff"]
        p = pq(elements[0])
        div = p.wrap(
            '<div class="mdr_region" region_id={} record_id={} style="color:{}; border:solid 5px"></div>'.format(
                region, record, choice(colors)
            )
        )
        for e in elements[1:]:
            div.append(e)


if __name__ == "__main__":
    import sys

    info = urlopen(sys.argv[1])
    _, html = encoding.html_to_unicode(info.headers.get("content_type"), info.read())
    depta = Depta()

    items = depta.extract(html, annotate="output.html", verbose=True)
    for i, item in enumerate(items):
        print i, " | ".join(map(lambda x: x.text, item.fields))
Example #19
 def _assert_encoding_detected(self, content_type, expected_encoding, body,
         **kwargs):
     assert not isinstance(body, six.text_type)
     encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
     self.assertTrue(isinstance(body_unicode, six.text_type))
     self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
Example #20
 def _assert_encoding_detected(self, content_type, expected_encoding, body,
         **kwargs):
     encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
     self.assertTrue(isinstance(body_unicode, unicode))
     self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
Example #21
                for j, record in enumerate(region_records.get(region)):
                    self.annotate(i, j, record.elements)

            with open(kwargs.pop('annotate'), 'w') as f:
                print >> f, tostring(root, pretty_print=True)

        return all_items

    def annotate(self, region, record, elements):
        """
        annotate the HTML elements with PyQuery.
        """
        colors = ['#ffff42', '#ff0000', '#00ff00', '#ff00ff']
        p = pq(elements[0])
        div = p.wrap(
            '<div class="mdr_region" region_id={} record_id={} style="color:{}; border:solid 5px"></div>'
            .format(region, record, choice(colors)))
        for e in elements[1:]:
            div.append(e)


if __name__ == '__main__':
    import sys
    info = urlopen(sys.argv[1])
    _, html = encoding.html_to_unicode(info.headers.get('content_type'),
                                       info.read())
    depta = Depta()

    items = depta.extract(html, annotate='output.html', verbose=True)
    for i, item in enumerate(items):
        print i, ' | '.join(map(lambda x: x.text, item.fields))
Example #22
        safe_attrs_only=False
    )
    parser = HTMLParser(encoding=encoding)
    html = lxml.html.document_fromstring(html, parser=parser)
    doc = cleaner.clean_html(html)
    return lxml.etree.tounicode(doc)


def mkdir(path):
    try:
        os.makedirs(path)
    except OSError:
        pass


if __name__ == '__main__':
    args = docopt(__doc__)

    mkdir(args['--out'])
    for in_name in args['<input>']:
        path, fname = os.path.split(in_name)
        out_name = os.path.join(args['--out'], fname)

        with open(in_name, 'rb') as f:
            encoding, html = html_to_unicode(None, f.read())

        cleaned = clean_html(html.encode(encoding), encoding)

        with codecs.open(out_name, 'w', encoding='utf8') as out:
            out.write(cleaned)
Example #23
                                      meta=False,
                                      safe_attrs_only=False)
    parser = HTMLParser(encoding=encoding)
    html = lxml.html.document_fromstring(html, parser=parser)
    doc = cleaner.clean_html(html)
    return lxml.etree.tounicode(doc)


def mkdir(path):
    try:
        os.makedirs(path)
    except OSError:
        pass


if __name__ == '__main__':
    args = docopt(__doc__)

    mkdir(args['--out'])
    for in_name in args['<input>']:
        path, fname = os.path.split(in_name)
        out_name = os.path.join(args['--out'], fname)

        with open(in_name, 'rb') as f:
            encoding, html = html_to_unicode(None, f.read())

        cleaned = clean_html(html.encode(encoding), encoding)

        with codecs.open(out_name, 'w', encoding='utf8') as out:
            out.write(cleaned)
Example #24
 def _decoding(self):
     charset = f'charset={self._encoding}'
     enc, text = html_to_unicode(charset, self.body)
     self._encoding = enc
     self._text = text