Example #1
 def test_site_samples(self):
     """test parse_html from real cases"""
     count = 0
     fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
     while os.path.exists(fname):
         source = str_to_unicode(open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb").read())
     parsed = json.loads(str_to_unicode(open(fname, "rb").read()),
                         object_hook=_decode_element)
         self._test_sample(source, parsed, count)
         count += 1
         fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
Example #2
def replace_escape_chars(text, which_ones=('\n','\t','\r'), replace_by=u''):
    """ Remove escape chars. Default : \\n, \\t, \\r

        which_ones -- is a tuple of which escape chars we want to remove.
                      By default removes \n, \t, \r.

        replace_by -- text to replace the escape chars for.
                      It defaults to '', so the escape chars are removed.
    """
    for ec in which_ones:
        text = text.replace(ec, str_to_unicode(replace_by))
    return str_to_unicode(text)
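A quick doctest-style sketch of the documented behavior (it assumes a str_to_unicode that returns unicode input unchanged):

    >>> replace_escape_chars(u'one\ntwo\tthree')
    u'onetwothree'
    >>> replace_escape_chars(u'one\ntwo', replace_by=u' ')
    u'one two'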
Example #3
def response_to_dict(response):
    """Convert Response object to a dict"""
    d = {
        'url': str_to_unicode(response.url, errors='ignore'),
        'headers': dict(response.headers),
        'body': str_to_unicode(response.body, errors='ignore'),
        'encoding': response.encoding,
        'status': response.status,
        'request': request_to_dict(response.request),
        'meta': response.request.meta
    }
    return d
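A hedged usage sketch, assuming Scrapy's Response objects and a request_to_dict like the one from scrapy.utils.reqser; the URL and meta values are illustrative:

    >>> from scrapy.http import HtmlResponse, Request
    >>> req = Request('http://example.com', meta={'depth': 1})
    >>> resp = HtmlResponse('http://example.com', body='<html/>', request=req)
    >>> d = response_to_dict(resp)
    >>> d['status'], d['meta']
    (200, {'depth': 1})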
Example #4
    def test_str_to_unicode(self):
        # converting a utf-8 encoded string to unicode
        self.assertEqual(str_to_unicode('lel\xc3\xb1e'), u'lel\xf1e')

        # converting a latin-1 encoded string to unicode
        self.assertEqual(str_to_unicode('lel\xf1e', 'latin-1'), u'lel\xf1e')

        # converting a unicode string to unicode should return the same value
        self.assertEqual(str_to_unicode(u'\xf1e\xf1e\xf1e'), u'\xf1e\xf1e\xf1e')

        # converting a strange object should raise TypeError
        self.assertRaises(TypeError, str_to_unicode, 423)

        # check errors argument works
        assert u'\ufffd' in str_to_unicode('a\xedb', 'utf-8', errors='replace')
Example #5
def unquote_markup(text, keep=(), remove_illegal=True):
    """
    This function receives markup as text (always a unicode string or a utf-8 encoded string) and does the following:
     - removes entities (except the ones in 'keep') from any part of it that is not inside a CDATA section
     - searches for CDATA sections and extracts their text (if any) without modifying it
     - removes the CDATA section markers
    """
    _cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)

    def _get_fragments(txt, pattern):
        offset = 0
        for match in pattern.finditer(txt):
            match_s, match_e = match.span(1)
            yield txt[offset:match_s]
            yield match
            offset = match_e
        yield txt[offset:]

    text = str_to_unicode(text)
    ret_text = u''
    for fragment in _get_fragments(text, _cdata_re):
        if isinstance(fragment, basestring):
            # it's not a CDATA (so we try to remove its entities)
            ret_text += remove_entities(fragment, keep=keep, remove_illegal=remove_illegal)
        else:
            # it's a CDATA (so we just extract its content)
            ret_text += fragment.group('cdata_d')
    return ret_text
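A short sketch of the CDATA handling (it assumes the remove_entities shown in a later example): entities outside CDATA sections are decoded, while CDATA content passes through verbatim.

    >>> unquote_markup(u'a &amp; b <![CDATA[x &amp; y]]> c')
    u'a & b x &amp; y c'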
Example #6
def remove_tags(text, which_ones=(), keep=(), encoding=None):
    """ Remove HTML Tags only. 

        which_ones and keep are both tuples, there are four cases:

        which_ones, keep (1 - not empty, 0 - empty)
        1, 0 - remove all tags in which_ones
        0, 1 - remove all tags except the ones in keep
        0, 0 - remove all tags
        1, 1 - not allowd
    """

    assert not (which_ones and keep), 'which_ones and keep cannot be given at the same time'

    def will_remove(tag):
        if which_ones:
            return tag in which_ones
        else:
            return tag not in keep

    def remove_tag(m):
        tag = m.group(1)
        return u'' if will_remove(tag) else m.group(0)

    regex = '</?([^ >/]+).*?>'
    retags = re.compile(regex, re.DOTALL | re.IGNORECASE)

    return retags.sub(remove_tag, str_to_unicode(text, encoding))
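A doctest-style sketch of the which_ones/keep cases handled above:

    >>> remove_tags(u'<div><p>one</p><br/>two</div>', which_ones=('br',))
    u'<div><p>one</p>two</div>'
    >>> remove_tags(u'<div><a href="#">x</a></div>', keep=('a',))
    u'<a href="#">x</a>'
    >>> remove_tags(u'<p>plain</p>')
    u'plain'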
Example #7
def replace_tags(text, token=''):
    """Replace all markup tags found in the given text by the given token. By
    default token is a null string so it just remove all tags.

    'text' can be a unicode string or a regular string encoded as 'utf-8'

    Always returns a unicode string.
    """
    return _tag_re.sub(token, str_to_unicode(text))
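_tag_re is defined elsewhere in the module; assuming it matches single tags (something like r'<[a-zA-Z\/!].*?>'), usage looks like:

    >>> replace_tags(u'<b>bold</b> text')
    u'bold text'
    >>> replace_tags(u'<b>bold</b> text', token=u'*')
    u'*bold* text'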
Example #8
def remove_tags_with_content(text, which_ones=()):
    """ Remove tags and its content.
        
        which_ones -- is a tuple of which tags with its content we want to remove.
                      if is empty do nothing.
    """
    text = str_to_unicode(text)
    if which_ones:
        tags = '|'.join(['<%s.*?</%s>' % (tag,tag) for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        text = retags.sub(u'', text)
    return text
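For instance, dropping script blocks together with everything inside them:

    >>> remove_tags_with_content(u'<div>keep</div><script>var x;</script>', which_ones=('script',))
    u'<div>keep</div>'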
Example #9
def remove_tags(text, which_ones=()):
    """ Remove HTML Tags only. 

        which_ones -- is a tuple of which tags we want to remove.
                      if is empty remove all tags.
    """
    if which_ones:
        tags = ['<%s>|<%s .*?>|</%s>' % (tag,tag,tag) for tag in which_ones]
        regex = '|'.join(tags)
    else:
        regex = '<.*?>'
    retags = re.compile(regex, re.DOTALL | re.IGNORECASE)

    return retags.sub(u'', str_to_unicode(text))
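A quick sketch of both branches (all tags vs. only the listed ones):

    >>> remove_tags(u'<p>hello <em>world</em></p>')
    u'hello world'
    >>> remove_tags(u'<p>hello <em>world</em></p>', which_ones=('em',))
    u'<p>hello world</p>'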
Example #10
    def _extract_links(self, response_text, response_url, response_encoding):
        # self.parser is built with a custom target whose close() returns a
        # (base_url, links) pair; lxml's etree.HTML returns that value when a
        # target parser is used, which is why the result unpacks into two names
        self.base_url, self.links = etree.HTML(response_text, self.parser)

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding)
            ret.append(link)

        return ret
Example #11
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    """Remove entities from the given text.

    'text' can be a unicode string or a regular string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If 'keep' is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric (&#nnnn; and &#xhhhh;) and named (&nbsp; &gt;)
    entities.

    If remove_illegal is True, entities that can't be converted are removed.
    If remove_illegal is False, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).
    """

    def convert_entity(m):
        entity_body = m.group(3)
        if m.group(1):
            try:
                if m.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= number <= 0x9f:
                    return chr(number).decode('cp1252')
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return m.group(0)
            else:
                number = name2codepoint.get(entity_body)
        if number is not None:
            try:
                return unichr(number)
            except ValueError:
                pass

        return u'' if remove_illegal else m.group(0)

    return _ent_re.sub(convert_entity, str_to_unicode(text, encoding))
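_ent_re is defined elsewhere; assuming it matches the numeric and named forms the docstring describes, behavior looks like this (Python 2 doctest):

    >>> remove_entities(u'Tom &amp; Jerry &#169; &copy;')
    u'Tom & Jerry \xa9 \xa9'
    >>> remove_entities(u'&pound;100', keep=('pound',))
    u'&pound;100'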
Example #12
    def _extract_links(self, response_text, response_url, response_encoding):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding)
            ret.append(link)

        return ret
Example #13
 def test_site_pages(self):
     """
     Tests from real pages. More reliable and easier to build for more complicated structures.
     """
     SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
     count = 0
     fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
     while os.path.exists(fname):
         source = str_to_unicode(open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb").read())
         annotations = json.loads(str_to_unicode(open(fname, "rb").read()))
         template = HtmlPage(body=source)
         parser = TemplatePageParser(TokenDict())
         parser.feed(template)
         for annotation in parser.annotations:
             test_annotation = annotations.pop(0)
             for s in annotation.__slots__:
                 if s == "tag_attributes":
                     for pair in getattr(annotation, s):
                         self.assertEqual(list(pair), test_annotation[s].pop(0))
                 else:
                     self.assertEqual(getattr(annotation, s), test_annotation[s])
         self.assertEqual(annotations, [])
         count += 1
         fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
Example #14
    def test_jsonrpc_client_call_request(self):
        sentcall = {}
        def _urlopen(url, data):
            sentcall['url'] = url
            sentcall['data'] = data
            return _umock(1)

        with patch.object(urllib.request, 'urlopen', _urlopen):
            jsonrpc_client_call('url', 'test', 'one', 2)
            req = json.loads(str_to_unicode(sentcall['data']))
            assert 'id' in req
            self.assertEqual(sentcall['url'], 'url')
            self.assertEqual(req['jsonrpc'], '2.0')
            self.assertEqual(req['method'], 'test')
            self.assertEqual(req['params'], ['one', 2])
Example #15
def create_page_from_jsonpage(jsonpage, body_key):
    """Create an HtmlPage object from a dict object conforming to the schema
    for a page

    `body_key` is the key where the body is stored and can be either 'body'
    (original page with annotations - if any) or 'original_body' (original
    page, always). Classification typically uses 'original_body' to avoid
    confusing the classifier with annotated pages, while extraction uses 'body'
    to pass the annotated pages.
    """
    url = jsonpage['url']
    headers = jsonpage.get('headers')
    body = str_to_unicode(jsonpage[body_key])
    page_id = jsonpage.get('page_id')
    return HtmlPage(url, headers, body, page_id)
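A hedged usage sketch, assuming HtmlPage is scrapely's (url, headers, body, page_id) container; the dict values are illustrative:

    >>> jsonpage = {'url': 'http://example.com', 'headers': {},
    ...             'body': u'<html><body>x</body></html>', 'page_id': 'ab12'}
    >>> page = create_page_from_jsonpage(jsonpage, 'body')
    >>> page.url
    'http://example.com'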
Example #16
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            if isinstance(link.url, unicode):
                link.url = link.url.encode(response_encoding)
            link.url = urljoin(base_url, link.url)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors='replace').strip()
            ret.append(link)

        return ret
Example #17
    def _extract_links(self, response_text, response_url, response_encoding):
        links = []
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.tag_func(e.tag):
                if self.attr_func(a):
                    l = safe_url_string(l, response_encoding)
                    text = u''
                    if e.text:
                        text = str_to_unicode(e.text, response_encoding, errors='replace').strip()
                    link = Link(self.process_func(l), text=text)
                    links.append(link)

        links = unique_list(links, key=lambda link: link.url) \
                if self.unique else links

        return links
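For context, a minimal standalone sketch of the lxml calls this extractor relies on; iterlinks() yields (element, attribute, link, pos) tuples:

    >>> import lxml.html
    >>> doc = lxml.html.fromstring(u'<a href="/page">Next</a>')
    >>> doc.make_links_absolute('http://example.com/')
    >>> [(e.tag, a, l) for e, a, l, p in doc.iterlinks()]
    [('a', 'href', 'http://example.com/page')]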
Example #18
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        #self.reset()
        #self.feed(response_text)
        #self.close()
        html = lxml.etree.HTML(response_text)
        links = html.xpath("//a")
        self.links = [Link(link.get("href") or "", link.text or "") for link in links]

        ret = []
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            if isinstance(link.url, unicode):
                link.url = link.url.encode(response_encoding)
            link.url = urljoin(base_url, link.url.strip())
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors='replace')
            ret.append(link)

        return ret
Example #19
 def test_site_pages(self):
     """
     Tests from real pages. More reliable and easier to build for more complicated structures.
     """
     # the gzip file must be opened in binary mode
     samples_file = open(os.path.join(path, "samples_pageparsing.json.gz"), "rb")
     samples = []
     for line in GzipFile(fileobj=samples_file).readlines():
         samples.append(json.loads(line))
     for sample in samples:
         source = sample["annotated"]
         annotations = sample["annotations"]
         template = HtmlPage(body=str_to_unicode(source))
         parser = TemplatePageParser(TokenDict())
         parser.feed(template)
         for annotation in parser.annotations:
             test_annotation = annotations.pop(0)
             for s in annotation.__slots__:
                 if s == "tag_attributes":
                     for pair in getattr(annotation, s):
                         self.assertEqual(list(pair), test_annotation[s].pop(0))
                 else:
                     self.assertEqual(getattr(annotation, s), test_annotation[s])
         self.assertEqual(annotations, [])
Example #20
 def _fix_link_text_encoding(self, encoding):
     """Convert link_text to unicode for each request"""
     for req in self.requests:
         req.meta.setdefault('link_text', '')
         req.meta['link_text'] = str_to_unicode(req.meta['link_text'],
                                                encoding) 
Example #21
def has_entities(text):
    return bool(_ent_re.search(str_to_unicode(text)))
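For example (assuming the same _ent_re as in the remove_entities example):

    >>> has_entities(u'AT&amp;T')
    True
    >>> has_entities(u'plain text')
    False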
Example #22
 def _getrow(csv_r):
     return [str_to_unicode(field, encoding) for field in next(csv_r)]
Example #23
 def _assert_expected_item(self, exported_dict):
     for k, v in exported_dict.items():
         exported_dict[k] = str_to_unicode(v)
     self.assertEqual(self.i, exported_dict)
Example #24
def remove_comments(text):
    """ Remove HTML comments. """
    # pass re.DOTALL via flags=; a positional fourth argument would be
    # interpreted as re.sub's count parameter, not as a regex flag
    return re.sub('<!--.*?-->', u'', str_to_unicode(text), flags=re.DOTALL)
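Because re.DOTALL is passed as flags, comments spanning multiple lines are removed as well:

    >>> remove_comments(u'keep <!-- a\ncomment --> this')
    u'keep  this'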
Example #27
def has_entities(text, encoding=None):
    return bool(_ent_re.search(str_to_unicode(text, encoding)))