def test_site_samples(self):
    """Test parse_html against the stored real-world sample pages.

    Iterates SAMPLES_FILE_PREFIX_<n>.html / .json pairs until a json
    file is missing, feeding each pair to self._test_sample().
    """
    count = 0
    fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
    while os.path.exists(fname):
        # 'with' guarantees the sample files are closed even if an
        # assertion fails (the original leaked both handles).
        with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb") as f:
            source = str_to_unicode(f.read())
        with open(fname, "rb") as f:
            parsed = json.loads(str_to_unicode(f.read()),
                                object_hook=_decode_element)
        self._test_sample(source, parsed, count)
        count += 1
        fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u''):
    """Remove or replace escape characters.

    which_ones -- tuple of escape chars to handle (default: \\n, \\t, \\r).
    replace_by -- text to put in their place; defaults to u'', so the
                  escape chars are simply removed.

    Always returns a unicode string.
    """
    # Convert both operands once, before the loop: the original called
    # str_to_unicode(replace_by) on every iteration (loop-invariant) and
    # only converted `text` at the end, replacing on the raw byte string.
    text = str_to_unicode(text)
    replace_by = str_to_unicode(replace_by)
    for ec in which_ones:
        text = text.replace(ec, replace_by)
    return text
def response_to_dict(response):
    """Serialize a Response object into a plain dict."""
    return {
        'url': str_to_unicode(response.url, errors='ignore'),
        'headers': dict(response.headers),
        'body': str_to_unicode(response.body, errors='ignore'),
        'encoding': response.encoding,
        'status': response.status,
        'request': request_to_dict(response.request),
        'meta': response.request.meta,
    }
def test_str_to_unicode(self):
    # utf-8 encoded byte string is decoded to unicode
    self.assertEqual(str_to_unicode('lel\xc3\xb1e'), u'lel\xf1e')
    # latin-1 encoded byte string decodes with an explicit encoding
    self.assertEqual(str_to_unicode('lel\xf1e', 'latin-1'), u'lel\xf1e')
    # unicode input passes through unchanged
    self.assertEqual(str_to_unicode(u'\xf1e\xf1e\xf1e'), u'\xf1e\xf1e\xf1e')
    # non-string input is rejected with TypeError
    self.assertRaises(TypeError, str_to_unicode, 423)
    # errors='replace' maps undecodable bytes to U+FFFD
    decoded = str_to_unicode('a\xedb', 'utf-8', errors='replace')
    assert u'\ufffd' in decoded
def unquote_markup(text, keep=(), remove_illegal=True):
    """Strip entities from markup while leaving CDATA content untouched.

    `text` may be a unicode string or a utf-8 encoded byte string.
    Entities named in `keep` are preserved; each CDATA section is
    replaced by its raw inner text. Always returns a unicode string.
    """
    cdata_pattern = re.compile(
        r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))',
        re.DOTALL)

    def _split(txt):
        # Yield plain-text chunks and CDATA match objects in document order.
        pos = 0
        for m in cdata_pattern.finditer(txt):
            start, end = m.span(1)
            yield txt[pos:start]
            yield m
            pos = end
        yield txt[pos:]

    pieces = []
    for chunk in _split(str_to_unicode(text)):
        if isinstance(chunk, basestring):
            # Plain markup: entities may be removed/translated here.
            pieces.append(remove_entities(chunk, keep=keep,
                                          remove_illegal=remove_illegal))
        else:
            # CDATA match: keep its inner text verbatim.
            pieces.append(chunk.group('cdata_d'))
    return u''.join(pieces)
def remove_tags(text, which_ones=(), keep=(), encoding=None):
    """Remove HTML tags only (their content is preserved).

    `which_ones` and `keep` are mutually exclusive tag-name tuples:
      which_ones set, keep empty -> remove only the tags listed
      which_ones empty, keep set -> remove every tag except those listed
      both empty                 -> remove all tags
      both set                   -> not allowed
    """
    assert not (which_ones and keep), \
        'which_ones and keep can not be given at the same time'

    def _survives(tag):
        # True when the matched tag must be left in place.
        if which_ones:
            return tag not in which_ones
        return tag in keep

    def _sub(match):
        return match.group(0) if _survives(match.group(1)) else u''

    tag_re = re.compile('</?([^ >/]+).*?>', re.DOTALL | re.IGNORECASE)
    return tag_re.sub(_sub, str_to_unicode(text, encoding))
def replace_tags(text, token=''):
    """Substitute every markup tag found in `text` with `token`.

    The default empty token simply strips all tags. `text` may be a
    unicode string or a utf-8 encoded byte string; the result is always
    unicode.
    """
    unicode_text = str_to_unicode(text)
    return _tag_re.sub(token, unicode_text)
def remove_tags_with_content(text, which_ones=()):
    """Remove the given tags together with their content.

    which_ones -- tuple of tag names whose whole element (open tag,
    content and close tag) must be removed. If empty, nothing is done.

    Always returns a unicode string.
    """
    text = str_to_unicode(text)
    if which_ones:
        # \b keeps a tag name from matching longer names that merely
        # share a prefix (without it, 'b' also removed '<body>...').
        tags = '|'.join([r'<%s\b.*?</%s>' % (tag, tag) for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        text = retags.sub(u'', text)
    return text
def remove_tags(text, which_ones=()):
    """Remove HTML tags (opening and closing) but keep their content.

    which_ones -- tuple of tag names to remove; when empty, every tag
    is removed.
    """
    if which_ones:
        alternatives = ['<%s>|<%s .*?>|</%s>' % (tag, tag, tag)
                        for tag in which_ones]
        pattern = '|'.join(alternatives)
    else:
        pattern = '<.*?>'
    tag_re = re.compile(pattern, re.DOTALL | re.IGNORECASE)
    return tag_re.sub(u'', str_to_unicode(text))
def _extract_links(self, response_text, response_url, response_encoding):
    """Parse the response and return Link objects with absolute, safe URLs."""
    self.base_url, self.links = etree.HTML(response_text, self.parser)
    if self.unique:
        candidates = unique_list(self.links, key=lambda link: link.url)
    else:
        candidates = self.links
    # The document's base URL (if present) overrides the response URL.
    if self.base_url:
        base_url = urljoin_rfc(response_url, self.base_url)
    else:
        base_url = response_url
    result = []
    for link in candidates:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding)
        result.append(link)
    return result
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    """Translate HTML entities in `text` into the characters they name.

    `text` can be unicode or a byte string in `encoding` (utf-8 by
    default). Named entities listed in `keep` are left untouched. Both
    numeric (&#nnnn; and &#xhhhh;) and named (&gt;) entities are
    supported. An entity that cannot be converted is dropped when
    `remove_illegal` is true, otherwise kept as-is.

    Always returns a unicode string (with the entities replaced).
    """
    def _convert(m):
        body = m.group(3)
        codepoint = None
        if m.group(1):
            # Numeric reference; group(2) marks the hexadecimal form.
            try:
                codepoint = int(body, 16 if m.group(2) else 10)
                # Browsers interpret references in the 80-9F range as the
                # characters mapped to those bytes in Windows-1252. See:
                # http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                # NOTE: UnicodeDecodeError is a ValueError subclass, so a
                # failed cp1252 decode is caught here too.
                if 0x80 <= codepoint <= 0x9f:
                    return chr(codepoint).decode('cp1252')
            except ValueError:
                codepoint = None
        else:
            # Named reference.
            if body in keep:
                return m.group(0)
            codepoint = name2codepoint.get(body)
        if codepoint is not None:
            try:
                return unichr(codepoint)
            except ValueError:
                pass
        return u'' if remove_illegal else m.group(0)

    return _ent_re.sub(_convert, str_to_unicode(text, encoding))
def _extract_links(self, response_text, response_url, response_encoding):
    """Feed the parser with the response and return the extracted links."""
    self.reset()
    self.feed(response_text)
    self.close()
    # The document's <base> URL (if any) overrides the response URL.
    if self.base_url:
        base_url = urljoin_rfc(response_url, self.base_url)
    else:
        base_url = response_url
    extracted = []
    for link in self.links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding)
        extracted.append(link)
    return extracted
def test_site_pages(self):
    """Check the template parser against stored real pages.

    Real pages are more reliable and easier to build for complicated
    structures than synthetic fixtures.
    """
    SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
    count = 0
    fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
    while os.path.exists(fname):
        # 'with' closes the sample files deterministically (the original
        # leaked both file handles).
        with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb") as f:
            source = str_to_unicode(f.read())
        with open(fname, "rb") as f:
            annotations = json.loads(str_to_unicode(f.read()))
        template = HtmlPage(body=source)
        parser = TemplatePageParser(TokenDict())
        parser.feed(template)
        for annotation in parser.annotations:
            test_annotation = annotations.pop(0)
            for s in annotation.__slots__:
                if s == "tag_attributes":
                    # Attribute pairs are compared element by element.
                    for pair in getattr(annotation, s):
                        self.assertEqual(list(pair), test_annotation[s].pop(0))
                else:
                    self.assertEqual(getattr(annotation, s), test_annotation[s])
        # Every expected annotation must have been consumed.
        self.assertEqual(annotations, [])
        count += 1
        fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
def test_jsonrpc_client_call_request(self):
    """jsonrpc_client_call must POST a well-formed JSON-RPC 2.0 request."""
    captured = {}

    def _fake_urlopen(url, data):
        # Record what would have been sent over the wire.
        captured['url'] = url
        captured['data'] = data
        return _umock(1)

    with patch.object(urllib.request, 'urlopen', _fake_urlopen):
        jsonrpc_client_call('url', 'test', 'one', 2)
    request = json.loads(str_to_unicode(captured['data']))
    assert 'id' in request
    self.assertEqual(captured['url'], 'url')
    self.assertEqual(request['jsonrpc'], '2.0')
    self.assertEqual(request['method'], 'test')
    self.assertEqual(request['params'], ['one', 2])
def create_page_from_jsonpage(jsonpage, body_key):
    """Build an HtmlPage from a dict conforming to the page schema.

    body_key -- key under which the body is stored: 'body' (original
    page with annotations, if any - used by extraction) or
    'original_body' (pristine page, always present - used by
    classification so annotated pages don't confuse the classifier).
    """
    return HtmlPage(
        jsonpage['url'],
        jsonpage.get('headers'),
        str_to_unicode(jsonpage[body_key]),
        jsonpage.get('page_id'),
    )
def _extract_links(self, response_text, response_url, response_encoding,
                   base_url=None):
    """Run the parser over the response and return normalized links."""
    self.reset()
    self.feed(response_text)
    self.close()
    if base_url is None:
        # Fall back to the document's <base> URL, then the response URL.
        if self.base_url:
            base_url = urljoin(response_url, self.base_url)
        else:
            base_url = response_url
    normalized = []
    for link in self.links:
        # urljoin is given bytes here: encode unicode URLs first.
        if isinstance(link.url, unicode):
            link.url = link.url.encode(response_encoding)
        link.url = urljoin(base_url, link.url)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(
            link.text, response_encoding, errors='replace').strip()
        normalized.append(link)
    return normalized
def _extract_links(self, response_text, response_url, response_encoding):
    """Collect links from the document with lxml, filtered by tag/attribute."""
    root = lxml.html.fromstring(response_text)
    root.make_links_absolute(response_url)
    found = []
    for element, attr, url, pos in root.iterlinks():
        # Guard clauses: skip anything the tag/attribute filters reject.
        if not self.tag_func(element.tag):
            continue
        if not self.attr_func(attr):
            continue
        url = safe_url_string(url, response_encoding)
        if element.text:
            text = str_to_unicode(
                element.text, response_encoding, errors='replace').strip()
        else:
            text = u''
        found.append(Link(self.process_func(url), text=text))
    if self.unique:
        found = unique_list(found, key=lambda link: link.url)
    return found
def _extract_links(self, response_text, response_url, response_encoding,
                   base_url=None):
    """Extract <a> links with lxml and return them normalized.

    Dead commented-out parser calls (reset/feed/close) removed: this
    implementation parses with lxml directly instead.
    """
    html = lxml.etree.HTML(response_text)
    links = html.xpath("//a")
    self.links = [Link(link.get("href") or "", link.text or "")
                  for link in links]
    ret = []
    if base_url is None:
        # Fall back to the document's <base> URL, then the response URL.
        if self.base_url:
            base_url = urljoin(response_url, self.base_url)
        else:
            base_url = response_url
    for link in self.links:
        # urljoin is given bytes here: encode unicode URLs first.
        if isinstance(link.url, unicode):
            link.url = link.url.encode(response_encoding)
        link.url = urljoin(base_url, link.url.strip())
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding,
                                   errors='replace')
        ret.append(link)
    return ret
def test_site_pages(self):
    """Check the template parser against stored real pages.

    Real pages are more reliable and easier to build for complicated
    structures than synthetic fixtures.
    """
    # 'with' closes the fixture file instead of leaking the handle.
    with open(os.path.join(path, "samples_pageparsing.json.gz"),
              "r") as samples_file:
        compressed = samples_file.read()
    samples = [json.loads(line)
               for line in GzipFile(fileobj=StringIO(compressed)).readlines()]
    for sample in samples:
        source = sample["annotated"]
        annotations = sample["annotations"]
        template = HtmlPage(body=str_to_unicode(source))
        parser = TemplatePageParser(TokenDict())
        parser.feed(template)
        for annotation in parser.annotations:
            test_annotation = annotations.pop(0)
            for s in annotation.__slots__:
                if s == "tag_attributes":
                    # Attribute pairs are compared element by element.
                    for pair in getattr(annotation, s):
                        self.assertEqual(list(pair), test_annotation[s].pop(0))
                else:
                    self.assertEqual(getattr(annotation, s), test_annotation[s])
        # Every expected annotation must have been consumed.
        self.assertEqual(annotations, [])
def _fix_link_text_encoding(self, encoding):
    """Decode each request's link_text meta value to unicode."""
    for request in self.requests:
        # Missing link_text is treated as the empty string.
        raw_text = request.meta.get('link_text', '')
        request.meta['link_text'] = str_to_unicode(raw_text, encoding)
def has_entities(text):
    """Return True when `text` contains at least one HTML entity."""
    return _ent_re.search(str_to_unicode(text)) is not None
def _getrow(csv_r):
    # Pull the next row off the CSV reader and decode every field using
    # the `encoding` variable from the enclosing scope.
    row = next(csv_r)
    return [str_to_unicode(field, encoding) for field in row]
def _assert_expected_item(self, exported_dict):
    """Normalize exported values to unicode, then compare against self.i."""
    for key in exported_dict:
        exported_dict[key] = str_to_unicode(exported_dict[key])
    self.assertEqual(self.i, exported_dict)
def remove_comments(text):
    """Remove HTML comments (<!-- ... -->) from `text`.

    Always returns a unicode string.
    """
    # Bug fix: the original passed re.DOTALL as re.sub's positional
    # `count` argument, which capped substitutions at 16 and never
    # enabled DOTALL. Passing it as `flags` makes comments spanning
    # newlines match, with no substitution limit.
    return re.sub('<!--.*?-->', u'', str_to_unicode(text), flags=re.DOTALL)
def has_entities(text, encoding=None):
    """Return True when `text` contains at least one HTML entity."""
    decoded = str_to_unicode(text, encoding)
    return bool(_ent_re.search(decoded))