def rex_text_list(body, rex, flags=0):
    """Return every match of ``rex`` in ``body`` as clean text.

    Each match's first group is entity-decoded and space-normalized
    before being collected into the result list.
    """
    return [
        normalize_space(decode_entities(found.group(1)))
        for found in rex_list(body, rex, flags=flags)
    ]
def find_content_blocks(tree, min_length=None, max_trash_ratio=0.05,
                        max_word_length=50):
    """Extract plain-text content blocks from an lxml DOM tree.

    The tree is deep-copied, non-content and inline tags are stripped,
    and the remaining markup is reduced to ``<>`` separators; the text
    runs between separators become candidate blocks.

    :param tree: lxml document tree (not modified).
    :param min_length: if given, drop blocks shorter than this.
    :param max_trash_ratio: drop blocks whose ``_trash_ratio`` is at or
        above this value (was a hard-coded 0.05).
    :param max_word_length: drop blocks containing a "word" longer than
        this — usually URLs/base64, not prose (was a hard-coded 50).
    :return: list of text blocks.
    """
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # Work on a copy so the caller's tree is not mutated
    tree = deepcopy(tree)

    # Completely remove tags whose content is never visible text
    strip_elements(tree, 'head', 'style', 'script')

    # Remove comment nodes (their tail text is kept)
    strip_tags(tree, Comment)

    # Merge link text into the surrounding text
    strip_tags(tree, 'a')

    # Unwrap inline formatting tags ('a' already handled above)
    strip_tags(tree, 'br', 'hr', 'p', 'b', 'i', 'strong', 'em',
               'span', 'font')

    # Unwrap media tags
    strip_tags(tree, 'img')

    body = tostring(tree, encoding='utf-8').decode('utf-8')
    body = normalize_space(body)

    # Collapse every remaining tag into an empty "<>" delimiter
    re_tag = re.compile(r'<[^>]+>')
    body = re_tag.sub(r'<>', body)

    # Text blocks are the character runs between "<>" delimiters
    block_rex = re.compile(r'[^<>]+')
    blocks = []
    for match in block_rex.finditer(body):
        block = match.group(0)
        if min_length is not None and len(block) < min_length:
            continue
        # Skip blocks dominated by punctuation/markup leftovers
        if _trash_ratio(block) >= max_trash_ratio:
            continue
        # Skip blocks containing implausibly long "words"
        if any(len(word) > max_word_length for word in block.split()):
            continue
        blocks.append(block)
    return blocks
def find_content_blocks(tree, min_length=None):
    """Iterate over content blocks (russian version).

    Returns a list of plain-text blocks found between stripped tags;
    blocks that are too short, too "trashy" or that contain overly
    long words are discarded.
    """
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # Operate on a private copy so the caller's tree stays intact
    doc = deepcopy(tree)

    # Remove invisible sections wholesale
    strip_elements(doc, 'head', 'style', 'script')
    # Drop comment nodes while keeping their tail text
    strip_tags(doc, Comment)
    # Flatten links
    strip_tags(doc, 'a')
    # Flatten inline formatting tags
    strip_tags(doc, 'br', 'hr', 'p', 'b', 'i', 'strong', 'em',
               'a', 'span', 'font')
    # Flatten media tags
    strip_tags(doc, 'img')

    markup = tostring(doc, encoding='utf-8').decode('utf-8')
    markup = normalize_space(markup)
    # Replace every surviving tag with the bare "<>" separator
    markup = re.sub(r'<[^>]+>', r'<>', markup)

    # Collect the character runs sitting between separators
    blocks = []
    for piece in re.findall(r'[^<>]+', markup):
        long_enough = min_length is None or len(piece) >= min_length
        if long_enough and _trash_ratio(piece) < 0.05:
            if all(len(word) <= 50 for word in piece.split()):
                blocks.append(piece)
    return blocks
def test_normalize_space(self):
    # Whitespace-only input collapses to the empty string
    self.assertEqual('', normalize_space(' '))
    # Leading/trailing whitespace is stripped
    self.assertEqual('f', normalize_space(' f '))
    # Internal whitespace collapses to single spaces
    self.assertEqual('f b', normalize_space(' f b '))
    # Mixed tabs/newlines with non-ASCII text
    sample = u' тр и гла' + '\t' + '\n' + u' за '
    self.assertEqual(u'тр и гла за', normalize_space(sample))
    # Custom single-character separator
    self.assertEqual(u'тр_и_гла_за',
                     normalize_space(sample, replace='_'))
    # Custom multi-character separator
    self.assertEqual(u'трABCиABCглаABCза',
                     normalize_space(sample, replace='ABC'))
def rex_text(body, regexp, flags=0, default=NULL):
    """Search ``regexp`` in ``body`` and return its first group as
    clean text (entities decoded, whitespace normalized).

    :raises DataNotFound: when nothing matched and no ``default``
        was supplied.
    """
    found = rex(body, regexp, flags=flags, default=default)
    try:
        return normalize_space(decode_entities(found.group(1)))
    except AttributeError:
        # `found` is not a match object — the pattern did not match
        if default is NULL:
            raise DataNotFound('Regexp not found')
        return default
def rex_text(self, regexp, flags=0, byte=False, default=NULL):
    """Search a regular expression in the response body and return
    the content of its first matching group, entity-decoded and
    space-normalized.

    :param byte: if False the search runs over
        ``response.unicode_body()``; otherwise over ``response.body``.
    :raises DataNotFound: when nothing matched and no ``default``
        was supplied.
    """
    try:
        found = self.rex_search(regexp, flags=flags, byte=byte)
    except DataNotFound:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        return default
    return normalize_space(decode_entities(found.group(1)))
def test_normalize_space(self):
    eq = self.assertEqual
    # Degenerate and simple ASCII cases
    eq('', normalize_space(' '))
    eq('f', normalize_space(' f '))
    eq('f b', normalize_space(' f b '))
    # Non-ASCII text with mixed tab/newline whitespace
    raw = u' тр и гла' + '\t' + '\n' + u' за '
    eq(u'тр и гла за', normalize_space(raw))
    # Replacement separators: single char, then multi-char
    eq(u'тр_и_гла_за', normalize_space(raw, replace='_'))
    eq(u'трABCиABCглаABCза', normalize_space(raw, replace='ABC'))