def parse_template_main(self):
    """Parse the summary table and collect one dict per proceedings volume."""
    proceedings_list = []
    tr = self.grab.tree.xpath(XPATH_SUMMARY)
    # Rows come in pairs: the title row followed by the summary row.
    for i in range(0, len(tr), 2):
        href = tr[i].find(self.XPATH_SUMMARY_TITLE)
        try:
            if href.get('href') in config.input_urls or len(config.input_urls) == 1:
                proceedings = dict()
                proceedings['volume_number'] = \
                    ProceedingsSummaryParser.extract_volume_number(href.get('href'))
                proceedings['url'] = href.get('href')
                summary_match = rex.rex(
                    tr[i + 1].find('.//td[last()]').text_content(),
                    r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)'
                    r'(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
                    re.I | re.M | re.S)
                proceedings['label'] = re.sub(
                    r'\n', '', text.normalize_space(summary_match.group(1), ' \n'))
                proceedings['editors'] = re.split(
                    r",+\s*", text.normalize_space(summary_match.group(3)))
                proceedings['submission_date'] = datetime.strptime(
                    text.normalize_space(summary_match.group(7), ' \n'), '%d-%b-%Y')
                proceedings_list.append(proceedings)
        except Exception:
            print("[WORKSHOP %s: ProceedingsSummaryParser] "
                  "Summary information not found!" % href.get('href'))
            # traceback.print_exc()
    self.data['proceedings_list'] = proceedings_list
    if len(proceedings_list) == 0:
        raise DataNotFound("There is no summary information to parse!")

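# A quick standalone illustration of the summary regex above, run against a
# made-up CEUR-WS-style summary block (the sample text and values here are
# assumptions for demonstration, not real data):
import re

sample = (
    "Sample Workshop 2014\n"
    "Edited by: Alice Example, Bob Example\n"
    "Submitted by: Alice Example\n"
    "Published on CEUR-WS: 05-Feb-2014\n"
    "ONLINE"
)
pattern = (r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)'
           r'(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)')
m = re.search(pattern, sample, re.I | re.M | re.S)
print(m.group(1))  # 'Sample Workshop 2014'        -> label
print(m.group(3))  # 'Alice Example, Bob Example'  -> editors (split on commas)
print(m.group(7))  # '05-Feb-2014'                 -> parsed with '%d-%b-%Y'
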
def text(self, default=NULL):
    """Return the text of the first matched group, entity-decoded and space-normalized."""
    try:
        return normalize_space(decode_entities(self.one().group(1)))
    except (AttributeError, IndexError):
        if default is NULL:
            raise
        else:
            return default

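# `NULL` above is a module-level sentinel rather than None, so callers can
# still pass `default=None` as a legitimate fallback. A minimal sketch of the
# pattern (the helper name here is hypothetical):
NULL = object()

def first_group_or_default(match, default=NULL):
    try:
        return match.group(1)
    except (AttributeError, IndexError):
        if default is NULL:  # no fallback supplied -> re-raise
            raise
        return default
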
def rex_text_list(body, rex, flags=0):
    """
    Return found matches with stripped tags.
    """
    items = []
    for match in rex_list(body, rex, flags=flags):
        items.append(normalize_space(decode_entities(match.group(1))))
    return items

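# The same idea using only the standard library; `rex_list` and
# `decode_entities` are grab helpers, so this sketch assumes plain,
# non-nested markup and skips entity decoding:
import re

def rex_text_list_sketch(body, pattern, flags=0):
    return [re.sub(r'\s+', ' ', m.group(1)).strip()
            for m in re.finditer(pattern, body, flags)]

print(rex_text_list_sketch('<li> a </li><li>b</li>', r'<li>(.*?)</li>'))
# -> ['a', 'b']
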
def find_content_blocks(tree, min_length=None):
    """
    Find and return text content blocks (russian version).
    """
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # First, make a copy of the DOM tree to not harm external code
    tree = deepcopy(tree)

    # Completely remove content of the following tags
    nondata_tags = ['head', 'style', 'script']
    strip_elements(tree, *nondata_tags)

    # Remove comment nodes (keep tail text)
    strip_tags(tree, Comment)

    # Remove links
    strip_tags(tree, 'a')

    # Drop inline tags
    inline_tags = ('br', 'hr', 'p', 'b', 'i', 'strong', 'em', 'a', 'span', 'font')
    strip_tags(tree, *inline_tags)

    # Drop media tags
    media_tags = ('img',)
    strip_tags(tree, *media_tags)

    body = tostring(tree, encoding='utf-8').decode('utf-8')

    # Normalize spaces
    body = normalize_space(body)

    # Replace every remaining tag with the bare `<>` marker
    re_tag = re.compile(r'<[^>]+>')
    body = re_tag.sub(r'<>', body)

    # Collect text runs between the `<>` markers, filtering out short blocks,
    # blocks with too much "trash" punctuation, and blocks containing
    # implausibly long words
    block_rex = re.compile(r'[^<>]+')
    blocks = []
    for match in block_rex.finditer(body):
        block = match.group(0)
        if min_length is None or len(block) >= min_length:
            ratio = _trash_ratio(block)
            if ratio < 0.05:
                words = block.split()
                if not any(len(x) > 50 for x in words):
                    blocks.append(block)
    return blocks

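# `_trash_ratio` is defined elsewhere in the module; a plausible sketch that
# fits the 0.05 threshold above (an assumption, not the actual
# implementation): the share of non-word, non-space characters in a block.
import re

RE_TRASH_CHAR = re.compile(r'[^\w\s]', re.U)

def _trash_ratio_sketch(block):
    if not block:
        return 0.0
    return len(RE_TRASH_CHAR.findall(block)) / float(len(block))
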
def test_normalize_space(self):
    self.assertEqual('', normalize_space(' '))
    self.assertEqual('f', normalize_space(' f '))
    self.assertEqual('f b', normalize_space(' f b '))
    self.assertEqual(u'тр и гла за',
                     normalize_space(u' тр и гла' + '\t' + '\n' + u' за '))
    self.assertEqual(u'тр_и_гла_за',
                     normalize_space(u' тр и гла' + '\t' + '\n' + u' за ',
                                     replace='_'))
    self.assertEqual(u'трABCиABCглаABCза',
                     normalize_space(u' тр и гла' + '\t' + '\n' + u' за ',
                                     replace='ABC'))

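# A minimal sketch of the behaviour these tests exercise (not the library's
# actual implementation): strip the string, then collapse every whitespace
# run into the `replace` value.
import re

def normalize_space_sketch(value, replace=' '):
    return re.sub(r'\s+', replace, value.strip())

assert normalize_space_sketch('  f  b ') == 'f b'
assert normalize_space_sketch(u' тр и гла \t\n за ', replace='_') == u'тр_и_гла_за'
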
def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `regexp` expression in `body` text and then strip
    tags in found result.
    """
    match = rex(body, regexp, flags=flags, default=default)
    try:
        return normalize_space(decode_entities(match.group(1)))
    except AttributeError:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        else:
            return default

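# `decode_entities` resolves HTML entities before whitespace normalization.
# On Python 3 the standard library provides equivalent behaviour (a sketch;
# the library's own implementation may differ):
from html import unescape

assert unescape('Tom &amp; Jerry &gt; Spike') == 'Tom & Jerry > Spike'
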
def rex_text(self, regexp, flags=0, byte=False, default=NULL):
    """
    Search regular expression in response body and return content
    of first matching group.

    :param byte: if False then search is performed in
        `response.unicode_body()` else the rex is searched in
        `response.body`.
    """
    try:
        match = self.rex_search(regexp, flags=flags, byte=byte)
    except DataNotFound:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        else:
            return default
    else:
        return normalize_space(decode_entities(match.group(1)))

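# Typical usage of this response-level helper, assuming a configured Grab
# instance `g` (illustrative only; attribute names vary across grab versions):
#
#     g.go('http://example.com/')
#     title = g.response.rex_text(r'<title>(.*?)</title>', default='')
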