def _xpath(self, path, html, limit):
    """Recursively search HTML for content matching parsed XPath steps.

    path is a list of (counter, separator, tag, index, attributes) steps;
    the head step is consumed here and the remainder drives the recursion.
    html is the current context to search within; limit caps the number of
    results collected.

    Returns a list of matching content strings.
    Raises common.WebScrapingError for the unsupported '..' parent axis.
    """
    counter, separator, tag, index, attributes = path.pop(0)
    if counter == 0:
        # only count the search once, at the root step
        self.num_searches += 1

    results = []
    if tag == '..':
        # parent axis is not implemented
        # (fix: removed unreachable get_parent() append that followed this raise)
        raise common.WebScrapingError('.. not yet supported')
    elif tag == 'text()':
        # extract the text of this node, discarding child tags
        text = self._get_content(self._get_html(html))
        results.append(common.remove_tags(text, keep_children=False))
    elif tag.startswith('@'):
        # select an attribute value from the current node
        attr = tag[1:].lower()
        value = self._get_attributes(html).get(attr, '')
        results.append(value)
    else:
        # a regular tag step
        if counter > 0:
            # descend into the child html when not at the root
            html = self._get_content(html)
        # '' separator means direct children (/), otherwise all descendants (//)
        search_fn = self._find_children if separator == '' else self._find_descendants
        matches = search_fn(html, tag)

        # support negative indices by converting to the 1-based positive form
        if index is not None and index < 0:
            matches = list(matches)
            index += len(matches) + 1

        for child_i, child in enumerate(matches):
            # XPath indices are 1-based
            if index is None or index == child_i + 1:
                if not attributes or self._match_attributes(
                        attributes, self._get_attributes(child)):
                    if path:
                        # more steps remain - recurse with a copy of the path
                        results.extend(self._xpath(path[:], child, limit))
                    else:
                        # final step - collect the matched content
                        results.append(self._get_content(child))
                    if len(results) > limit:
                        break
    return results
def _xpath(self, path, html, limit):
    """Evaluate the head step of a parsed XPath against html, recursing on
    the remaining steps, and return the list of matched content strings."""
    counter, separator, tag, index, attributes = path.pop(0)
    if counter == 0:
        self.num_searches += 1

    if tag == '..':
        # parent navigation is unsupported
        raise common.WebScrapingError('.. not yet supported')

    if tag == 'text()':
        # child text with all tags stripped
        raw = self._get_content(self._get_html(html))
        return [common.remove_tags(raw, keep_children=False)]

    if tag.startswith('@'):
        # attribute lookup on the current node
        return [self._get_attributes(html).get(tag[1:].lower(), '')]

    # regular tag step
    if counter > 0:
        # not at the root, so search within this node's content
        html = self._get_content(html)
    # direct children for the '' separator, all descendants otherwise
    find = self._find_children if separator == '' else self._find_descendants
    matches = find(html, tag)

    if index is not None and index < 0:
        # translate a negative index into its 1-based positive equivalent
        matches = list(matches)
        index += len(matches) + 1

    results = []
    for position, node in enumerate(matches, start=1):
        if index is not None and index != position:
            continue  # does not match the requested 1-based index
        if attributes and not self._match_attributes(
                attributes, self._get_attributes(node)):
            continue  # attribute predicates not satisfied
        if path:
            # more steps remain - recurse with a copy of the remaining path
            results.extend(self._xpath(path[:], node, limit))
        else:
            # final step - collect this node's content
            results.append(self._get_content(node))
        if len(results) > limit:
            break
    return results
def get_excerpt(html, try_meta=False, max_chars=10000):
    """Extract an excerpt from this HTML by finding the largest text block.

    try_meta indicates whether to try extracting from the meta description tag
    max_chars is the maximum number of characters for the excerpt
    """
    excerpt = ''
    if try_meta:
        # prefer the page's own description when requested
        excerpt = xpath.get(html, '/html/head/meta[@name="description"]/@content')
    if not excerpt:
        # strip structural and heading tags, then pick the longest line of text
        bad_tags = 'hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'footer'
        content = common.remove_tags(xpath.get(html, '/html/body', remove=bad_tags))
        if content:
            ranked = ((len(line.strip()), line) for line in content.splitlines())
            excerpt = max(ranked)[1]
    return common.unescape(excerpt.strip())[:max_chars]
def get_excerpt(html, try_meta=False, max_chars=255):
    """Return a short excerpt for html: the meta description when try_meta
    is set, otherwise the largest block of text found in the body,
    truncated to max_chars characters."""
    excerpt = ''
    if try_meta:
        excerpt = xpath.get(html, '/html/head/meta[@name="description"]/@content')
    if not excerpt:
        # drop layout and heading tags before measuring text blocks
        bad_tags = ('hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
        body = xpath.get(html, '/html/body', remove=bad_tags)
        content = common.remove_tags(body)
        if content:
            # rank lines by stripped length; ties resolved by string order
            best = max((len(s.strip()), s) for s in content.splitlines())
            excerpt = best[1]
    return common.unescape(excerpt.strip())[:max_chars]
def search(html, xpath, remove=None):
    """Query HTML document using XPath

    remove is a list of tags to ignore

    >>> search('<span>1</span><div>abc<a>LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a>LINK 3</a>jkl</div>', '/div/a')
    ['LINK 1', 'LINK 3']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]/a[@class="link"]')
    ['LINK 1']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]//a')
    ['LINK 1', 'LINK 2']
    >>> search('<div>abc<a class="link">LINK 1</a></div>', '/div/a/@class')
    ['link']

    # test searching unicode
    >>> search(u'<a href="http://www.google.com" class="flink">google</a>', '//a[@class="flink"]')
    [u'google']

    # test scraping a large amount of content
    len(search('<div><span>!</span></div>' * 10000, '//span'))
    10000
    """
    orig_html = html
    html = clean_html(html, remove)
    contexts = [html]  # the initial context is the entire webpage
    parent_attributes = []
    for tag_i, (separator, tag, index, attributes) in enumerate(xpath_iter(xpath)):
        children = []
        if tag == '..':
            # the parent axis is not implemented
            raise common.WebScrapingError('.. not yet supported')
        elif tag == 'text()':
            # collect the text of each context, stripped of child tags
            for context in contexts:
                children.append(common.remove_tags(context, keep_children=False))
        elif tag.startswith('@'):
            # select an attribute value from each previously matched tag
            name = tag[1:].lower()
            for a in parent_attributes:
                children.append(a.get(name, ''))
        else:
            # a regular tag step
            parent_attributes = []
            for context in contexts:
                # '' separator selects direct children (/); otherwise descendants (//)
                search_fn = find_children if separator == '' else find_descendants
                matches = search_fn(context, tag)  # XXX change to iterator
                abs_index = index
                if abs_index is not None and abs_index < 0:
                    # convert a negative index to its 1-based positive form
                    abs_index += len(matches) + 1
                for child_i, child in enumerate(matches):
                    if index is None or abs_index == child_i + 1:
                        child_attributes = get_attributes(child)
                        if match_attributes(attributes, child_attributes):
                            # child satisfies the tag, index, and attribute filters
                            children.append(get_content(child))
                            parent_attributes.append(child_attributes)
        if not children and tag == 'tbody':
            # skip tbody, which firefox includes in xpaths even when absent
            pass
        else:
            contexts = children
        if not contexts:
            attributes_s = attributes and ''.join('[@%s="%s"]' % a for a in attributes) or ''
            common.logger.debug('No matches for <%s%s%s> (tag %d)' % (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1))
            break
    return contexts
def search(html, xpath, remove=None):
    """Query HTML document using XPath

    remove is a list of tags to ignore

    >>> search('<span>1</span><div>abc<a>LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a>LINK 3</a>jkl</div>', '/div/a')
    ['LINK 1', 'LINK 3']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]/a[@class="link"]')
    ['LINK 1']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]//a')
    ['LINK 1', 'LINK 2']
    >>> search('<div>abc<a class="link">LINK 1</a></div>', '/div/a/@class')
    ['link']

    # test searching unicode
    >>> search(u'<a href="http://www.google.com" class="flink">google</a>', '//a[@class="flink"]')
    [u'google']

    # test scraping a large amount of content
    len(search('<div><span>!</span></div>' * 10000, '//span'))
    10000
    """
    # fix: removed unused local orig_html
    html = clean_html(html, remove)
    contexts = [html]  # the initial context is the entire webpage
    parent_attributes = []
    for tag_i, (separator, tag, index, attributes) in enumerate(xpath_iter(xpath)):
        children = []
        if tag == '..':
            # the parent axis is not implemented
            raise common.WebScrapingError('.. not yet supported')
        elif tag == 'text()':
            # extract the text of each context, stripped of child tags
            for context in contexts:
                children.append(common.remove_tags(context, keep_children=False))
        elif tag.startswith('@'):
            # select attribute values from the previously matched tags
            name = tag[1:].lower()
            for attrs in parent_attributes:
                children.append(attrs.get(name, ''))
        else:
            # regular tag step: '' separator selects direct children (/),
            # otherwise all descendants (//).  The finder does not depend on
            # the context, so choose it once outside the loop
            # (was the non-idiomatic `cond and a or b` form inside the loop).
            find = find_children if separator == '' else find_descendants
            parent_attributes = []
            for context in contexts:
                matches = find(context, tag)  # XXX change to iterator
                abs_index = index
                if abs_index is not None and abs_index < 0:
                    # support negative indices via the 1-based positive form
                    abs_index += len(matches) + 1
                for child_i, child in enumerate(matches):
                    if index is None or abs_index == child_i + 1:
                        # matches index if defined
                        child_attributes = get_attributes(child)
                        if match_attributes(attributes, child_attributes):
                            # child matches tag and any defined indices or attributes
                            children.append(get_content(child))
                            parent_attributes.append(child_attributes)
        if not children and tag == 'tbody':
            # skip tbody, which firefox includes in xpaths even when absent
            pass
        else:
            contexts = children
        if not contexts:
            # conditional expression instead of the fragile `and/or` idiom
            attributes_s = ''.join('[@%s="%s"]' % a for a in attributes) if attributes else ''
            common.logger.debug('No matches for <%s%s%s> (tag %d)' % (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1))
            break
    return contexts