def test_drop_node(self):
    """Exercise drop_node with and without preserving the node content."""
    markup = """ <div><p>text<span>span</span><a href="#">link</a></p>tail</div>"""

    # Plain drop: the <p> element goes away together with everything
    # nested in it, while the text following it ("tail") is expected to stay.
    doc = fromstring(markup)
    drop_node(doc, './/p')
    rendered = render_html(doc, encoding='utf-8')
    self.assertEqual(rendered, b'<div>tail</div>')

    # keep_content=True: only the <span> tags are removed, the text they
    # wrapped is expected to be merged into the parent.
    doc = fromstring(markup)
    drop_node(doc, './/span', keep_content=True)
    rendered = render_html(doc, encoding='utf-8')
    self.assertEqual(
        rendered,
        b'<div><p>textspan<a href="#">link</a></p>tail</div>')
def test_drop_node(self):
    """Verify both drop_node modes: full removal and unwrap-only."""
    source_html = """ <div><p>text<span>span</span><a href="#">link</a></p>tail</div>"""

    # Mode 1 - remove the matched element entirely; the trailing text
    # after </p> must still be present in the output.
    dom = fromstring(source_html)
    drop_node(dom, './/p')
    expected_dropped = b'<div>tail</div>'
    self.assertEqual(render_html(dom, encoding='utf-8'), expected_dropped)

    # Mode 2 - strip only the <span> tag and leave its text in place.
    dom = fromstring(source_html)
    drop_node(dom, './/span', keep_content=True)
    expected_unwrapped = b'<div><p>textspan<a href="#">link</a></p>tail</div>'
    self.assertEqual(render_html(dom, encoding='utf-8'), expected_unwrapped)
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    Args:
        grab: object holding the fetched page; must provide
            ``response.code``, ``search()``, ``css_exists()``,
            ``css_list()`` and ``xpath_list()``.
        parse_index_size: when true, the index size is extracted from the
            page and stored in every result under ``index_size``. A callable
            may also be passed; it is then called with ``grab`` and its
            return value is used.
        strict_query: when true, results are discarded if the page reports
            that there were no results for the original query (i.e. google
            modified the query).

    Returns:
        A list of dicts with keys ``url``, ``title``, ``snippet``,
        ``filetype``, ``index_size`` and ``extended``, or ``None`` when
        the page contains no result headings, when the query was modified
        under ``strict_query``, or when only the ``#res`` container is found.

    Raises:
        AccessDenied: the response code is 403.
        CaptchaFound: the page contains the captcha image.
        ParsingError: neither ``#ires`` nor ``#res`` is present.
        TypeError: ``parse_index_size`` is true but no index size parser
            could be resolved.
    """

    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')
    if grab.search(u'src="/sorry/image'):
        raise CaptchaFound('Captcha found')

    if not grab.css_exists('#ires'):
        if grab.css_exists('#res'):
            # Could be search results here,
            # or just the message "nothing was found" - not handled.
            return None
        raise ParsingError('Could not identify google page format')

    # Parentheses matter: without them the expression is evaluated as
    # "(strict_query and A) or B", so the English notice discarded the
    # results even when strict_query was disabled.
    if strict_query and (
            grab.search(u'Нет результатов для') or
            grab.search(u'No results found for')):
        logging.debug('Query modified')
        return None

    if not grab.css_list('#ires h3'):
        # Nothing was found
        return None

    index_size = None
    if parse_index_size:
        # The parameter shadows the module-level parse_index_size() helper,
        # so calling the name directly would call the boolean flag itself.
        # NOTE(review): assumes the helper is defined at module level in
        # this file - confirm.
        size_parser = parse_index_size
        if not callable(size_parser):
            size_parser = globals().get('parse_index_size')
        if not callable(size_parser):
            raise TypeError('parse_index_size helper is not available')
        index_size = size_parser(grab)

    results = []
    for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
        title_elem = elem.xpath('h3/a')[0]

        # url: google may wrap the target into a "/url?q=<target>&..."
        # redirect link
        url = title_elem.get('href')
        if url.startswith('/url?'):
            url = url.split('?q=')[1].split('&')[0]
            url = unquote_plus(url)

        # title
        title = get_node_text(title_elem)

        # snippet
        # Google could offer two types of snippet format: simple and
        # extended. It depends on the user agent.
        # For <IE8, Opera, <FF3 you probably get the simple format.
        try:
            snippet_node = elem.xpath('div[@class="s"]')[0]
        except IndexError:
            # Probably it is a video or some other result type
            # which is not supported yet
            continue

        try:
            subnode = snippet_node.xpath('span[@class="st"]')[0]
        except IndexError:
            # Simple format: strip the auxiliary nodes and use
            # the remaining text of the snippet container
            drop_node(snippet_node, 'div')
            drop_node(snippet_node, 'span[@class="f"]')
            snippet = get_node_text(snippet_node, smart=False)
            extended_result = False
        else:
            snippet = get_node_text(subnode, smart=False)
            extended_result = True

        # filetype: a marker such as "[PDF]" rendered next to the title
        try:
            filetype = elem.xpath(
                './/span[contains(@class, "xsm")]/text()'
            )[0].lower().strip('[]')
        except IndexError:
            filetype = None

        if url:
            results.append({
                'url': url,
                'title': title,
                'snippet': snippet,
                'filetype': filetype,
                'index_size': index_size,
                'extended': extended_result,
            })
    return results