Esempio n. 1
0
    def test_drop_node(self):
        """Check drop_node both with and without ``keep_content``."""
        markup = """
            <div><p>text<span>span</span><a href="#">link</a></p>tail</div>"""

        # (xpath, extra drop_node options, expected serialized tree)
        scenarios = (
            ('.//p', {}, b'<div>tail</div>'),
            ('.//span', {'keep_content': True},
             b'<div><p>textspan<a href="#">link</a></p>tail</div>'),
        )
        for path, options, expected in scenarios:
            # Every scenario starts from a freshly parsed tree because
            # drop_node is applied to the tree object itself.
            doc = fromstring(markup)
            drop_node(doc, path, **options)
            self.assertEqual(render_html(doc, encoding='utf-8'), expected)
Esempio n. 2
0
    def test_drop_node(self):
        """Verify the output of drop_node for a plain drop and a
        content-preserving drop."""
        source = """
            <div><p>text<span>span</span><a href="#">link</a></p>tail</div>"""

        def rendered_after_drop(path, **kwargs):
            # Parse a fresh tree for each check, drop the node and
            # serialize the result.
            doc = fromstring(source)
            drop_node(doc, path, **kwargs)
            return render_html(doc, encoding='utf-8')

        self.assertEqual(rendered_after_drop('.//p'), b'<div>tail</div>')
        self.assertEqual(
            rendered_after_drop('.//span', keep_content=True),
            b'<div><p>textspan<a href="#">link</a></p>tail</div>')
Esempio n. 3
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    :param grab: object holding the fetched page. It must provide
        ``response.code``, ``search()``, ``css_exists()``, ``css_list()``
        and ``xpath_list()``.
    :param parse_index_size: when truthy, the index size is computed once
        and stored in every result under the ``index_size`` key. A callable
        is invoked as ``parse_index_size(grab)``; any other truthy value
        delegates to the module-level ``parse_index_size`` function.
        When falsy, ``index_size`` is ``None``.
    :param strict_query: when true, a page containing the
        "No results found for" notice (English or Russian) is not parsed
        and ``None`` is returned.
    :returns: list of dicts with the keys ``url``, ``title``, ``snippet``,
        ``filetype``, ``index_size`` and ``extended``. ``None`` is returned
        when the page holds nothing to parse: the query was modified (strict
        mode only), ``#ires`` has no ``h3`` headings, or only the ``#res``
        container is present.
    :raises AccessDenied: the response code is 403.
    :raises CaptchaFound: the page embeds the ``/sorry/image`` captcha.
    :raises ParsingError: neither ``#ires`` nor ``#res`` is present.
    """

    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')

    if grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaFound('Captcha found')

    if grab.css_exists('#ires'):
        # The parentheses matter: without them ``and`` binds tighter than
        # ``or`` and the English notice would be honoured even when
        # strict_query is False.
        if strict_query and (
                grab.search(u'Нет результатов для') or
                grab.search(u'No results found for')):
            logging.debug('Query modified')
            return None

        if not len(grab.css_list('#ires h3')):
            return None

        # Something was found
        index_size = _resolve_index_size(grab, parse_index_size)

        results = []
        for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
            item = _parse_result_item(elem, index_size)
            if item is not None:
                results.append(item)
        return results

    if grab.css_exists('#res'):
        # Could be search results here?
        # or just message "nothing was found"?
        return None

    raise ParsingError('Could not identify google page format')


def _resolve_index_size(grab, requested):
    """Return the index size for the page, or None when it was not requested."""
    if not requested:
        return None
    if callable(requested):
        return requested(grab)
    # Inside parse_search_results the name ``parse_index_size`` is the
    # flag argument, so the lookup is done here where it refers to the
    # module-level function.
    # NOTE(review): assumes a module-level parse_index_size(grab) exists
    # in this file - confirm.
    return parse_index_size(grab)


def _parse_result_item(elem, index_size):
    """Build the result dict for one result element, or None to skip it."""
    title_elem = elem.xpath('h3/a')[0]

    # url
    url = title_elem.get('href')
    if not url:
        # An anchor without a usable href cannot produce a result.
        return None
    if url.startswith('/url?'):
        url = url.split('?q=')[1].split('&')[0]
        url = unquote_plus(url)

    # title
    title = get_node_text(title_elem)

    # snippet
    # Google could offer two type of snippet format: simple and extended
    # It depends on user agent
    # For <IE8, Opera, <FF3 you probably get simple format
    snippet_nodes = elem.xpath('div[@class="s"]')
    if not snippet_nodes:
        # Probably it is video or some other result
        # Such result type is not supported yet
        return None
    snippet_node = snippet_nodes[0]

    extended_nodes = snippet_node.xpath('span[@class="st"]')
    if extended_nodes:
        snippet = get_node_text(extended_nodes[0], smart=False)
        extended_result = True
    else:
        # Simple format: remove the nested blocks before taking the text.
        drop_node(snippet_node, 'div')
        drop_node(snippet_node, 'span[@class="f"]')
        snippet = get_node_text(snippet_node, smart=False)
        extended_result = False

    # filetype
    labels = elem.xpath('.//span[contains(@class, "xsm")]/text()')
    filetype = labels[0].lower().strip('[]') if labels else None

    # The url may have become empty after extracting the ``q`` parameter.
    if not url:
        return None

    return {
        'url': url,
        'title': title,
        'snippet': snippet,
        'filetype': filetype,
        'index_size': index_size,
        'extended': extended_result,
    }