Beispiel #1
0
    def test_drop_node(self):
        """Check drop_node with and without the keep_content option."""
        HTML = """
            <div><p>text<span>span</span><a href="#">link</a></p>tail</div>"""

        # Default mode: the matched <p> goes away together with everything
        # inside it, while the text that follows it ("tail") stays in <div>.
        tree = fromstring(HTML)
        drop_node(tree, './/p')
        # assertEqual (instead of assertTrue(a == b)) reports both values
        # when the comparison fails.
        self.assertEqual(render_html(tree), b'<div>tail</div>')

        # keep_content=True: only the <span> tags are removed, the text they
        # wrapped is merged into the surrounding <p>.
        tree = fromstring(HTML)
        drop_node(tree, './/span', keep_content=True)
        self.assertEqual(
            render_html(tree),
            b'<div><p>textspan<a href="#">link</a></p>tail</div>')
Beispiel #2
0
    def test_drop_node(self):
        """Check drop_node with and without the keep_content option."""
        HTML = """
            <div><p>text<span>span</span><a href="#">link</a></p>tail</div>
        """

        # Default mode: the matched <p> goes away together with everything
        # inside it, while the text that follows it ("tail") stays in <div>.
        tree = fromstring(HTML)
        drop_node(tree, './/p')
        # assertEqual (instead of assertTrue(a == b)) reports both values
        # when the comparison fails.
        self.assertEqual(render_html(tree), '<div>tail</div>')

        # keep_content=True: only the <span> tags are removed, the text they
        # wrapped is merged into the surrounding <p>.
        tree = fromstring(HTML)
        drop_node(tree, './/span', keep_content=True)
        self.assertEqual(
            render_html(tree),
            '<div><p>textspan<a href="#">link</a></p>tail</div>')
Beispiel #3
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    :param grab: object holding the fetched page. Only its
        ``response.code``, ``search()``, ``css_exists()``, ``css_list()``
        and ``xpath_list()`` members are used here.
    :param parse_index_size: when true, the index size is parsed once per
        page and stored under ``index_size`` in every result. A callable
        may be passed instead of a flag; it is then called with ``grab``.
    :param strict_query: when true, a page containing the
        "No results found for" notice (English or Russian) is logged as
        "Query modified" and nothing is parsed from it.
    :returns: list of dicts with the keys ``url``, ``title``, ``snippet``,
        ``filetype``, ``index_size`` and ``extended``; ``None`` when the
        query was modified, when ``#ires`` holds no ``h3`` headings or
        when only ``#res`` is present.
    :raises AccessDenied: the response code is 403.
    :raises CaptchaFound: the page references the ``/sorry/image`` captcha.
    :raises ParsingError: neither ``#ires`` nor ``#res`` is on the page.
    """

    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')

    if grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaFound('Captcha found')

    if not grab.css_exists('#ires'):
        if grab.css_exists('#res'):
            # Could be search results here?
            # or just message "nothing was found"?
            return None
        raise ParsingError('Could not identify google page format')

    # The parentheses are essential: ``and`` binds tighter than ``or``, so
    # without them the English notice aborted parsing even when
    # strict_query was off.
    if strict_query and (
            grab.search(u'Нет результатов для') or
            grab.search(u'No results found for')):
        logging.debug('Query modified')
        return None

    if not grab.css_list('#ires h3'):
        # Nothing was found
        return None

    index_size = None
    if parse_index_size:
        # The parameter shadows any module-level callable of the same name,
        # so a plain ``True`` cannot be called. Use the argument when it is
        # callable, otherwise fall back to the module-level function.
        # NOTE(review): assumes a module-level parse_index_size(grab)
        # exists in this file -- confirm.
        if callable(parse_index_size):
            size_parser = parse_index_size
        else:
            size_parser = globals().get('parse_index_size')
        if not callable(size_parser):
            raise TypeError(
                'parse_index_size was requested but no callable '
                'parse_index_size(grab) is available')
        index_size = size_parser(grab)

    results = []

    for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
        title_elem = elem.xpath('h3/a')[0]

        # url: links of the form "/url?q=<target>&..." wrap the real target
        url = title_elem.get('href')
        if url.startswith('/url?'):
            url = url.split('?q=')[1].split('&')[0]
            url = unquote_plus(url)

        # title
        title = get_node_text(title_elem)

        # snippet
        # Google could offer two type of snippet format: simple and extended
        # It depends on user agent
        # For <IE8, Opera, <FF3 you probably get simple format
        try:
            snippet_node = elem.xpath('div[@class="s"]')[0]
        except IndexError:
            # Probably it is video or some other result
            # Such result type is not supported yet
            continue

        try:
            subnode = snippet_node.xpath('span[@class="st"]')[0]
            snippet = get_node_text(subnode, smart=False)
            extended_result = True
        except IndexError:
            # Simple format: strip nested blocks before reading the text
            drop_node(snippet_node, 'div')
            drop_node(snippet_node, 'span[@class="f"]')
            snippet = get_node_text(snippet_node, smart=False)
            extended_result = False

        # filetype: text of the "xsm" span without the square brackets
        try:
            filetype = elem.xpath(
                './/span[contains(@class, "xsm")]/text()'
            )[0].lower().strip('[]')
        except IndexError:
            filetype = None

        if url:
            results.append({
                'url': url,
                'title': title,
                'snippet': snippet,
                'filetype': filetype,
                'index_size': index_size,
                'extended': extended_result,
            })

    return results
Beispiel #4
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    :param grab: object holding the fetched page. Only its
        ``response.code``, ``search()``, ``css_exists()``, ``css_list()``
        and ``xpath_list()`` members are used here.
    :param parse_index_size: when true, the index size is parsed once per
        page and stored under ``index_size`` in every result. A callable
        may be passed instead of a flag; it is then called with ``grab``.
    :param strict_query: when true, a page containing the
        "No results found for" notice (English or Russian) is logged as
        "Query modified" and nothing is parsed from it.
    :returns: list of dicts with the keys ``url``, ``title``, ``snippet``,
        ``filetype``, ``index_size`` and ``extended``; ``None`` when the
        query was modified, when ``#ires`` holds no ``h3`` headings or
        when only ``#res`` is present.
    :raises AccessDenied: the response code is 403.
    :raises CaptchaFound: the page references the ``/sorry/image`` captcha.
    :raises ParsingError: neither ``#ires`` nor ``#res`` is on the page.
    """

    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')

    if grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaFound('Captcha found')

    if not grab.css_exists('#ires'):
        if grab.css_exists('#res'):
            # Could be search results here?
            # or just message "nothing was found"?
            return None
        raise ParsingError('Could not identify google page format')

    # The parentheses are essential: ``and`` binds tighter than ``or``, so
    # without them the English notice aborted parsing even when
    # strict_query was off.
    if strict_query and (
            grab.search(u'Нет результатов для') or
            grab.search(u'No results found for')):
        logging.debug('Query modified')
        return None

    if not grab.css_list('#ires h3'):
        # Nothing was found
        return None

    index_size = None
    if parse_index_size:
        # The parameter shadows any module-level callable of the same name,
        # so a plain ``True`` cannot be called. Use the argument when it is
        # callable, otherwise fall back to the module-level function.
        # NOTE(review): assumes a module-level parse_index_size(grab)
        # exists in this file -- confirm.
        if callable(parse_index_size):
            size_parser = parse_index_size
        else:
            size_parser = globals().get('parse_index_size')
        if not callable(size_parser):
            raise TypeError(
                'parse_index_size was requested but no callable '
                'parse_index_size(grab) is available')
        index_size = size_parser(grab)

    results = []

    for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
        title_elem = elem.xpath('h3/a')[0]

        # url: links of the form "/url?q=<target>&..." wrap the real target
        url = title_elem.get('href')
        if url.startswith('/url?'):
            url = url.split('?q=')[1].split('&')[0]
            url = unquote_plus(url)

        # title
        title = get_node_text(title_elem)

        # snippet
        # Google could offer two type of snippet format: simple and extended
        # It depends on user agent
        # For <IE8, Opera, <FF3 you probably get simple format
        try:
            snippet_node = elem.xpath('div[@class="s"]')[0]
        except IndexError:
            # Probably it is video or some other result
            # Such result type is not supported yet
            continue

        try:
            subnode = snippet_node.xpath('span[@class="st"]')[0]
            snippet = get_node_text(subnode, smart=False)
            extended_result = True
        except IndexError:
            # Simple format: strip nested blocks before reading the text
            drop_node(snippet_node, 'div')
            drop_node(snippet_node, 'span[@class="f"]')
            snippet = get_node_text(snippet_node, smart=False)
            extended_result = False

        # filetype: text of the "xsm" span without the square brackets
        try:
            filetype = elem.xpath(
                './/span[contains(@class, "xsm")]/text()'
            )[0].lower().strip('[]')
        except IndexError:
            filetype = None

        if url:
            results.append({
                'url': url,
                'title': title,
                'snippet': snippet,
                'filetype': filetype,
                'index_size': index_size,
                'extended': extended_result,
            })

    return results
Beispiel #5
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    :param grab: object holding the fetched page. Only its ``search()``,
        ``css_exists()``, ``css_list()`` and ``xpath_list()`` members are
        used here.
    :param parse_index_size: when true, the index size is parsed once per
        page and stored under ``index_size`` in every result. A callable
        may be passed instead of a flag; it is then called with ``grab``.
    :param strict_query: when true, a page containing the
        "No results found for" notice (English or Russian) is logged as
        "Query modified" and nothing is parsed from it.
    :returns: list of dicts with the keys ``url``, ``title``, ``snippet``,
        ``filetype``, ``index_size`` and ``extended``; ``None`` when the
        query was modified, when ``#ires`` holds no ``h3`` headings or
        when only ``#res`` is present.
    :raises CaptchaError: the page references the ``/sorry/image`` captcha.
    :raises ParsingError: neither ``#ires`` nor ``#res`` is on the page.
    """

    if grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaError("Captcha found")

    if not grab.css_exists("#ires"):
        if grab.css_exists("#res"):
            # Could be search results here?
            # or just message "nothing was found"?
            return None
        raise ParsingError("Could not identify google page format")

    if strict_query and (grab.search(u"Нет результатов для") or grab.search(u"No results found for")):
        logging.debug("Query modified")
        return None

    if not grab.css_list("#ires h3"):
        # Nothing was found
        return None

    index_size = None
    if parse_index_size:
        # The parameter shadows any module-level callable of the same name,
        # so a plain ``True`` cannot be called. Use the argument when it is
        # callable, otherwise fall back to the module-level function.
        # NOTE(review): assumes a module-level parse_index_size(grab)
        # exists in this file -- confirm.
        if callable(parse_index_size):
            size_parser = parse_index_size
        else:
            size_parser = globals().get("parse_index_size")
        if not callable(size_parser):
            raise TypeError(
                "parse_index_size was requested but no callable "
                "parse_index_size(grab) is available"
            )
        index_size = size_parser(grab)

    results = []

    for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
        title_elem = elem.xpath("h3/a")[0]

        # url: links of the form "/url?q=<target>&..." wrap the real target
        url = title_elem.get("href")
        if url.startswith("/url?"):
            url = url.split("?q=")[1].split("&")[0]
            url = unquote_plus(url)

        # title
        title = get_node_text(title_elem)

        # snippet
        # Google could offer two type of snippet format: simple and extended
        # It depends on user agent
        # For <IE8, Opera, <FF3 you probably get simple format
        try:
            snippet_node = elem.xpath('div[@class="s"]')[0]
        except IndexError:
            # Probably it is video or some other result
            # Such result type is not supported yet
            continue

        try:
            subnode = snippet_node.xpath('span[@class="st"]')[0]
            snippet = get_node_text(subnode, smart=False)
            extended_result = True
        except IndexError:
            # Simple format: strip nested blocks before reading the text
            drop_node(snippet_node, "div")
            drop_node(snippet_node, 'span[@class="f"]')
            snippet = get_node_text(snippet_node, smart=False)
            extended_result = False

        # filetype: text of the "xsm" span without the square brackets
        try:
            filetype = elem.xpath('.//span[contains(@class, "xsm")]/text()')[0].lower().strip("[]")
        except IndexError:
            filetype = None

        if url:
            results.append(
                {
                    "url": url,
                    "title": title,
                    "snippet": snippet,
                    "filetype": filetype,
                    "index_size": index_size,
                    "extended": extended_result,
                }
            )

    return results