Example #1
def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    number_of_results_element =\
        eval_xpath_getindex(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()',
                            0, default=None)
    if number_of_results_element is not None:
        number_of_results_string = re.sub('[^0-9]', '',
                                          number_of_results_element)
        results.append({'number_of_results': int(number_of_results_string)})

    for result in eval_xpath_list(dom,
                                  '//section[not(contains(@class, "essay"))]'):
        url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
        url = urljoin(base_url, url)
        title = eval_xpath(result, 'string(.//h2/a)').strip()
        content = extract_text(eval_xpath(result, './/p'))
        # append result
        results.append({'url': url, 'title': title, 'content': content})

    return results
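Every example below follows the same shape: a post-response callback that receives a requests response object and returns a list of result dicts. A hedged harness for exercising such a parser offline; FakeResponse and the sample page are illustrative only, and the engine's module-level globals (base_url, the eval_xpath* helpers, re, urljoin) must already be in scope:

class FakeResponse:
    """Stands in for the requests response object; only .text is used here."""
    def __init__(self, text):
        self.text = text


sample = ('<html><body><section>'
          '<h2><a href="/test">Test title</a></h2>'
          '<p>snippet text</p></section></body></html>')

for item in response(FakeResponse(sample)):
    print(item)  # {'url': ..., 'title': 'Test title', 'content': 'snippet text'}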
Example #2
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []

    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom, '//div[@id="searchpage-root"]//div[@data-dot="results"]/div'):
        dot_data = eval_xpath_getindex(result_element, './div/div[@data-dot-data]/@data-dot-data', 0, default=None)
        if dot_data is None:
            title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
            results.append({
                'url': title_element.get('href'),
                'title': extract_text(title_element),
                'content': extract_text(eval_xpath_getindex(title_element, '../../div[2]', 0)),
            })
        elif dot_data == '{"reporter_name":"hint/related/relates"}':
            suggestions_element = eval_xpath_getindex(result_element,
                                                      './div/div[@data-dot="main-box"]', 0, default=None)
            if suggestions_element is not None:
                for suggestion in eval_xpath_list(suggestions_element, './/ul/li'):
                    results.append({'suggestion': extract_text(suggestion)})

    return results
Example #3
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
        url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None)
        if url is None:
            continue
        url = parse_url(url)

        title = eval_xpath_getindex(result, './/h3/a', 0, default=None)
        if title is None:
            continue
        offset = len(extract_text(title.xpath('span')))
        title = extract_text(title)[offset:]

        content = eval_xpath_getindex(
            result, './/div[contains(@class, "compText")]', 0, default=''
        )
        if content:
            content = extract_text(content)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    return results
Example #4
def response(resp):
    if resp.url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []

    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom,
                                          '//div[@data-dot="results"]/div'):
        result_data = eval_xpath_getindex(result_element,
                                          './/div[contains(@class, "bec586")]',
                                          0,
                                          default=None)
        if result_data is None:
            continue
        title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
        results.append({
            'url': title_element.get('href'),
            'title': extract_text(title_element),
            'content': extract_text(eval_xpath(result_data, './/div[@class="_3eded7"]')),
        })

    return results
Example #5
def response(resp):
    results = []

    rss = etree.fromstring(resp.content)

    ns = rss.nsmap

    # parse results
    for item in rss.xpath('./channel/item'):
        # url / title / content
        url = url_cleanup(
            eval_xpath_getindex(item, './link/text()', 0, default=None))
        title = eval_xpath_getindex(item, './title/text()', 0, default=url)
        content = eval_xpath_getindex(item,
                                      './description/text()',
                                      0,
                                      default='')

        # publishedDate
        publishedDate = eval_xpath_getindex(item,
                                            './pubDate/text()',
                                            0,
                                            default=None)
        try:
            publishedDate = parser.parse(publishedDate, dayfirst=False)
        except (TypeError, ValueError):
            # missing or unparsable date: fall back to the current time
            publishedDate = datetime.now()

        # thumbnail
        thumbnail = eval_xpath_getindex(item,
                                        XPath('./News:Image/text()',
                                              namespaces=ns),
                                        0,
                                        default=None)
        if thumbnail is not None:
            thumbnail = image_url_cleanup(thumbnail)

        # append result (with thumbnail only when one was found)
        result = {
            'url': url,
            'title': title,
            'publishedDate': publishedDate,
            'content': content
        }
        if thumbnail is not None:
            result['img_src'] = thumbnail
        results.append(result)

    # return results
    return results
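url_cleanup and image_url_cleanup are not shown in this example; a plausible sketch of the former, assuming the links are tracking redirects that carry the target in a url= query parameter (the path checked here is an assumption):

from urllib.parse import urlparse, parse_qsl


def url_cleanup(url_string):
    """Return the redirect target when url_string is a tracking link."""
    if url_string is None:
        return None
    parsed_url = urlparse(url_string)
    # assumption: redirect links look like .../apiclick.aspx?...&url=<target>
    if parsed_url.path.endswith('/apiclick.aspx'):
        query = dict(parse_qsl(parsed_url.query))
        return query.get('url', url_string)
    return url_string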
Example #6
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    vidthumb_imgdata = scrap_out_thumbs(dom)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
        url = eval_xpath_getindex(result, href_xpath, 0)
        c_node = eval_xpath_getindex(result, content_xpath, 0)

        # <img id="vidthumb1" ...>
        img_id = eval_xpath_getindex(c_node,
                                     './div[1]//a/g-img/img/@id',
                                     0,
                                     default=None)
        if img_id is None:
            continue
        img_src = vidthumb_imgdata.get(img_id, None)
        if not img_src:
            logger.error("no vidthumb imgdata for: %s" % img_id)
            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src',
                                          0)

        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'length': length,
            'author': pub_info,
            'thumbnail': img_src,
            'template': 'videos.html',
        })

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    return results
Example #7
def response(resp):

    headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
    get(url_ping, headers=headers_ping)

    if resp.status_code == 303:
        return []

    results = []
    doc = fromstring(resp.text)

    result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
    if len(result_table) < 3:
        # no more results
        return []
    result_table = result_table[2]

    tr_rows = eval_xpath(result_table, './/tr')

    # the last <tr> contains the form with the 'previous/next page' links
    tr_rows = tr_rows[:-1]

    len_tr_rows = len(tr_rows)
    offset = 0

    while len_tr_rows >= offset + 4:

        # assemble the table rows we need to scrape
        tr_title = tr_rows[offset]
        tr_content = tr_rows[offset + 1]
        offset += 4

        # skip sponsored ads: <tr class="result-sponsored">
        if tr_content.get('class') == 'result-sponsored':
            continue

        a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
        if a_tag is None:
            continue

        td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
        if td_content is None:
            continue

        results.append({
            'title': a_tag.text_content(),
            'content': extract_text(td_content),
            'url': a_tag.get('href'),
        })

    return results
Example #8
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//div[@class="g"]'):

        title = extract_text(eval_xpath(result, './/h3'))
        url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
        content = extract_text(eval_xpath(result, './/span[@class="st"]'))

        # get thumbnails: a page-level script maps image ids to base64 data;
        # initialize the default first so results without an id keep ''
        thumbnail = ''
        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
        ids = result.xpath('.//div[@class="s"]//img/@id')
        if len(ids) > 0:
            thumbnails_data = \
                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
                           script)
            tmp = []
            if len(thumbnails_data) != 0:
                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
            if len(tmp) != 0:
                thumbnail = tmp[-1]

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

    return results
Example #9
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(
            dom, '//table[contains(@class, "table-list")]/tbody//tr'):
        href = urljoin(
            url,
            eval_xpath_getindex(result,
                                './td[contains(@class, "name")]/a[2]/@href',
                                0))
        title = extract_text(
            eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
        seed = extract_text(
            eval_xpath(result, './/td[contains(@class, "seeds")]'))
        leech = extract_text(
            eval_xpath(result, './/td[contains(@class, "leeches")]'))
        filesize_info = extract_text(
            eval_xpath(result, './/td[contains(@class, "size")]/text()'))
        filesize, filesize_multiplier = filesize_info.split()
        filesize = get_torrent_size(filesize, filesize_multiplier)

        results.append({
            'url': href,
            'title': title,
            'seed': seed,
            'leech': leech,
            'filesize': filesize,
            'template': 'torrent.html'
        })

    return results
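get_torrent_size (also used in example #14 below) turns the split size string and unit into a byte count; a rough sketch, assuming a conventional unit table (the real helper's multipliers may differ):

def get_torrent_size(filesize, filesize_multiplier):
    """Convert e.g. ('3.5', 'GB') into bytes, or None on bad input."""
    multipliers = {
        'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3, 'TB': 1024 ** 4,
        'KIB': 1024, 'MIB': 1024 ** 2, 'GIB': 1024 ** 3, 'TIB': 1024 ** 4,
    }
    try:
        return int(float(filesize) * multipliers.get(filesize_multiplier.upper(), 1))
    except ValueError:
        return None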
Example #10
    def test_eval_xpath_getindex(self):
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        # check index 0
        self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 0),
                         'italic')

        # default is 'something'
        self.assertEqual(
            utils.eval_xpath_getindex(doc,
                                      '//i/text()',
                                      1,
                                      default='something'), 'something')

        # default is None
        self.assertEqual(
            utils.eval_xpath_getindex(doc, '//i/text()', 1, default=None),
            None)

        # index not found
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_getindex(doc, '//i/text()', 1)
        self.assertEqual(context.exception.message, 'index 1 not found')

        # not a list
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_getindex(doc, 'count(//i)', 1)
        self.assertEqual(context.exception.message, 'the result is not a list')
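The test above pins down the eval_xpath_getindex contract used throughout these examples: an index hit returns the value, an explicit default suppresses the error, and a missing index or a non-list XPath result raises SearxEngineXPathException. A minimal sketch consistent with that behaviour (the real searx.utils versions add XPath compilation caching and richer exception context):

from lxml import etree


class SearxEngineXPathException(Exception):
    def __init__(self, message):
        super().__init__(message)
        self.message = message


_NOTSET = object()


def eval_xpath(element, xpath_spec):
    """Evaluate an XPath expression (string or compiled etree.XPath)."""
    if isinstance(xpath_spec, str):
        xpath_spec = etree.XPath(xpath_spec)
    return xpath_spec(element)


def eval_xpath_list(element, xpath_spec):
    """Evaluate an XPath expression and require a list result."""
    result = eval_xpath(element, xpath_spec)
    if not isinstance(result, list):
        raise SearxEngineXPathException('the result is not a list')
    return result


def eval_xpath_getindex(element, xpath_spec, index, default=_NOTSET):
    """Return the index-th match, or default (when given) if out of range."""
    result = eval_xpath_list(element, xpath_spec)
    if -len(result) <= index < len(result):
        return result[index]
    if default is not _NOTSET:
        return default
    raise SearxEngineXPathException('index {} not found'.format(index))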
Example #11
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):

        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
        if url is None:
            continue
        url = parse_url(url)
        title = extract_text(result.xpath('.//h4/a'))
        content = extract_text(result.xpath('.//p'))
        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)

        item = {
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src
        }

        pub_date = extract_text(
            result.xpath('.//span[contains(@class,"s-time")]'))
        ago = AGO_RE.search(pub_date)
        if ago:
            number = int(ago.group(1))
            delta = AGO_TIMEDELTA[ago.group(2)]
            pub_date = datetime.now() - delta * number
        else:
            try:
                pub_date = parser.parse(pub_date)
            except parser.ParserError:
                pub_date = None

        if pub_date is not None:
            item['publishedDate'] = pub_date
        results.append(item)

    # parse suggestions (once per page, outside the result loop)
    for suggestion in eval_xpath_list(
            dom, '//div[contains(@class,"AlsoTry")]//td'):
        results.append({'suggestion': extract_text(suggestion)})

    return results
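AGO_RE and AGO_TIMEDELTA are defined at module level in this engine; a plausible sketch matching how the loop above uses them, with group(1) as the count and group(2) as the unit key (the exact pattern is an assumption):

import re
from datetime import timedelta

AGO_RE = re.compile(r'([0-9]+)\s*(second|minute|hour|day|week|month|year)s?\s*ago')
AGO_TIMEDELTA = {
    'second': timedelta(seconds=1),
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),   # approximation
    'year': timedelta(days=365),   # approximation
}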
Example #12
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    for entry in eval_xpath_list(dom, '//entry'):
        title = eval_xpath_getindex(entry, './/title', 0).text

        url = eval_xpath_getindex(entry, './/id', 0).text

        content_string = '{doi_content}{abstract_content}'

        abstract = eval_xpath_getindex(entry, './/summary', 0).text

        #  If a DOI is available, add it to the snippet
        doi_element = eval_xpath_getindex(entry,
                                          './/link[@title="doi"]',
                                          0,
                                          default=None)
        doi_content = doi_element.text if doi_element is not None else ''
        content = content_string.format(doi_content=doi_content,
                                        abstract_content=abstract)

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        publishedDate = datetime.strptime(
            eval_xpath_getindex(entry, './/published', 0).text,
            '%Y-%m-%dT%H:%M:%SZ')

        res_dict = {
            'url': url,
            'title': title,
            'publishedDate': publishedDate,
            'content': content
        }

        results.append(res_dict)

    return results
Example #13
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom,
            "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"
    ):

        link = eval_xpath_getindex(result, './/h5/a', 0)

        url = base_url + link.attrib.get('href') + '#downloads'
        title = extract_text(link)
        img_src = base_url + eval_xpath_getindex(result, './/img/@src', 0)
        res = {'url': url, 'title': title, 'img_src': img_src}

        results.append(res)

    return results
Example #14
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    for result in eval_xpath_list(dom, xpath_results):
        # defaults
        filesize = 0
        magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce"

        category = extract_text(
            eval_xpath_getindex(result, xpath_category, 0, default=[]))
        page_a = eval_xpath_getindex(result, xpath_title, 0)
        title = extract_text(page_a)
        href = base_url + page_a.attrib.get('href')

        magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5])

        filesize_info = eval_xpath_getindex(result,
                                            xpath_filesize,
                                            0,
                                            default=None)
        if filesize_info:
            try:
                filesize = filesize_info[:-2]
                filesize_multiplier = filesize_info[-2:]
                filesize = get_torrent_size(filesize, filesize_multiplier)
            except Exception:
                # malformed size info: keep the default of 0
                pass
        # download/seed/leech counts are omitted: they appear to be generated randomly on every request
        content = 'Category: "{category}".'
        content = content.format(category=category)

        results.append({
            'url': href,
            'title': title,
            'content': content,
            'filesize': filesize,
            'magnetlink': magnet_link,
            'template': 'torrent.html'
        })
    return results
Example #15
def response(resp):
    results = []
    xmldom = etree.fromstring(resp.content)
    xmlsearchresult = eval_xpath_getindex(xmldom, '//searchresult', 0)
    dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div')
    for link in eval_xpath_list(dom, '/div/table/tr/td/div[2]//a'):
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)
        thumbnail_src = urljoin(
            gallery_url,
            eval_xpath_getindex(link, './/img', 0).attrib['src'])

        # append result
        results.append({
            'url': url,
            'title': title,
            'img_src': thumbnail_src,
            'content': '',
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    # return results
    return results
Example #16
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom,
            './/div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'
    ):

        link = eval_xpath_getindex(result, './/h5/a', 0)
        url = base_url + link.attrib.get('href') + '#downloads'
        title = extract_text(link)
        thumbnail_src = base_url\
            + eval_xpath_getindex(result, './/img', 0).attrib.get('src').replace('&w=32&h=32', '&w=64&h=64')

        res = {'url': url, 'title': title, 'thumbnail_src': thumbnail_src}

        # append result
        results.append(res)

    # return results
    return results
Example #17
def response(resp):
    # get the base URL (this engine uses the language-independent "all" entry)
    base_url = lang_urls["all"]["base"]

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, xpath_results):
        link = eval_xpath_getindex(result, xpath_link, 0)
        href = urljoin(base_url, link.attrib.get("href"))
        title = extract_text(link)

        results.append({"url": href, "title": title})

    return results
Example #18
def response(resp):
    # get the base URL for the language in which request was made
    language = locale_to_lang_code(resp.search_params['language'])
    base_url = get_lang_urls(language)['base']

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, xpath_results):
        link = eval_xpath_getindex(result, xpath_link, 0)
        href = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)

        results.append({'url': href, 'title': title})

    return results
Example #19
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath_getindex(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a JSON data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the original PNG, JPG, ... image is given
    # first link per image-div contains an <img> with the data-iid for base64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):

        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

        img_base64_id = eval_xpath(img_node, '@data-iid')
        if img_base64_id:
            img_base64_id = img_base64_id[0]
            thumbnail_src = img_bas64_map[img_base64_id]
        else:
            thumbnail_src = eval_xpath(img_node, '@src')
            if not thumbnail_src:
                thumbnail_src = eval_xpath(img_node, '@data-src')
            if thumbnail_src:
                thumbnail_src = thumbnail_src[0]
            else:
                thumbnail_src = ''

        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
        url = eval_xpath_getindex(link_node, '@href', 0)

        pub_nodes = eval_xpath(link_node, './div/div')
        pub_descr = img_alt
        pub_source = ''
        if pub_nodes:
            pub_descr = extract_text(pub_nodes[0])
            pub_source = extract_text(pub_nodes[1])

        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
        src_url = scrap_img_by_id(img_src_script, img_src_id)
        if not src_url:
            src_url = thumbnail_src

        results.append({
            'url': url,
            'title': img_alt,
            'content': pub_descr,
            'source': pub_source,
            'img_src': src_url,
            # 'img_format': img_format,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    return results
Example #20
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the
        # article.  The href attribute of the <a> is a google-internal
        # link we can't use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
        #      ... />

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the json attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
                # pad the base64 string to a multiple of 4 characters
                padding = (-len(jslog) % 4) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains origin publisher and the publishing date

        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'yesterday', not a real
            # timezone date or time.  Therefore we can't use publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double-checked via tor).

        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url':      url,
            'title':    title,
            'content':  content,
            'img_src':  img_src,
        })

    # return results
    return results
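The padding arithmetic in the jslog branch, shown standalone with a hypothetical encoded value: base64 input must be a multiple of 4 characters long, so '=' is appended only when needed:

from base64 import b64decode

encoded = 'W251bGwsbnVsbCxudWxsXQ'           # 22 chars -> needs two '='
padded = encoded + (-len(encoded) % 4) * '='
print(b64decode(padded))                     # b'[null,null,null]'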
Example #21
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []
    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not found 'answer'")

        # results --> number_of_results
        try:
            _txt = eval_xpath_getindex(dom,
                                       '//div[@id="result-stats"]//text()', 0)
            _digit = ''.join([n for n in _txt if n.isdigit()])
            number_of_results = int(_digit)
            results.append({'number_of_results': number_of_results})
        except Exception as e:  # pylint: disable=broad-except
            logger.debug("did not 'number_of_results'")
            logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this is not one of the common google result *sections*
                logger.debug(
                    'ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0)
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example #22
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
        answer_list = [_.xpath("normalize-space()") for _ in answer_list]
        results.append({'answer': ' '.join(answer_list)})
    else:
        logger.debug("did not find 'answer'")

        # results --> number_of_results
        if not use_mobile_ui:
            try:
                _txt = eval_xpath_getindex(
                    dom, '//div[@id="result-stats"]//text()', 0)
                _digit = ''.join([n for n in _txt if n.isdigit()])
                number_of_results = int(_digit)
                results.append({'number_of_results': number_of_results})
            except Exception as e:  # pylint: disable=broad-except
                logger.debug("did not 'number_of_results'")
                logger.error(e, exc_info=True)

    # parse results

    _results_xpath = results_xpath
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this is not one of the common google result *sections*
                logger.debug(
                    'ignoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            if content is None:
                logger.debug(
                    'ignoring item from the result_xpath list: missing content for title "%s"',
                    title)
                continue

            logger.debug('add link to results: %s', title)

            results.append({'url': url, 'title': title, 'content': content})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results