Example 1
def scrap_out_thumbs(dom):
    """Scrap out thumbnail data from <script> tags.
    """
    ret_val = {}
    thumb_name = 'vidthumb'

    for script in eval_xpath_list(dom,
                                  '//script[contains(., "_setImagesSrc")]'):
        _script = script.text

        # var s='data:image/jpeg;base64, ...'
        _imgdata = _re("s='([^']*)").findall(_script)
        if not _imgdata:
            continue

        # var ii=['vidthumb4','vidthumb7']
        for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
            # At least the equal sign in the URL needs to be decoded
            ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")

    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
    for script in eval_xpath_list(dom,
                                  '//script[contains(., "google.ldi={")]'):
        _script = script.text
        for key_val in _re(r'"%s\d+\":\"[^\"]*"' %
                           thumb_name).findall(_script):
            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
            if match:
                # At least the equal sign in the URL needs to be decoded
                ret_val[match.group(1)] = match.group(2).replace(
                    r"\u003d", "=")

    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
    return ret_val
Example 2
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []

    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom, '//div[@id="searchpage-root"]//div[@data-dot="results"]/div'):
        dot_data = eval_xpath_getindex(result_element, './div/div[@data-dot-data]/@data-dot-data', 0, default=None)
        if dot_data is None:
            title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
            results.append({
                'url': title_element.get('href'),
                'title': extract_text(title_element),
                'content': extract_text(eval_xpath_getindex(title_element, '../../div[2]', 0)),
            })
        elif dot_data == '{"reporter_name":"hint/related/relates"}':
            suggestions_element = eval_xpath_getindex(result_element,
                                                      './div/div[@data-dot="main-box"]', 0, default=None)
            if suggestions_element is not None:
                for suggestion in eval_xpath_list(suggestions_element, './/ul/li'):
                    results.append({'suggestion': extract_text(suggestion)})

    return results
Example 3
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
        url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None)
        if url is None:
            continue
        url = parse_url(url)

        title = eval_xpath_getindex(result, './/h3/a', 0, default=None)
        if title is None:
            continue
        offset = len(extract_text(title.xpath('span')))
        title = extract_text(title)[offset:]

        content = eval_xpath_getindex(
            result, './/div[contains(@class, "compText")]', 0, default=''
        )
        if content:
            content = extract_text(content)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    return results
Example 4
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    vidthumb_imgdata = scrap_out_thumbs(dom)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
        url = eval_xpath_getindex(result, href_xpath, 0)
        c_node = eval_xpath_getindex(result, content_xpath, 0)

        # <img id="vidthumb1" ...>
        img_id = eval_xpath_getindex(c_node,
                                     './div[1]//a/g-img/img/@id',
                                     0,
                                     default=None)
        if img_id is None:
            continue
        img_src = vidthumb_imgdata.get(img_id, None)
        if not img_src:
            logger.error("no vidthumb imgdata for: %s" % img_id)
            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src',
                                          0)

        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'length': length,
            'author': pub_info,
            'thumbnail': img_src,
            'template': 'videos.html',
        })

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    return results
Example 5
    def test_eval_xpath_list(self):
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        # check a not empty list
        self.assertEqual(utils.eval_xpath_list(doc, '//i/text()'), ['italic'])

        # check min_len parameter
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_list(doc, '//p', min_len=1)
        self.assertEqual(context.exception.message, 'len(xpath_str) < 1')
        self.assertEqual(context.exception.xpath_str, '//p')
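The test above exercises the helper used by almost every example on this page. For orientation only, here is a minimal sketch of what such a helper could look like, assuming it merely wraps lxml's xpath() call and raises SearxEngineXPathException when the result is shorter than min_len; the real searx implementation may differ in its details.

class SearxEngineXPathException(Exception):
    """Sketch: carries the offending XPath expression and a message."""
    def __init__(self, xpath_str, message):
        super().__init__('%s: %s' % (xpath_str, message))
        self.xpath_str = xpath_str
        self.message = message


def eval_xpath_list(element, xpath_str, min_len=None):
    """Evaluate an XPath expected to return a list of nodes or strings."""
    result = element.xpath(xpath_str)
    if not isinstance(result, list):
        raise SearxEngineXPathException(xpath_str, 'the result is not a list')
    if min_len is not None and len(result) < min_len:
        raise SearxEngineXPathException(xpath_str, 'len(xpath_str) < %i' % min_len)
    return result


if __name__ == '__main__':
    from lxml import html
    doc = html.fromstring('<html><body><i>italic</i></body></html>')
    print(eval_xpath_list(doc, '//i/text()'))  # ['italic']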
Example 6
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(
            dom, '//table[contains(@class, "table-list")]/tbody//tr'):
        href = urljoin(
            url,
            eval_xpath_getindex(result,
                                './td[contains(@class, "name")]/a[2]/@href',
                                0))
        title = extract_text(
            eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
        seed = extract_text(
            eval_xpath(result, './/td[contains(@class, "seeds")]'))
        leech = extract_text(
            eval_xpath(result, './/td[contains(@class, "leeches")]'))
        filesize_info = extract_text(
            eval_xpath(result, './/td[contains(@class, "size")]/text()'))
        filesize, filesize_multiplier = filesize_info.split()
        filesize = get_torrent_size(filesize, filesize_multiplier)

        results.append({
            'url': href,
            'title': title,
            'seed': seed,
            'leech': leech,
            'filesize': filesize,
            'template': 'torrent.html'
        })

    return results
Example 7
def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    number_of_results_element =\
        eval_xpath_getindex(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()',
                            0, default=None)
    if number_of_results_element is not None:
        number_of_results_string = re.sub('[^0-9]', '',
                                          number_of_results_element)
        results.append({'number_of_results': int(number_of_results_string)})

    for result in eval_xpath_list(dom,
                                  '//section[not(contains(@class, "essay"))]'):
        url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
        url = urljoin(base_url, url)
        title = eval_xpath(result, 'string(.//h2/a)').strip()
        content = extract_text(eval_xpath(result, './/p'))
        # append result
        results.append({'url': url, 'title': title, 'content': content})

    return results
Example 8
def response(resp):
    if resp.url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []

    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom,
                                          '//div[@data-dot="results"]/div'):
        result_data = eval_xpath_getindex(result_element,
                                          './/div[contains(@class, "bec586")]',
                                          0,
                                          default=None)
        if result_data is None:
            continue
        title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
        results.append({
            'url': title_element.get('href'),
            'title': extract_text(title_element),
            'content': extract_text(
                eval_xpath(result_data, './/div[@class="_3eded7"]')),
        })

    return results
Example 9
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//div[@class="g"]'):

        title = extract_text(eval_xpath(result, './/h3'))
        url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
        content = extract_text(eval_xpath(result, './/span[@class="st"]'))

        # get thumbnails
        # initialise first, so results without an <img> id still get an (empty) thumbnail
        thumbnail = ''
        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
        ids = result.xpath('.//div[@class="s"]//img/@id')
        if len(ids) > 0:
            thumbnails_data = \
                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
                           script)
            tmp = []
            if len(thumbnails_data) != 0:
                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
            if len(tmp) != 0:
                thumbnail = tmp[-1]

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

    return results
Example 10
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):

        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
        if url is None:
            continue
        url = parse_url(url)
        title = extract_text(result.xpath('.//h4/a'))
        content = extract_text(result.xpath('.//p'))
        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)

        item = {
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src
        }

        pub_date = extract_text(
            result.xpath('.//span[contains(@class,"s-time")]'))
        ago = AGO_RE.search(pub_date)
        if ago:
            number = int(ago.group(1))
            delta = AGO_TIMEDELTA[ago.group(2)]
            pub_date = datetime.now() - delta * number
        else:
            try:
                pub_date = parser.parse(pub_date)
            except parser.ParserError:
                pub_date = None

        if pub_date is not None:
            item['publishedDate'] = pub_date
        results.append(item)

    # suggestions are parsed once per page, not once per result
    for suggestion in eval_xpath_list(
            dom, '//div[contains(@class,"AlsoTry")]//td'):
        results.append({'suggestion': extract_text(suggestion)})

    return results
Example 11
def _fetch_supported_languages(resp):
    supported_languages = []
    dom = html.fromstring(resp.text)
    offset = len('lang_')

    for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
        supported_languages.append(val[offset:])

    return supported_languages
Example 12
def _fetch_supported_languages(resp):
    ret_val = {}
    dom = html.fromstring(resp.text)

    radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]')

    for x in radio_buttons:
        name = x.get("data-name")
        code = x.get("value").split('_')[-1]
        ret_val[code] = {"name": name}

    return ret_val
Example 13
def request(query, params):
    response_index = get(base_url, headers=params['headers'], raise_for_httperror=True)
    dom = html.fromstring(response_index.text)

    url_params = {'q': query}
    for e in eval_xpath_list(dom, '//input[@type="hidden"]'):
        name = e.get('name')
        value = e.get('value')
        url_params[name] = value

    params['url'] = base_url + '?' + urlencode(url_params)
    params['cookies'] = response_index.cookies
    return params
Example 14
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, results_xpath):

        single_result = {'template': template}

        for single_field in field_definition:
            single_field = {**default_field_settings, **single_field}
            try:
                if single_field['single_element']:
                    node = eval_xpath(result, single_field['xpath'])
                else:
                    node = eval_xpath_list(result, single_field['xpath'])

                extract = single_field.get('extract')
                if extract == 'url':
                    value = extract_url(node, search_url)
                elif extract == 'boolean':
                    value = (isinstance(node, list) and len(node) > 0)
                elif extract == 'boolean_negate':
                    value = (isinstance(node, list) and len(node) < 1)
                else:
                    value = extract_text(node)

                single_result[single_field['field_name']] = value
            except Exception as e:
                logger.warning('error in resolving field %s:\n%s',
                               single_field['field_name'], e)
                single_result[single_field['field_name']] = unresolvable_value

        results.append(single_result)

    return results
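The parser above is driven entirely by a module-level field_definition list. Purely as an illustration (all field names and XPaths below are invented, not taken from a real engine), a configuration providing the keys the loop reads could look like this:

template = 'default.html'
unresolvable_value = ''
default_field_settings = {'single_element': False}

# 'extract' may be 'url', 'boolean' or 'boolean_negate';
# any other (or missing) value falls back to extract_text()
field_definition = [
    {'field_name': 'url', 'xpath': './/a/@href', 'extract': 'url'},
    {'field_name': 'title', 'xpath': './/a'},
    {'field_name': 'content', 'xpath': './/p[@class="description"]'},
    {'field_name': 'is_sponsored', 'xpath': './/span[@class="ad"]', 'extract': 'boolean'},
]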
Example 15
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'):

        title = extract_text(eval_xpath(result, './h3[1]//a'))

        if not title:
            # this is a [ZITATION] block
            continue

        url = eval_xpath(result, './h3[1]//a/@href')[0]
        content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or ''

        pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]'))
        if pub_info:
            content += "[%s]" % pub_info

        pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
        if pub_type:
            title = title + " " + pub_type

        results.append({
            'url':      url,
            'title':    title,
            'content':  content,
        })

    # parse suggestion
    for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
        results.append({'correction': extract_text(correction)})

    return results
Example 16
def response(resp):
    # get the base URL for the language in which request was made
    base_url = lang_urls["all"]["base"]

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, xpath_results):
        link = eval_xpath_getindex(result, xpath_link, 0)
        href = urljoin(base_url, link.attrib.get("href"))
        title = extract_text(link)

        results.append({"url": href, "title": title})

    return results
Example 17
def response(resp):
    # get the base URL for the language in which request was made
    language = locale_to_lang_code(resp.search_params['language'])
    base_url = get_lang_urls(language)['base']

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, xpath_results):
        link = eval_xpath_getindex(result, xpath_link, 0)
        href = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)

        results.append({'url': href, 'title': title})

    return results
Example 18
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    for entry in eval_xpath_list(dom, '//entry'):
        title = eval_xpath_getindex(entry, './/title', 0).text

        url = eval_xpath_getindex(entry, './/id', 0).text

        content_string = '{doi_content}{abstract_content}'

        abstract = eval_xpath_getindex(entry, './/summary', 0).text

        # If a doi is available, add it to the snippet
        doi_element = eval_xpath_getindex(entry,
                                          './/link[@title="doi"]',
                                          0,
                                          default=None)
        doi_content = doi_element.text if doi_element is not None else ''
        content = content_string.format(doi_content=doi_content,
                                        abstract_content=abstract)

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        publishedDate = datetime.strptime(
            eval_xpath_getindex(entry, './/published', 0).text,
            '%Y-%m-%dT%H:%M:%SZ')

        res_dict = {
            'url': url,
            'title': title,
            'publishedDate': publishedDate,
            'content': content
        }

        results.append(res_dict)

    return results
Example 19
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom,
            "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"
    ):

        link = eval_xpath_getindex(result, './/h5/a', 0)

        url = base_url + link.attrib.get('href') + '#downloads'
        title = extract_text(link)
        img_src = base_url + eval_xpath_getindex(result, './/img/@src', 0)
        res = {'url': url, 'title': title, 'img_src': img_src}

        results.append(res)

    return results
Example 20
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    for result in eval_xpath_list(dom, xpath_results):
        # defaults
        filesize = 0
        magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce"

        category = extract_text(
            eval_xpath_getindex(result, xpath_category, 0, default=[]))
        page_a = eval_xpath_getindex(result, xpath_title, 0)
        title = extract_text(page_a)
        href = base_url + page_a.attrib.get('href')

        magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5])

        filesize_info = eval_xpath_getindex(result,
                                            xpath_filesize,
                                            0,
                                            default=None)
        if filesize_info:
            try:
                filesize = filesize_info[:-2]
                filesize_multiplier = filesize_info[-2:]
                filesize = get_torrent_size(filesize, filesize_multiplier)
            except Exception:
                pass
        # download/seed/leech counts are not included; they appear to be generated randomly on every request
        content = 'Category: "{category}".'
        content = content.format(category=category)

        results.append({
            'url': href,
            'title': title,
            'content': content,
            'filesize': filesize,
            'magnetlink': magnet_link,
            'template': 'torrent.html'
        })
    return results
Example 21
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom,
            './/div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'
    ):

        link = eval_xpath_getindex(result, './/h5/a', 0)
        url = base_url + link.attrib.get('href') + '#downloads'
        title = extract_text(link)
        thumbnail_src = base_url\
            + eval_xpath_getindex(result, './/img', 0).attrib.get('src').replace('&w=32&h=32', '&w=64&h=64')

        res = {'url': url, 'title': title, 'thumbnail_src': thumbnail_src}

        # append result
        results.append(res)

    # return results
    return results
Example 22
def response(resp):
    results = []
    xmldom = etree.fromstring(resp.content)
    xmlsearchresult = eval_xpath_getindex(xmldom, '//searchresult', 0)
    dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div')
    for link in eval_xpath_list(dom, '/div/table/tr/td/div[2]//a'):
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)
        thumbnail_src = urljoin(
            gallery_url,
            eval_xpath_getindex(link, './/img', 0).attrib['src'])

        # append result
        results.append({
            'url': url,
            'title': title,
            'img_src': thumbnail_src,
            'content': '',
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    # return results
    return results
Example 23
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath_getindex(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches to a item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is given
    # first link per image-div contains an <img> with the data-iid for base64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):

        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

        img_base64_id = eval_xpath(img_node, '@data-iid')
        if img_base64_id:
            img_base64_id = img_base64_id[0]
            thumbnail_src = img_bas64_map[img_base64_id]
        else:
            thumbnail_src = eval_xpath(img_node, '@src')
            if not thumbnail_src:
                thumbnail_src = eval_xpath(img_node, '@data-src')
            if thumbnail_src:
                thumbnail_src = thumbnail_src[0]
            else:
                thumbnail_src = ''

        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
        url = eval_xpath_getindex(link_node, '@href', 0)

        pub_nodes = eval_xpath(link_node, './div/div')
        pub_descr = img_alt
        pub_source = ''
        if pub_nodes:
            pub_descr = extract_text(pub_nodes[0])
            pub_source = extract_text(pub_nodes[1])

        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
        src_url = scrap_img_by_id(img_src_script, img_src_id)
        if not src_url:
            src_url = thumbnail_src

        results.append({
            'url': url,
            'title': img_alt,
            'content': pub_descr,
            'source': pub_source,
            'img_src': src_url,
            # 'img_format': img_format,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    return results
Example 24
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the
        # article.  The href attribute of the <a> is a google-internal link
        # we can't use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
        #      ... />

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the json attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains origin publisher and the publishing date

        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'yesterday', not a real
            # timezone date or time.  Therefore we can't use publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double checked via tor).

        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url':      url,
            'title':    title,
            'content':  content,
            'img_src':  img_src,
        })

    # return results
    return results
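The only non-obvious step above is the base64 branch: the payload embedded in jslog comes without '=' padding, so it has to be re-padded before b64decode() accepts it. Below is a self-contained sketch of just that step, with a sample payload constructed locally for illustration (the engine's own padding computation can over-pad, which is why it guards against binascii.Error; the modulo here keeps the padding exact):

import re
from base64 import b64decode, b64encode

# build a sample jslog value carrying an unpadded base64 payload
inner = b64encode(b'[null,"https://example.com/news/index.html"]').decode().rstrip('=')
jslog = '95014; 5:%s; track:click' % inner

blob = jslog.split(';')[1].split(':')[1].strip()
blob += (4 - len(blob) % 4) % 4 * '='  # restore padding to a multiple of four
decoded = b64decode(blob)
print(re.findall('http[^;"]*', str(decoded))[0])  # https://example.com/news/index.html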
Example 25
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []
    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not found 'answer'")

        # results --> number_of_results
        try:
            _txt = eval_xpath_getindex(dom,
                                       '//div[@id="result-stats"]//text()', 0)
            _digit = ''.join([n for n in _txt if n.isdigit()])
            number_of_results = int(_digit)
            results.append({'number_of_results': number_of_results})
        except Exception as e:  # pylint: disable=broad-except
            logger.debug("did not 'number_of_results'")
            logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug(
                    'ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0)
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example 26
def response(resp):
    '''Scrap *results* from the response (see :ref:`engine results`).

    '''
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):

            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1),
                              search_url)
            title = extract_text(
                eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(
                eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(
                    result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(
                        thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = (cached_url + extract_text(
                    eval_xpath_list(result, cached_xpath, min_len=1)))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)

    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                (extract_url(x, search_url)
                 for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                (extract_url(x, search_url)
                 for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if suggestion_xpath:
        for suggestion in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(suggestion)})

    logger.debug("found %s results", len(results))
    return results
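Everything in the generic engine above comes from module-level settings. As a hypothetical example of the names the function expects (all values invented), a minimal configuration might be:

search_url = 'https://searx.example.org/search?q={query}'
categories = ['general']

results_xpath = '//div[@class="result"]'
url_xpath = './/a/@href'
title_xpath = './/a'
content_xpath = './/p[@class="snippet"]'

# optional selectors; falsy values disable the corresponding branches
thumbnail_xpath = False
cached_xpath = ''
cached_url = ''
suggestion_xpath = ''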
Example 27
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):
            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1),
                              search_url)
            title = extract_text(
                eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(
                eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(
                    result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(
                        thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url\
                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                (extract_url(x, search_url)
                 for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                (extract_url(x, search_url)
                 for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if not suggestion_xpath:
        return results
    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})
    return results
Example 28
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
        answer_list = [_.xpath("normalize-space()") for _ in answer_list]
        results.append({'answer': ' '.join(answer_list)})
    else:
        logger.debug("did not find 'answer'")

        # results --> number_of_results
        if not use_mobile_ui:
            try:
                _txt = eval_xpath_getindex(
                    dom, '//div[@id="result-stats"]//text()', 0)
                _digit = ''.join([n for n in _txt if n.isdigit()])
                number_of_results = int(_digit)
                results.append({'number_of_results': number_of_results})
            except Exception as e:  # pylint: disable=broad-except
                logger.debug("did not 'number_of_results'")
                logger.error(e, exc_info=True)

    # parse results

    _results_xpath = results_xpath
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug(
                    'ignoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            if content is None:
                logger.debug(
                    'ignoring item from the result_xpath list: missing content of title "%s"',
                    title)
                continue

            logger.debug('add link to results: %s', title)

            results.append({'url': url, 'title': title, 'content': content})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results