Example #1
def raise_for_cloudflare_captcha(resp):
    if resp.headers.get('Server', '').startswith('cloudflare'):
        if is_cloudflare_challenge(resp):
            # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
            # suspend for 2 weeks
            raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA',
                                              suspended_time=3600 * 24 * 15)

        if is_cloudflare_firewall(resp):
            raise SearxEngineAccessDeniedException(
                message='Cloudflare Firewall', suspended_time=3600 * 24)
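
For context, a checker like this is normally called from an engine's response() hook before any parsing starts; a minimal sketch of that wiring (an assumption, not part of the example above):

def response(resp):
    # bail out early if Cloudflare blocked the request (assumed wiring)
    raise_for_cloudflare_captcha(resp)
    # normal result parsing would follow here
    return []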
Example #2
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/showcaptcha'):
        raise SearxEngineCaptchaException()

    dom = html.fromstring(resp.text)
    results = []

    for result in dom.xpath(results_xpath):
        try:
            res = {'url': result.xpath(url_xpath)[0],
                   'title': ''.join(result.xpath(title_xpath)),
                   'content': ''.join(result.xpath(content_xpath))}
        except Exception:
            logger.exception('yandex parse crash')
            continue

        results.append(res)

    return results
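
results_xpath, url_xpath, title_xpath and content_xpath are module-level selectors defined elsewhere in the engine; illustrative values (assumptions, not taken from this listing) might look like:

results_xpath = '//li[contains(@class, "serp-item")]'
url_xpath = './/h2/a/@href'
title_xpath = './/h2/a//text()'
content_xpath = './/div[contains(@class, "organic__content")]//text()'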
Example #3
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/nocaptcha'):
        raise SearxEngineCaptchaException()

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': href, 'title': title, 'content': content})

    # return results
    return results
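
urljoin resolves each result's possibly relative href against the engine's base url constant; a quick worked example:

from urllib.parse import urljoin

urljoin('https://example.org/search', '/doc?id=1')  # -> 'https://example.org/doc?id=1'
urljoin('https://example.org/search', 'doc?id=1')   # -> 'https://example.org/doc?id=1'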
Example #4
def response(resp):
    results = []

    # According to https://www.qwant.com/js/app.js
    if resp.status_code == 429:
        raise SearxEngineCaptchaException()

    # raise for other errors
    raise_for_httperror(resp)

    # load JSON result
    search_results = loads(resp.text)

    # check for an API error
    if search_results.get('status') != 'success':
        raise SearxEngineAPIException('API error ' +
                                      str(search_results.get('error', '')))

    # return an empty list if there are no results
    if 'data' not in search_results:
        return []

    data = search_results.get('data', {})

    res = data.get('result', {})

    # parse results
    for result in res.get('items', []):

        title = html_to_text(result['title'])
        res_url = result['url']
        content = html_to_text(result['desc'])

        if category_to_keyword.get(categories[0], '') == 'web':
            results.append({
                'title': title,
                'content': content,
                'url': res_url
            })

        elif category_to_keyword.get(categories[0], '') == 'images':
            thumbnail_src = result['thumbnail']
            img_src = result['media']
            results.append({
                'template': 'images.html',
                'url': res_url,
                'title': title,
                'content': '',
                'thumbnail_src': thumbnail_src,
                'img_src': img_src
            })

        elif category_to_keyword.get(categories[0], '') == 'news':
            published_date = datetime.fromtimestamp(result['date'], None)
            media = result.get('media', [])
            if len(media) > 0:
                img_src = media[0].get('pict', {}).get('url', None)
            else:
                img_src = None
            results.append({
                'url': res_url,
                'title': title,
                'publishedDate': published_date,
                'content': content,
                'img_src': img_src
            })

    return results
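
category_to_keyword maps searx categories to the engine's API keywords and is defined elsewhere in the module; a plausible shape, inferred from the branches above (an assumption, not shown in this listing):

category_to_keyword = {
    'general': 'web',
    'images': 'images',
    'news': 'news',
}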
Example #5
def raise_for_recaptcha(resp):
    if resp.status_code == 503 \
       and '"https://www.google.com/recaptcha/' in resp.text:
        raise SearxEngineCaptchaException(message='ReCAPTCHA',
                                          suspended_time=3600 * 24 * 7)
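
A quick way to exercise the check with a stubbed response object (a sketch; FakeResp is hypothetical and assumes nothing about the real response class beyond status_code and text):

class FakeResp:
    status_code = 503
    text = '<iframe src="https://www.google.com/recaptcha/api/fallback?k=abc">'

raise_for_recaptcha(FakeResp())  # raises, suspending the engine for 7 days (3600 * 24 * 7 s)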
Example #6
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == "sorry.google.com" or resp_url.path == "/sorry/IndexRedirect":
        raise SearxEngineCaptchaException()

    if resp_url.path.startswith("/sorry"):
        raise SearxEngineCaptchaException()

    # which subdomain?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_base64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a JSON data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the original PNG, JPG, etc. is given
    # the first link per image div contains an <img> with the data-iid for base64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # the second link per image div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, "@alt")[0]

            img_base64_id = eval_xpath(img_node, "@data-iid")
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_base64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, "@src")
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, "@data-src")
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ""

            link_node = eval_xpath(img_node, "../../../a[2]")[0]
            url = eval_xpath(link_node, "@href")[0]

            pub_nodes = eval_xpath(link_node, "./div/div")
            pub_descr = img_alt
            pub_source = ""
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            img_src_id = eval_xpath(img_node, "../../../@data-id")[0]
            src_url = scrap_img_by_id(img_src_script, img_src_id)
            if not src_url:
                src_url = thumbnail_src

            results.append({
                "url": url,
                "title": img_alt,
                "content": pub_descr,
                "source": pub_source,
                "img_src": src_url,
                "img_format": {
                    "width": int(eval_xpath(img_node, "@width")[0]),
                    "height": int(eval_xpath(img_node, "@height")[0]),
                },
                "thumbnail_src": thumbnail_src,
                "template": "images.html",
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
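
scrap_out_thumbs and scrap_img_by_id are defined elsewhere in the engine module. Judging purely by the call sites above (both points are inferred, not shown in this listing), scrap_out_thumbs returns a mapping from each data-iid to its inline base64 data URL, and scrap_img_by_id looks up the full-size image URL for a given data-id in the AF_initDataCallback script text, returning None when it finds nothing.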
Example #7
def detect_google_sorry(resp):
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith(
            '/sorry'):
        raise SearxEngineCaptchaException()
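
This variant folds both "sorry" checks from the longer examples into a single startswith test. For reference, how urlparse splits the redirect target:

from urllib.parse import urlparse

u = urlparse('https://www.google.com/sorry/index')
u.netloc  # 'www.google.com' -> first condition is False
u.path    # '/sorry/index'   -> startswith('/sorry') is True, so it raises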
Example #8
def raise_captcha(resp):

    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
        # suspend CAPTCHA for 7 days
        raise SearxEngineCaptchaException(suspended_time=7 * 24 * 3600)
Example #9
def detect_google_sorry(resp):
    if resp.url.host == 'sorry.google.com' or resp.url.path.startswith(
            '/sorry'):
        raise SearxEngineCaptchaException()
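
Unlike Example #7, no urlparse call is needed here: resp.url is evidently already a parsed URL object (httpx.URL, for instance, exposes .host and .path), so the components can be read as attributes directly.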
Example #10
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise SearxEngineCaptchaException()

    if resp_url.path.startswith('/sorry'):
        raise SearxEngineCaptchaException()

    # which subdomain?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not find 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})
    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not find 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug('ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0)
            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
            results.append({
                'url': url,
                'title': title,
                'content': content
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
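
The eval_xpath* helpers come from searx's utils module; their behaviour as used above (inferred from the call sites, not shown in this listing):

# inferred behaviour (assumptions):
# eval_xpath(elem, xp)                       -> raw XPath result (usually a list)
# eval_xpath_list(elem, xp)                  -> result validated as a list
# eval_xpath_getindex(elem, xp, i, default)  -> i-th match, or `default` when given
#                                               and the index is out of range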