Example #1
    def search(self, query, params, result_container, start_time,
               timeout_limit):
        try:
            search_results = self._search_basic(query, params)

            if search_results:
                result_container.extend(self.engine_name, search_results)

                engine_time = time() - start_time
                result_container.add_timing(self.engine_name, engine_time,
                                            engine_time)
                # NOTE: threading.RLock() creates a fresh lock on every call,
                # so this block is not actually synchronized across threads;
                # a shared module-level lock is needed (see the sketch after
                # this example)
                with threading.RLock():
                    self.engine.stats['engine_time'] += engine_time
                    self.engine.stats['engine_time_count'] += 1

        except ValueError as e:
            record_exception(self.engine_name, e)
            self._record_stats_on_error(result_container, start_time)
            logger.exception('engine {0} : invalid input : {1}'.format(
                self.engine_name, e))
        except Exception as e:
            record_exception(self.engine_name, e)
            self._record_stats_on_error(result_container, start_time)
            result_container.add_unresponsive_engine(self.engine_name,
                                                     'unexpected crash',
                                                     str(e))
            logger.exception('engine {0} : exception : {1}'.format(
                self.engine_name, e))
        else:
            if getattr(threading.current_thread(), '_timeout', False):
                record_error(self.engine_name, 'Timeout')
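
Note: "with threading.RLock():" acquires a lock object that is created on the spot, so no two threads ever contend on it and the stats update it guards is effectively unsynchronized; the same pattern recurs in the later examples. A minimal sketch of the intended fix, assuming a hypothetical module-level ENGINE_STATS_LOCK shared by all worker threads:

import threading

# created once at import time and shared by every worker thread
ENGINE_STATS_LOCK = threading.Lock()

def record_engine_time(stats, engine_time):
    # all threads now serialize on the same lock object
    with ENGINE_STATS_LOCK:
        stats['engine_time'] += engine_time
        stats['engine_time_count'] += 1
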
Example #2
def search_one_offline_request_safe(engine_name, query, request_params,
                                    result_container, start_time,
                                    timeout_limit):
    engine = engines[engine_name]

    try:
        search_results = search_one_offline_request(engine, query,
                                                    request_params)

        if search_results:
            result_container.extend(engine_name, search_results)

            engine_time = time() - start_time
            result_container.add_timing(engine_name, engine_time, engine_time)
            with threading.RLock():
                engine.stats['engine_time'] += engine_time
                engine.stats['engine_time_count'] += 1

    except ValueError as e:
        record_exception(engine_name, e)
        record_offline_engine_stats_on_error(engine, result_container,
                                             start_time)
        logger.exception('engine {0} : invalid input : {1}'.format(
            engine_name, e))
    except Exception as e:
        record_exception(engine_name, e)
        record_offline_engine_stats_on_error(engine, result_container,
                                             start_time)
        result_container.add_unresponsive_engine(engine_name,
                                                 'unexpected crash', str(e))
        logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
    else:
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(engine_name, 'Timeout')
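
Both examples end by checking a private _timeout attribute on the current thread. A minimal sketch of the coordinator side that would set that flag, assuming each engine search runs in its own worker thread (run_with_timeout is an illustrative name, not searx API):

import threading

def run_with_timeout(target, args, timeout_limit):
    # run the engine search in a daemon thread and wait at most
    # timeout_limit seconds; if the worker is still running, set the
    # flag that the '_timeout' checks above look for
    th = threading.Thread(target=target, args=args, daemon=True)
    th._timeout = False
    th.start()
    th.join(timeout=timeout_limit)
    if th.is_alive():
        th._timeout = True
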
Example #3
def send_http_request(engine, request_params):
    # create a dictionary containing all
    # information about the request
    request_args = dict(
        headers=request_params["headers"],
        cookies=request_params["cookies"],
        verify=request_params["verify"],
        auth=request_params["auth"],
    )

    # set engine-based proxies
    if hasattr(engine, "proxies"):
        request_args["proxies"] = requests_lib.get_proxies(engine.proxies)

    # max_redirects
    max_redirects = request_params.get("max_redirects")
    if max_redirects:
        request_args["max_redirects"] = max_redirects

    # soft_max_redirects
    soft_max_redirects = request_params.get("soft_max_redirects", max_redirects or 0)

    # raise_for_httperror
    request_args["raise_for_httperror"] = request_params.get(
        "raise_for_httperror", False
    )

    # specific type of request (GET or POST)
    if request_params["method"] == "GET":
        req = requests_lib.get
    else:
        req = requests_lib.post

    # send the payload as JSON when a value is itself a mapping,
    # otherwise as regular form data
    for key in request_params["data"].keys():
        try:
            if request_params["data"][key].keys():
                request_args["json"] = request_params["data"]
        except Exception:
            request_args["data"] = request_params["data"]

    # send the request
    response = req(request_params["url"], **request_args)

    # check soft limit of the redirect count
    if len(response.history) > soft_max_redirects:
        # unexpected redirect : record an error
        # but the engine might still return valid results.
        status_code = str(response.status_code or "")
        reason = response.reason or ""
        hostname = str(urlparse(response.url or "").netloc)
        record_error(
            engine.name,
            "{} redirects, maximum: {}".format(
                len(response.history), soft_max_redirects
            ),
            (status_code, reason, hostname),
        )

    return response
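
The data/json loop above decides whether to send the payload as JSON (when a value is itself a mapping) or as form data. A roughly equivalent, loop-free sketch; note that the original sets neither argument when every value is an empty mapping:

def body_args(data):
    # send the payload as JSON when any value is itself a mapping,
    # otherwise as regular form data
    if any(hasattr(value, 'keys') for value in data.values()):
        return {'json': data}
    return {'data': data}

assert body_args({'q': 'test'}) == {'data': {'q': 'test'}}
assert body_args({'q': {'match': 'test'}}) == {'json': {'q': {'match': 'test'}}}
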
Example #4
    def _send_http_request(self, params):
        # create a dictionary containing all
        # information about the request
        request_args = dict(headers=params['headers'],
                            cookies=params['cookies'],
                            verify=params['verify'],
                            auth=params['auth'])

        # set engine-based proxies
        if hasattr(self.engine, 'proxies'):
            request_args['proxies'] = poolrequests.get_proxies(
                self.engine.proxies)

        # max_redirects
        max_redirects = params.get('max_redirects')
        if max_redirects:
            request_args['max_redirects'] = max_redirects

        # allow_redirects
        if 'allow_redirects' in params:
            request_args['allow_redirects'] = params['allow_redirects']

        # soft_max_redirects
        soft_max_redirects = params.get('soft_max_redirects', max_redirects
                                        or 0)

        # raise_for_httperror
        request_args['raise_for_httperror'] = params.get(
            'raise_for_httperror', True)

        # specific type of request (GET or POST)
        if params['method'] == 'GET':
            req = poolrequests.get
        else:
            req = poolrequests.post

        request_args['data'] = params['data']

        # send the request
        response = req(params['url'], **request_args)

        # check soft limit of the redirect count
        if len(response.history) > soft_max_redirects:
            # unexpected redirect : record an error
            # but the engine might still return valid results.
            status_code = str(response.status_code or '')
            reason = response.reason or ''
            hostname = str(urlparse(response.url or '').netloc)
            record_error(
                self.engine_name,
                '{} redirects, maximum: {}'.format(len(response.history),
                                                   soft_max_redirects),
                (status_code, reason, hostname))

        return response
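
This variant passes allow_redirects straight through to the underlying requests-based pool. For reference, in plain requests (which poolrequests wraps) redirect following is controlled per request by allow_redirects and globally by Session.max_redirects; a minimal sketch:

import requests

session = requests.Session()
session.max_redirects = 5  # raises TooManyRedirects beyond this count
response = session.get('https://example.org', allow_redirects=True)
print(len(response.history))  # the redirect chain inspected above
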
Example #5
    def _send_http_request(self, params):
        # create a dictionary containing all
        # information about the request
        request_args = dict(
            headers=params['headers'],
            cookies=params['cookies'],
            verify=params['verify'],
            auth=params['auth']
        )

        # max_redirects
        max_redirects = params.get('max_redirects')
        if max_redirects:
            request_args['max_redirects'] = max_redirects

        # follow_redirects
        if 'follow_redirects' in params:
            # httpx has renamed this parameter to 'follow_redirects'
            request_args['follow_redirects'] = params['follow_redirects']

        # soft_max_redirects
        soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)

        # raise_for_httperror
        request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)

        # specific type of request (GET or POST)
        if params['method'] == 'GET':
            req = searx.network.get
        else:
            req = searx.network.post

        request_args['data'] = params['data']
        # Have a longer timeout for searches
        request_args['timeout'] = 180

        # send the request
        response = req(params['url'], **request_args)

        # check soft limit of the redirect count
        if len(response.history) > soft_max_redirects:
            # unexpected redirect : record an error
            # but the engine might still return valid results.
            status_code = str(response.status_code or '')
            reason = response.reason_phrase or ''
            hostname = response.url.host
            record_error(self.engine_name,
                         '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
                         (status_code, reason, hostname))

        return response
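
This version targets httpx rather than requests, which is why it reads follow_redirects, reason_phrase and response.url.host instead of the requests equivalents. A minimal sketch of those httpx attributes, assuming httpx is installed:

import httpx

response = httpx.get('https://example.org', follow_redirects=True, timeout=180)
# reason_phrase replaces requests' response.reason, and response.url is an
# httpx.URL object, so the hostname is available as response.url.host
print(response.reason_phrase, response.url.host, len(response.history))
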
Example #6
    def extend(self, engine_name, results):
        standard_result_count = 0
        error_msgs = set()
        for result in list(results):
            result["engine"] = engine_name
            if "suggestion" in result:
                self.suggestions.add(result["suggestion"])
            elif "answer" in result:
                self.answers.append(result)
            elif "correction" in result:
                self.corrections.add(result["correction"])
            elif "infobox" in result:
                self._merge_infobox(result)
            elif "movie" in result:
                self.movie = result
            elif "movies" in result:
                self.movies = result["movies"]
            elif "number_of_results" in result:
                self._number_of_results.append(result["number_of_results"])
            else:
                # standard result (url, title, content)
                if "url" in result and not isinstance(result["url"], str):
                    logger.debug("result: invalid URL: %s", str(result))
                    error_msgs.add("invalid URL")
                elif "title" in result and not isinstance(result["title"], str):
                    logger.debug("result: invalid title: %s", str(result))
                    error_msgs.add("invalid title")
                elif "content" in result and not isinstance(result["content"], str):
                    logger.debug("result: invalid content: %s", str(result))
                    error_msgs.add("invalid content")
                else:
                    self._merge_result(result, standard_result_count + 1)
                    standard_result_count += 1

        if len(error_msgs) > 0:
            for msg in error_msgs:
                record_error(engine_name, "some results are invalids: " + msg)

        if engine_name in engines:
            with RLock():
                engines[engine_name].stats["search_count"] += 1
                engines[engine_name].stats["result_count"] += standard_result_count

        if (
            not self.paging
            and standard_result_count > 0
            and engine_name in engines
            and engines[engine_name].paging
        ):
            self.paging = True
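
The else branch only merges a standard result after checking that url, title and content, when present, are strings. A self-contained sketch of just that validation:

def is_valid_standard_result(result):
    # mirrors the type checks in extend(): url, title and content,
    # when present, must be strings
    return all(
        isinstance(result[field], str)
        for field in ('url', 'title', 'content')
        if field in result
    )

assert is_valid_standard_result({'url': 'https://example.org', 'title': 'Example'})
assert not is_valid_standard_result({'url': 42})
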
Example #7
    def extend(self, engine_name, results):
        standard_result_count = 0
        error_msgs = set()
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result:
                self.suggestions.add(result['suggestion'])
            elif 'answer' in result:
                self.answers[result['answer']] = result
            elif 'correction' in result:
                self.corrections.add(result['correction'])
            elif 'infobox' in result:
                self._merge_infobox(result)
            elif 'number_of_results' in result:
                self._number_of_results.append(result['number_of_results'])
            elif 'engine_data' in result:
                self.engine_data[engine_name][
                    result['key']] = result['engine_data']
            else:
                # standard result (url, title, content)
                if 'url' in result and not isinstance(result['url'], str):
                    logger.debug('result: invalid URL: %s', str(result))
                    error_msgs.add('invalid URL')
                elif 'title' in result and not isinstance(
                        result['title'], str):
                    logger.debug('result: invalid title: %s', str(result))
                    error_msgs.add('invalid title')
                elif 'content' in result and not isinstance(
                        result['content'], str):
                    logger.debug('result: invalid content: %s', str(result))
                    error_msgs.add('invalid content')
                else:
                    self._merge_result(result, standard_result_count + 1)
                    standard_result_count += 1

        if len(error_msgs) > 0:
            for msg in error_msgs:
                record_error(engine_name, 'some results are invalid: ' + msg)

        if engine_name in engines:
            with RLock():
                engines[engine_name].stats['search_count'] += 1
                engines[engine_name].stats[
                    'result_count'] += standard_result_count

        if not self.paging and standard_result_count > 0 and engine_name in engines\
           and engines[engine_name].paging:
            self.paging = True
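
The new engine_data branch stores per-engine key/value pairs, indexed first by engine name and then by the result's key. A minimal sketch of the container attribute this assumes (the defaultdict shape is inferred from the subscripting above):

from collections import defaultdict

# two-level mapping: engine name -> key -> value
engine_data = defaultdict(dict)

result = {'engine_data': 'abc123', 'key': 'next_page_token'}  # hypothetical result
engine_data['example'][result['key']] = result['engine_data']
assert engine_data['example'] == {'next_page_token': 'abc123'}
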
Example #8
    def search(self, query, params, result_container, start_time,
               timeout_limit):
        # set timeout for all HTTP requests
        poolrequests.set_timeout_for_thread(timeout_limit,
                                            start_time=start_time)
        # reset the HTTP total time
        poolrequests.reset_time_for_thread()
        # enable HTTP only if explicitly enabled
        poolrequests.set_enable_http_protocol(self.engine.enable_http)

        # suppose everything will be alright
        requests_exception = False
        suspended_time = None

        try:
            # send requests and parse the results
            search_results = self._search_basic(query, params)

            # check if the engine accepted the request
            if search_results is not None:
                # yes, so add results
                result_container.extend(self.engine_name, search_results)

                # update engine time when there is no exception
                engine_time = time() - start_time
                page_load_time = poolrequests.get_time_for_thread()
                result_container.add_timing(self.engine_name, engine_time,
                                            page_load_time)
                with threading.RLock():
                    self.engine.stats['engine_time'] += engine_time
                    self.engine.stats['engine_time_count'] += 1
                    # update stats with the total HTTP time
                    self.engine.stats['page_load_time'] += page_load_time
                    self.engine.stats['page_load_count'] += 1
        except Exception as e:
            record_exception(self.engine_name, e)

            # Timing
            engine_time = time() - start_time
            page_load_time = poolrequests.get_time_for_thread()
            result_container.add_timing(self.engine_name, engine_time,
                                        page_load_time)

            # Record the errors
            with threading.RLock():
                self.engine.stats['errors'] += 1

            if (issubclass(e.__class__, requests.exceptions.Timeout)):
                result_container.add_unresponsive_engine(
                    self.engine_name, 'HTTP timeout')
                # requests timeout (connect or read)
                logger.error(
                    "engine {0} : HTTP requests timeout "
                    "(search duration : {1} s, timeout: {2} s) : {3}".format(
                        self.engine_name, engine_time, timeout_limit,
                        e.__class__.__name__))
                requests_exception = True
            elif (issubclass(e.__class__,
                             requests.exceptions.RequestException)):
                result_container.add_unresponsive_engine(
                    self.engine_name, 'HTTP error')
                # other requests exception
                logger.exception(
                    "engine {0} : requests exception "
                    "(search duration : {1} s, timeout: {2} s) : {3}".format(
                        self.engine_name, engine_time, timeout_limit, e))
                requests_exception = True
            elif (issubclass(e.__class__, SearxEngineCaptchaException)):
                result_container.add_unresponsive_engine(
                    self.engine_name, 'CAPTCHA required')
                logger.exception('engine {0} : CAPTCHA'.format(
                    self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif (issubclass(e.__class__,
                             SearxEngineTooManyRequestsException)):
                result_container.add_unresponsive_engine(
                    self.engine_name, 'too many requests')
                logger.exception('engine {0} : Too many requests'.format(
                    self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
                result_container.add_unresponsive_engine(
                    self.engine_name, 'blocked')
                logger.exception('engine {0} : Searx is blocked'.format(
                    self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            else:
                result_container.add_unresponsive_engine(
                    self.engine_name, 'unexpected crash')
                # other errors
                logger.exception('engine {0} : exception : {1}'.format(
                    self.engine_name, e))
        else:
            if getattr(threading.current_thread(), '_timeout', False):
                record_error(self.engine_name, 'Timeout')

        # suspend the engine if there is an HTTP error
        # or suspended_time is defined
        with threading.RLock():
            if requests_exception or suspended_time:
                # update continuous_errors / suspend_end_time
                self.engine.continuous_errors += 1
                if suspended_time is None:
                    suspended_time = min(
                        settings['search']['max_ban_time_on_fail'],
                        self.engine.continuous_errors *
                        settings['search']['ban_time_on_fail'])
                self.engine.suspend_end_time = time() + suspended_time
            else:
                # reset the suspend variables
                self.engine.continuous_errors = 0
                self.engine.suspend_end_time = 0
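
The suspension block computes a back-off that grows linearly with the number of consecutive failures and is capped by max_ban_time_on_fail. A worked sketch with illustrative values for the two settings:

BAN_TIME_ON_FAIL = 5        # seconds, illustrative
MAX_BAN_TIME_ON_FAIL = 120  # seconds, illustrative

def suspend_duration(continuous_errors):
    # linear in the number of consecutive failures, capped at the maximum
    return min(MAX_BAN_TIME_ON_FAIL, continuous_errors * BAN_TIME_ON_FAIL)

assert suspend_duration(1) == 5
assert suspend_duration(3) == 15
assert suspend_duration(100) == 120
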
Example #9
def search_one_http_request_safe(engine_name, query, request_params,
                                 result_container, start_time, timeout_limit):
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    # look up the engine by name
    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False

    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time,
                                        page_load_time)
            with threading.RLock():
                engine.stats['engine_time'] += engine_time
                engine.stats['engine_time_count'] += 1
                # update stats with the total HTTP time
                engine.stats['page_load_time'] += page_load_time
                engine.stats['page_load_count'] += 1
    except Exception as e:
        record_exception(engine_name, e)

        # Timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # Record the errors
        with threading.RLock():
            engine.stats['errors'] += 1

        if (issubclass(e.__class__, requests.exceptions.Timeout)):
            result_container.add_unresponsive_engine(engine_name,
                                                     'HTTP timeout')
            # requests timeout (connect or read)
        logger.error(
            "engine {0} : HTTP requests timeout "
            "(search duration : {1} s, timeout: {2} s) : {3}".format(
                engine_name, engine_time, timeout_limit,
                e.__class__.__name__))
            requests_exception = True
        elif (issubclass(e.__class__, requests.exceptions.RequestException)):
            result_container.add_unresponsive_engine(engine_name, 'HTTP error')
            # other requests exception
        logger.exception(
            "engine {0} : requests exception "
            "(search duration : {1} s, timeout: {2} s) : {3}".format(
                engine_name, engine_time, timeout_limit, e))
            requests_exception = True
        elif (issubclass(e.__class__, SearxEngineCaptchaException)):
            result_container.add_unresponsive_engine(engine_name,
                                                     'CAPTCHA required')
        logger.exception('engine {0} : CAPTCHA'.format(engine_name))
        else:
            result_container.add_unresponsive_engine(engine_name,
                                                     'unexpected crash')
            # other errors
            logger.exception('engine {0} : exception : {1}'.format(
                engine_name, e))
    else:
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(engine_name, 'Timeout')

    # decide whether to suspend the engine after HTTP errors
    with threading.RLock():
        if requests_exception:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(
                settings['search']['max_ban_time_on_fail'],
                engine.continuous_errors *
                settings['search']['ban_time_on_fail'])
        else:
            # no HTTP error (perhaps an engine error)
            # anyway, reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
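
The dispatch above tests issubclass(e.__class__, X), which is exactly isinstance(e, X), so a chain of except clauses would express the same routing natively. A small sketch with stand-in exception classes (order matters, since in requests Timeout is itself a RequestException subclass):

class RequestError(Exception):      # stand-in for requests.exceptions.RequestException
    pass

class HTTPTimeout(RequestError):    # stand-in for requests.exceptions.Timeout
    pass

def classify(e):
    # equivalent to the issubclass(e.__class__, ...) chain above
    if isinstance(e, HTTPTimeout):
        return 'HTTP timeout'
    if isinstance(e, RequestError):
        return 'HTTP error'
    return 'unexpected crash'

assert classify(HTTPTimeout()) == 'HTTP timeout'
assert classify(RequestError()) == 'HTTP error'
assert classify(ValueError()) == 'unexpected crash'
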
Example #10
def search_one_http_request_safe(
    engine_name, query, request_params, result_container, start_time, timeout_limit
):
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    # look up the engine by name
    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False
    suspended_time = None

    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time, page_load_time)
            with threading.RLock():
                engine.stats["engine_time"] += engine_time
                engine.stats["engine_time_count"] += 1
                # update stats with the total HTTP time
                engine.stats["page_load_time"] += page_load_time
                engine.stats["page_load_count"] += 1
    except Exception as e:
        record_exception(engine_name, e)

        # Timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # Record the errors
        with threading.RLock():
            engine.stats["errors"] += 1

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(engine_name, "HTTP timeout")
            # requests timeout (connect or read)
            logger.error(
                "engine {0} : HTTP requests timeout "
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, engine_time, timeout_limit, e.__class__.__name__
                )
            )
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(engine_name, "HTTP error")
            # other requests exception
            logger.exception(
                "engine {0} : requests exception "
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, engine_time, timeout_limit, e
                )
            )
            requests_exception = True
        elif issubclass(e.__class__, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(engine_name, "CAPTCHA required")
            logger.exception("engine {0} : CAPTCHA")
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineTooManyRequestsException):
            result_container.add_unresponsive_engine(engine_name, "too many requests")
            logger.exception("engine {0} : Too many requests")
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineAccessDeniedException):
            result_container.add_unresponsive_engine(engine_name, "blocked")
            logger.exception("engine {0} : Searx is blocked")
            suspended_time = e.suspended_time  # pylint: disable=no-member
        else:
            result_container.add_unresponsive_engine(engine_name, "unexpected crash")
            # other errors
            logger.exception("engine {0} : exception : {1}".format(engine_name, e))
    else:
        if getattr(threading.current_thread(), "_timeout", False):
            record_error(engine_name, "Timeout")

    # suspend the engine if there is an HTTP error
    # or suspended_time is defined
    with threading.RLock():
        if requests_exception or suspended_time:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            if suspended_time is None:
                suspended_time = min(
                    settings["search"]["max_ban_time_on_fail"],
                    engine.continuous_errors * settings["search"]["ban_time_on_fail"],
                )
            engine.suspend_end_time = time() + suspended_time
        else:
            # reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
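
Here CAPTCHA, too-many-requests and access-denied errors carry their own suspended_time, which overrides the computed back-off. A hypothetical minimal shape for such an exception (the real classes live in searx.exceptions; the default below is illustrative):

class SearxEngineAccessDeniedException(Exception):
    """Hypothetical minimal shape: carries how long to suspend the engine."""

    def __init__(self, suspended_time=86400, message='Access denied'):
        super().__init__(message)
        self.suspended_time = suspended_time

e = SearxEngineAccessDeniedException(suspended_time=3600)
assert e.suspended_time == 3600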