def search(self, query, params, result_container, start_time, timeout_limit):
    try:
        search_results = self._search_basic(query, params)

        if search_results:
            result_container.extend(self.engine_name, search_results)

            engine_time = time() - start_time
            result_container.add_timing(self.engine_name, engine_time, engine_time)
            with threading.RLock():
                self.engine.stats['engine_time'] += engine_time
                self.engine.stats['engine_time_count'] += 1
    except ValueError as e:
        record_exception(self.engine_name, e)
        self._record_stats_on_error(result_container, start_time)
        logger.exception('engine {0} : invalid input : {1}'.format(self.engine_name, e))
    except Exception as e:
        record_exception(self.engine_name, e)
        self._record_stats_on_error(result_container, start_time)
        result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash', str(e))
        logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
    else:
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(self.engine_name, 'Timeout')
def search_one_offline_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    engine = engines[engine_name]

    try:
        search_results = search_one_offline_request(engine, query, request_params)

        if search_results:
            result_container.extend(engine_name, search_results)

            engine_time = time() - start_time
            result_container.add_timing(engine_name, engine_time, engine_time)
            with threading.RLock():
                engine.stats['engine_time'] += engine_time
                engine.stats['engine_time_count'] += 1
    except ValueError as e:
        record_exception(engine_name, e)
        record_offline_engine_stats_on_error(engine, result_container, start_time)
        logger.exception('engine {0} : invalid input : {1}'.format(engine_name, e))
    except Exception as e:
        record_exception(engine_name, e)
        record_offline_engine_stats_on_error(engine, result_container, start_time)
        result_container.add_unresponsive_engine(engine_name, 'unexpected crash', str(e))
        logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
    else:
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(engine_name, 'Timeout')
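# Usage sketch (not part of the original module): both wrappers above read a
# '_timeout' attribute off their own thread, which implies a supervisor that
# runs one wrapper per engine, joins each thread with the remaining time
# budget, and flags stragglers. A minimal version of that pattern might look
# like this; 'engine_requests' (the per-engine argument tuples) is a
# hypothetical local, and the attribute names mirror the checks above.
def run_engine_searches(engine_requests, result_container, start_time, timeout_limit):
    threads = []
    for engine_name, query, request_params in engine_requests:
        th = threading.Thread(
            target=search_one_offline_request_safe,
            args=(engine_name, query, request_params, result_container, start_time, timeout_limit),
        )
        th._timeout = False          # the wrapper checks this flag when it finishes
        th._engine_name = engine_name
        th.start()
        threads.append(th)

    for th in threads:
        # join with whatever time is left in the global budget
        remaining_time = max(0.0, timeout_limit - (time() - start_time))
        th.join(remaining_time)
        if th.is_alive():
            # mark the thread so the wrapper records a 'Timeout' error on exit
            th._timeout = True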
def send_http_request(engine, request_params):
    # create a dictionary which contains all
    # information about the request
    request_args = dict(
        headers=request_params["headers"],
        cookies=request_params["cookies"],
        verify=request_params["verify"],
        auth=request_params["auth"],
    )

    # setting engine based proxies
    if hasattr(engine, "proxies"):
        request_args["proxies"] = requests_lib.get_proxies(engine.proxies)

    # max_redirects
    max_redirects = request_params.get("max_redirects")
    if max_redirects:
        request_args["max_redirects"] = max_redirects

    # soft_max_redirects
    soft_max_redirects = request_params.get("soft_max_redirects", max_redirects or 0)

    # raise_for_status
    request_args["raise_for_httperror"] = request_params.get("raise_for_httperror", False)

    # specific type of request (GET or POST)
    if request_params["method"] == "GET":
        req = requests_lib.get
    else:
        req = requests_lib.post

    # send the payload as JSON when it contains nested mappings,
    # otherwise as regular form data
    if any(isinstance(value, dict) for value in request_params["data"].values()):
        request_args["json"] = request_params["data"]
    else:
        request_args["data"] = request_params["data"]

    # send the request
    response = req(request_params["url"], **request_args)

    # check soft limit of the redirect count
    if len(response.history) > soft_max_redirects:
        # unexpected redirect: record an error,
        # but the engine might still return valid results
        status_code = str(response.status_code or "")
        reason = response.reason or ""
        hostname = str(urlparse(response.url or "").netloc)
        record_error(
            engine.name,
            "{} redirects, maximum: {}".format(len(response.history), soft_max_redirects),
            (status_code, reason, hostname),
        )

    return response
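# Illustrative sketch (an assumption, not copied from the original source): a
# minimal 'request_params' dict of the shape send_http_request() consumes.
# With a flat 'data' mapping the payload goes out as form data; nesting a dict
# inside 'data' switches the function to a JSON body instead. All values here
# are made up for the example.
example_params = {
    "method": "POST",
    "url": "https://example.org/api/search",
    "headers": {"User-Agent": "searx"},
    "cookies": {},
    "verify": True,
    "auth": None,
    "data": {"q": "test", "filters": {"lang": "en"}},  # nested dict -> sent as JSON
    "max_redirects": 5,
    "soft_max_redirects": 2,
    "raise_for_httperror": True,
}
# response = send_http_request(engine, example_params)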
def _send_http_request(self, params):
    # create a dictionary which contains all
    # information about the request
    request_args = dict(
        headers=params['headers'],
        cookies=params['cookies'],
        verify=params['verify'],
        auth=params['auth']
    )

    # setting engine based proxies
    if hasattr(self.engine, 'proxies'):
        request_args['proxies'] = poolrequests.get_proxies(self.engine.proxies)

    # max_redirects
    max_redirects = params.get('max_redirects')
    if max_redirects:
        request_args['max_redirects'] = max_redirects

    # allow_redirects
    if 'allow_redirects' in params:
        request_args['allow_redirects'] = params['allow_redirects']

    # soft_max_redirects
    soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)

    # raise_for_status
    request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)

    # specific type of request (GET or POST)
    if params['method'] == 'GET':
        req = poolrequests.get
    else:
        req = poolrequests.post

    request_args['data'] = params['data']

    # send the request
    response = req(params['url'], **request_args)

    # check soft limit of the redirect count
    if len(response.history) > soft_max_redirects:
        # unexpected redirect: record an error,
        # but the engine might still return valid results
        status_code = str(response.status_code or '')
        reason = response.reason or ''
        hostname = str(urlparse(response.url or '').netloc)
        record_error(self.engine_name,
                     '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
                     (status_code, reason, hostname))

    return response
def _send_http_request(self, params):
    # create a dictionary which contains all
    # information about the request
    request_args = dict(
        headers=params['headers'],
        cookies=params['cookies'],
        verify=params['verify'],
        auth=params['auth']
    )

    # max_redirects
    max_redirects = params.get('max_redirects')
    if max_redirects:
        request_args['max_redirects'] = max_redirects

    # follow_redirects
    if 'follow_redirects' in params:
        # httpx has renamed this parameter to 'follow_redirects'
        request_args['follow_redirects'] = params['follow_redirects']

    # soft_max_redirects
    soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)

    # raise_for_status
    request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)

    # specific type of request (GET or POST)
    if params['method'] == 'GET':
        req = searx.network.get
    else:
        req = searx.network.post

    request_args['data'] = params['data']

    # have a longer timeout for searches
    request_args['timeout'] = 180

    # send the request
    response = req(params['url'], **request_args)

    # check soft limit of the redirect count
    if len(response.history) > soft_max_redirects:
        # unexpected redirect: record an error,
        # but the engine might still return valid results
        status_code = str(response.status_code or '')
        reason = response.reason_phrase or ''
        hostname = response.url.host
        record_error(self.engine_name,
                     '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
                     (status_code, reason, hostname))

    return response
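# Compatibility sketch (illustration only, not from the original source): the
# requests/poolrequests variant above forwards 'allow_redirects', while the
# searx.network/httpx variant forwards 'follow_redirects' -- httpx renamed the
# keyword. Code that must build request kwargs for either backend could
# translate between the two names like this; 'redirect_kwarg' is hypothetical.
def redirect_kwarg(params, use_httpx):
    # return the correctly named redirect flag for the active HTTP backend
    value = params.get('allow_redirects', params.get('follow_redirects', True))
    if use_httpx:
        return {'follow_redirects': value}
    return {'allow_redirects': value}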
def extend(self, engine_name, results):
    standard_result_count = 0
    error_msgs = set()
    for result in list(results):
        result["engine"] = engine_name
        if "suggestion" in result:
            self.suggestions.add(result["suggestion"])
        elif "answer" in result:
            self.answers.append(result)
        elif "correction" in result:
            self.corrections.add(result["correction"])
        elif "infobox" in result:
            self._merge_infobox(result)
        elif "movie" in result:
            self.movie = result
        elif "movies" in result:
            self.movies = result["movies"]
        elif "number_of_results" in result:
            self._number_of_results.append(result["number_of_results"])
        else:
            # standard result (url, title, content)
            if "url" in result and not isinstance(result["url"], str):
                logger.debug("result: invalid URL: %s", str(result))
                error_msgs.add("invalid URL")
            elif "title" in result and not isinstance(result["title"], str):
                logger.debug("result: invalid title: %s", str(result))
                error_msgs.add("invalid title")
            elif "content" in result and not isinstance(result["content"], str):
                logger.debug("result: invalid content: %s", str(result))
                error_msgs.add("invalid content")
            else:
                self._merge_result(result, standard_result_count + 1)
                standard_result_count += 1

    if len(error_msgs) > 0:
        for msg in error_msgs:
            record_error(engine_name, "some results are invalid: " + msg)

    if engine_name in engines:
        with RLock():
            engines[engine_name].stats["search_count"] += 1
            engines[engine_name].stats["result_count"] += standard_result_count

    if not self.paging and standard_result_count > 0 and engine_name in engines \
            and engines[engine_name].paging:
        self.paging = True
def extend(self, engine_name, results):
    standard_result_count = 0
    error_msgs = set()
    for result in list(results):
        result['engine'] = engine_name
        if 'suggestion' in result:
            self.suggestions.add(result['suggestion'])
        elif 'answer' in result:
            self.answers[result['answer']] = result
        elif 'correction' in result:
            self.corrections.add(result['correction'])
        elif 'infobox' in result:
            self._merge_infobox(result)
        elif 'number_of_results' in result:
            self._number_of_results.append(result['number_of_results'])
        elif 'engine_data' in result:
            self.engine_data[engine_name][result['key']] = result['engine_data']
        else:
            # standard result (url, title, content)
            if 'url' in result and not isinstance(result['url'], str):
                logger.debug('result: invalid URL: %s', str(result))
                error_msgs.add('invalid URL')
            elif 'title' in result and not isinstance(result['title'], str):
                logger.debug('result: invalid title: %s', str(result))
                error_msgs.add('invalid title')
            elif 'content' in result and not isinstance(result['content'], str):
                logger.debug('result: invalid content: %s', str(result))
                error_msgs.add('invalid content')
            else:
                self._merge_result(result, standard_result_count + 1)
                standard_result_count += 1

    if len(error_msgs) > 0:
        for msg in error_msgs:
            record_error(engine_name, 'some results are invalid: ' + msg)

    if engine_name in engines:
        with RLock():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += standard_result_count

    if not self.paging and standard_result_count > 0 and engine_name in engines \
            and engines[engine_name].paging:
        self.paging = True
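# Illustrative sketch (an assumption, not from the original source): the kinds
# of result dicts that the extend() variant above dispatches on. Anything
# without one of the special keys falls through to the standard
# (url, title, content) branch, where it is type-checked, merged, and counted.
example_results = [
    {"suggestion": "free software"},                   # -> self.suggestions
    {"answer": "42"},                                  # -> self.answers
    {"correction": "searx"},                           # -> self.corrections
    {"number_of_results": 12345},                      # -> self._number_of_results
    {"engine_data": "page-token-123", "key": "next"},  # -> self.engine_data[engine_name]
    {"url": "https://example.org/doc",                 # -> standard result branch
     "title": "Example",
     "content": "An example result."},
]
# result_container.extend('example-engine', example_results)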
def search(self, query, params, result_container, start_time, timeout_limit):
    # set timeout for all HTTP requests
    poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    poolrequests.reset_time_for_thread()
    # enable HTTP only if explicitly enabled
    poolrequests.set_enable_http_protocol(self.engine.enable_http)

    # suppose everything will be alright
    requests_exception = False
    suspended_time = None

    try:
        # send requests and parse the results
        search_results = self._search_basic(query, params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(self.engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = poolrequests.get_time_for_thread()
            result_container.add_timing(self.engine_name, engine_time, page_load_time)
            with threading.RLock():
                self.engine.stats['engine_time'] += engine_time
                self.engine.stats['engine_time_count'] += 1
                # update stats with the total HTTP time
                self.engine.stats['page_load_time'] += page_load_time
                self.engine.stats['page_load_count'] += 1
    except Exception as e:
        record_exception(self.engine_name, e)

        # timing
        engine_time = time() - start_time
        page_load_time = poolrequests.get_time_for_thread()
        result_container.add_timing(self.engine_name, engine_time, page_load_time)

        # record the errors
        with threading.RLock():
            self.engine.stats['errors'] += 1

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(self.engine_name, engine_time, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')
            # other requests exception
            logger.exception("engine {0} : requests exception "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(self.engine_name, engine_time, timeout_limit, e))
            requests_exception = True
        elif issubclass(e.__class__, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
            logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineTooManyRequestsException):
            result_container.add_unresponsive_engine(self.engine_name, 'too many requests')
            logger.exception('engine {0} : Too many requests'.format(self.engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineAccessDeniedException):
            result_container.add_unresponsive_engine(self.engine_name, 'blocked')
            logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        else:
            result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash')
            # other errors
            logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
    else:
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(self.engine_name, 'Timeout')

    # suspend the engine if there is an HTTP error
    # or suspended_time is defined
    with threading.RLock():
        if requests_exception or suspended_time:
            # update continuous_errors / suspend_end_time
            self.engine.continuous_errors += 1
            if suspended_time is None:
                suspended_time = min(settings['search']['max_ban_time_on_fail'],
                                     self.engine.continuous_errors * settings['search']['ban_time_on_fail'])
            self.engine.suspend_end_time = time() + suspended_time
        else:
            # reset the suspend variables
            self.engine.continuous_errors = 0
            self.engine.suspend_end_time = 0
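# Worked example (with hypothetical settings values) of the suspend
# computation used by the processors above: each consecutive failure lengthens
# the ban linearly, capped at max_ban_time_on_fail.
def compute_suspended_time(continuous_errors, ban_time_on_fail=5, max_ban_time_on_fail=120):
    # e.g. 1 error -> 5 s, 10 errors -> 50 s, 30 errors -> capped at 120 s
    return min(max_ban_time_on_fail, continuous_errors * ban_time_on_fail)

assert compute_suspended_time(1) == 5
assert compute_suspended_time(10) == 50
assert compute_suspended_time(30) == 120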
def search_one_http_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False

    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time, page_load_time)
            with threading.RLock():
                engine.stats['engine_time'] += engine_time
                engine.stats['engine_time_count'] += 1
                # update stats with the total HTTP time
                engine.stats['page_load_time'] += page_load_time
                engine.stats['page_load_count'] += 1
    except Exception as e:
        record_exception(engine_name, e)

        # timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # record the errors
        with threading.RLock():
            engine.stats['errors'] += 1

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(engine_name, 'HTTP timeout')
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, engine_time, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(engine_name, 'HTTP error')
            # other requests exception
            logger.exception("engine {0} : requests exception "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, engine_time, timeout_limit, e))
            requests_exception = True
        elif issubclass(e.__class__, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
            logger.exception('engine {0} : CAPTCHA'.format(engine_name))
        else:
            result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
            # other errors
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
    else:
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(engine_name, 'Timeout')

    # suspend or not the engine if there are HTTP errors
    with threading.RLock():
        if requests_exception:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(
                settings['search']['max_ban_time_on_fail'],
                engine.continuous_errors * settings['search']['ban_time_on_fail'])
        else:
            # no HTTP error (perhaps an engine error)
            # anyway, reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
def search_one_http_request_safe(
    engine_name, query, request_params, result_container, start_time, timeout_limit
):
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False
    suspended_time = None

    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time, page_load_time)
            with threading.RLock():
                engine.stats["engine_time"] += engine_time
                engine.stats["engine_time_count"] += 1
                # update stats with the total HTTP time
                engine.stats["page_load_time"] += page_load_time
                engine.stats["page_load_count"] += 1
    except Exception as e:
        record_exception(engine_name, e)

        # timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # record the errors
        with threading.RLock():
            engine.stats["errors"] += 1

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(engine_name, "HTTP timeout")
            # requests timeout (connect or read)
            logger.error(
                "engine {0} : HTTP requests timeout "
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, engine_time, timeout_limit, e.__class__.__name__
                )
            )
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(engine_name, "HTTP error")
            # other requests exception
            logger.exception(
                "engine {0} : requests exception "
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, engine_time, timeout_limit, e
                )
            )
            requests_exception = True
        elif issubclass(e.__class__, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(engine_name, "CAPTCHA required")
            logger.exception("engine {0} : CAPTCHA".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineTooManyRequestsException):
            result_container.add_unresponsive_engine(engine_name, "too many requests")
            logger.exception("engine {0} : Too many requests".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineAccessDeniedException):
            result_container.add_unresponsive_engine(engine_name, "blocked")
            logger.exception("engine {0} : Searx is blocked".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        else:
            result_container.add_unresponsive_engine(engine_name, "unexpected crash")
            # other errors
            logger.exception("engine {0} : exception : {1}".format(engine_name, e))
    else:
        if getattr(threading.current_thread(), "_timeout", False):
            record_error(engine_name, "Timeout")

    # suspend the engine if there is an HTTP error
    # or suspended_time is defined
    with threading.RLock():
        if requests_exception or suspended_time:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            if suspended_time is None:
                suspended_time = min(
                    settings["search"]["max_ban_time_on_fail"],
                    engine.continuous_errors * settings["search"]["ban_time_on_fail"],
                )
            engine.suspend_end_time = time() + suspended_time
        else:
            # reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
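# Hedged sketch (an assumption about searx.exceptions, not copied from it):
# the CAPTCHA / too-many-requests / access-denied branches above read a
# 'suspended_time' attribute off the raised exception, so those classes are
# expected to look roughly like this. The default value is illustrative only.
class SearxEngineResponseException(Exception):
    """Base class for errors raised while talking to an engine."""

class SearxEngineAccessDeniedException(SearxEngineResponseException):
    # fallback suspension (in seconds) when the raiser does not pass one
    DEFAULT_SUSPENDED_TIME = 3600

    def __init__(self, suspended_time=None, message='Access denied'):
        super().__init__(message)
        # the search processors read this attribute to compute suspend_end_time
        self.suspended_time = suspended_time if suspended_time is not None else self.DEFAULT_SUSPENDED_TIME
        self.message = message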