def load_https_rules(rules_path):
    # check if directory exists
    if not isdir(rules_path):
        logger.error("directory not found: '" + rules_path + "'")
        return

    # search all xml files which are stored in the https rule directory
    xml_files = [
        join(rules_path, f)
        for f in listdir(rules_path)
        if isfile(join(rules_path, f)) and f[-4:] == '.xml'
    ]

    # load xml-files
    for ruleset_file in xml_files:
        # calculate rewrite-rules
        ruleset = load_single_https_ruleset(ruleset_file)

        # skip if no ruleset returned
        if not ruleset:
            continue

        # append ruleset
        https_rules.append(ruleset)

    logger.info('{n} rules loaded'.format(n=len(https_rules)))
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    engine = engines[engine_name]

    try:
        # send requests and parse the results
        search_results = search_one_request(engine, query, request_params, start_time, timeout_limit)

        # add results
        result_container.extend(engine_name, search_results)

        # update engine time when there is no exception
        with threading.RLock():
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1

        return True

    except Exception as e:
        engine.stats['errors'] += 1
        search_duration = time() - start_time
        requests_exception = False

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine((engine_name, gettext('timeout')))
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, search_duration, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine((engine_name, gettext('request exception')))
            # other requests exception
            logger.exception("engine {0} : requests exception "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, search_duration, timeout_limit, e))
            requests_exception = True
        else:
            result_container.add_unresponsive_engine((engine_name, gettext('unexpected crash')))
            # other errors
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))

        # update continuous_errors / suspend_end_time
        if requests_exception:
            with threading.RLock():
                engine.continuous_errors += 1
                engine.suspend_end_time = time() + min(60, engine.continuous_errors)

        return False
def load_engine(engine_data):
    """Load engine from ``engine_data``.

    :param dict engine_data:  Attributes from YAML ``settings:engines/<engine>``
    :return: initialized namespace of the ``<engine>``.

    1. create a namespace and load module of the ``<engine>``
    2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
    3. update namespace with values from ``engine_data``

    If engine *is active*, return namespace of the engine, otherwise return
    ``None``.

    This function also returns ``None`` if initialization of the namespace
    fails for one of the following reasons:

    - engine name contains underscore
    - engine name is not lowercase
    - required attribute is not set :py:func:`is_missing_required_attributes`
    """
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        return None

    if engine_name.lower() != engine_name:
        logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    # load_module
    engine_module = engine_data['engine']
    try:
        engine = load_module(engine_module + '.py', ENGINE_DIR)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
        logger.exception('Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except BaseException:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    update_engine_attributes(engine, engine_data)
    set_language_attributes(engine)
    update_attributes_for_tor(engine)

    if not is_engine_active(engine):
        return None

    if is_missing_required_attributes(engine):
        return None

    return engine
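A minimal usage sketch for the refactored ``load_engine`` above. The engine entry is a hypothetical stand-in for one item of ``settings:engines``, and the import path assumes the function lives in ``searx.engines``:

from searx.engines import load_engine

engine = load_engine({
    'name': 'example',        # hypothetical settings.yml entry
    'engine': 'duckduckgo',   # module name, i.e. searx/engines/duckduckgo.py
    'shortcut': 'ex',
})
if engine is None:
    print('engine is inactive or misconfigured')
else:
    print(engine.name, engine.categories)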
def initialize(engine_list):
    engines.initialize_engines(engine_list)

    for engine_name, engine in engines.engines.items():
        processor = get_processor(engine, engine_name)
        if processor is None:
            logger.error('Error getting processor for engine %s', engine_name)
        else:
            processors[engine_name] = processor
def load_engine(engine_data):
    engine_name = engine_data["engine"]
    engine = load_module(engine_name + ".py")

    for param_name in engine_data:
        if param_name == "engine":
            continue
        if param_name == "categories":
            if engine_data["categories"] == "none":
                engine.categories = []
            else:
                engine.categories = map(str.strip, engine_data["categories"].split(","))
            continue
        setattr(engine, param_name, engine_data[param_name])

    if not hasattr(engine, "paging"):
        engine.paging = False

    if not hasattr(engine, "categories"):
        engine.categories = ["general"]

    if not hasattr(engine, "language_support"):
        engine.language_support = True

    if not hasattr(engine, "safesearch"):
        engine.safesearch = False

    if not hasattr(engine, "timeout"):
        engine.timeout = settings["outgoing"]["request_timeout"]

    if not hasattr(engine, "shortcut"):
        engine.shortcut = ""

    if not hasattr(engine, "disabled"):
        engine.disabled = False

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith("_"):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        "result_count": 0,
        "search_count": 0,
        "page_load_time": 0,
        "score_count": 0,
        "errors": 0
    }

    if hasattr(engine, "categories"):
        for category_name in engine.categories:
            categories.setdefault(category_name, []).append(engine)
    else:
        categories["general"].append(engine)

    if engine.shortcut:
        if engine.shortcut in engine_shortcuts:
            logger.error("Engine config error: ambiguous shortcut: {0}".format(engine.shortcut))
            sys.exit(1)
        engine_shortcuts[engine.shortcut] = engine.name

    return engine
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    vidthumb_imgdata = scrap_out_thumbs(dom)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
        url = eval_xpath_getindex(result, href_xpath, 0)
        c_node = eval_xpath_getindex(result, content_xpath, 0)

        # <img id="vidthumb1" ...>
        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
        if img_id is None:
            continue
        img_src = vidthumb_imgdata.get(img_id, None)
        if not img_src:
            logger.error("no vidthumb imgdata for: %s" % img_id)
            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)

        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'length': length,
            'author': pub_info,
            'thumbnail': img_src,
            'template': 'videos.html',
        })

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    return results
def initialize(engine_list):
    """Initialize all engines and store a processor for each engine in :py:obj:`processors`."""
    engines.initialize_engines(engine_list)

    for engine_name, engine in engines.engines.items():
        processor = get_processor(engine, engine_name)
        if processor is None:
            logger.error('Error getting processor for engine %s', engine_name)
        else:
            processors[engine_name] = processor
def initialize(engine_list):
    """Initialize all engines and store a processor for each engine in :py:obj:`PROCESSORS`."""
    for engine_data in engine_list:
        engine_name = engine_data['name']
        engine = engines.engines.get(engine_name)
        if engine:
            processor = get_processor(engine, engine_name)
            if processor is None:
                logger.error('Error getting processor for engine %s', engine_name)
            else:
                # initialize only once we know the processor exists
                initialize_processor(processor)
                PROCESSORS[engine_name] = processor
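A hedged sketch of how the ``PROCESSORS`` variant above is used: once ``initialize`` has run (and assuming the engines themselves were loaded beforehand), a processor can be looked up by engine name. The import path ``searx.search.processors`` and the engine name are assumptions for illustration:

from searx import settings
from searx.search.processors import PROCESSORS, initialize

initialize(settings['engines'])
processor = PROCESSORS.get('duckduckgo')  # hypothetical engine name
if processor is not None:
    print(type(processor).__name__)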
def load_engine(engine_data):
    engine_name = engine_data['engine']
    try:
        engine = load_module(engine_name + '.py')
    except Exception:
        logger.exception('Cannot load engine "{}"'.format(engine_name))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = map(str.strip, engine_data['categories'].split(','))
            continue
        setattr(engine, param_name, engine_data[param_name])

    for arg_name, arg_value in engine_default_args.iteritems():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)
    engine_shortcuts[engine.shortcut] = engine.name

    return engine
def is_missing_required_attributes(engine):
    """An attribute is required when its name doesn't start with ``_`` (underscore).
    Required attributes must not be ``None``.
    """
    missing = False
    for engine_attr in dir(engine):
        if not engine_attr.startswith('_') and getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            missing = True
    return missing
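An illustration (not searx code) of the rule ``is_missing_required_attributes`` enforces: any attribute that is ``None`` and not underscore-prefixed is reported. A ``SimpleNamespace`` stands in for an engine module; the module-level ``logger`` is assumed to be configured:

from types import SimpleNamespace

fake_engine = SimpleNamespace(name='example', shortcut=None, _private=None)
# 'shortcut' is None and has no leading underscore, so it is reported
# and the call returns True; '_private' is skipped.
print(is_missing_required_attributes(fake_engine))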
def register_engine(engine):
    if engine.name in engines:
        logger.error('Engine config error: ambiguous name: {0}'.format(engine.name))
        sys.exit(1)
    engines[engine.name] = engine

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)
    engine_shortcuts[engine.shortcut] = engine.name

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)
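A sketch of the three module-level registries ``register_engine`` fills, assuming they start out empty in the same module; the engine object is a hypothetical stand-in:

from types import SimpleNamespace

engine = SimpleNamespace(name='example', shortcut='ex', categories=['general'])
register_engine(engine)
# afterwards:
#   engines == {'example': engine}
#   engine_shortcuts == {'ex': 'example'}
#   categories == {'general': [engine]}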
def parse_album(hit):
    result = {
        'url': hit['result']['url'],
        'title': hit['result']['full_title'],
        'thumbnail': hit['result']['cover_art_url'],
        'content': '',
    }
    try:
        year = hit['result']['release_date_components']['year']
    except Exception as e:  # pylint: disable=broad-except
        logger.error(e, exc_info=True)
    else:
        if year:
            result.update({'content': 'Released: {}'.format(year)})
    return result
def parse_lyric(hit):
    try:
        content = hit['highlights'][0]['value']
    except Exception as e:  # pylint: disable=broad-except
        logger.error(e, exc_info=True)
        content = ''

    timestamp = hit['result']['lyrics_updated_at']
    result = {
        'url': hit['result']['url'],
        'title': hit['result']['full_title'],
        'content': content,
        'thumbnail': hit['result']['song_art_image_thumbnail_url'],
    }
    if timestamp:
        result.update({'publishedDate': datetime.fromtimestamp(timestamp)})
    return result
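A sketch of the hit structure the two Genius parsers above expect, inferred from the keys they access; all values are made up, and ``datetime`` is assumed to be imported as in the module:

hit = {
    'highlights': [{'value': 'first matching lyric line'}],
    'result': {
        'url': 'https://genius.com/example-lyrics',
        'full_title': 'Example Song by Example Artist',
        'song_art_image_thumbnail_url': 'https://images.genius.com/thumb.jpg',
        'lyrics_updated_at': 1609459200,
    },
}
result = parse_lyric(hit)
# result['content'] == 'first matching lyric line'
# result['publishedDate'] == datetime.fromtimestamp(1609459200)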
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    engine = engines[engine_name]

    try:
        # send requests and parse the results
        search_results = search_one_request(engine, query, request_params, start_time, timeout_limit)

        # add results
        result_container.extend(engine_name, search_results)

        # update engine time when there is no exception
        with threading.RLock():
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1

        return True

    except Exception as e:
        engine.stats['errors'] += 1
        search_duration = time() - start_time
        requests_exception = False

        if issubclass(e.__class__, requests.exceptions.Timeout):
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, search_duration, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            # other requests exception
            logger.exception("engine {0} : requests exception "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, search_duration, timeout_limit, e))
            requests_exception = True
        else:
            # other errors
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))

        # update continuous_errors / suspend_end_time
        if requests_exception:
            with threading.RLock():
                engine.continuous_errors += 1
                engine.suspend_end_time = time() + min(60, engine.continuous_errors)

        return False
def search(self, query, params, result_container, start_time, timeout_limit):
    # set timeout for all HTTP requests
    searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    searx.network.reset_time_for_thread()
    # set the network
    searx.network.set_context_network_name(self.engine_name)

    try:
        # send requests and parse the results
        search_results = self._search_basic(query, params)
        self.extend_container(result_container, start_time, search_results)
    except (httpx.TimeoutException, asyncio.TimeoutError) as e:
        # requests timeout (connect or read)
        self.handle_exception(result_container, e, suspend=True)
        logger.error("engine {0} : HTTP requests timeout "
                     "(search duration : {1} s, timeout: {2} s) : {3}"
                     .format(self.engine_name, default_timer() - start_time,
                             timeout_limit, e.__class__.__name__))
    except (httpx.HTTPError, httpx.StreamError) as e:
        # other requests exception
        self.handle_exception(result_container, e, suspend=True)
        logger.exception("engine {0} : requests exception "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(self.engine_name, default_timer() - start_time,
                                 timeout_limit, e))
    except SearxEngineCaptchaException as e:
        self.handle_exception(result_container, e, suspend=True)
        logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
    except SearxEngineTooManyRequestsException as e:
        self.handle_exception(result_container, e, suspend=True)
        logger.exception('engine {0} : Too many requests'.format(self.engine_name))
    except SearxEngineAccessDeniedException as e:
        self.handle_exception(result_container, e, suspend=True)
        logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))
    except Exception as e:  # pylint: disable=broad-except
        self.handle_exception(result_container, e)
        logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
def _is_url_image(image_url):
    if not isinstance(image_url, str):
        return False

    if image_url.startswith('//'):
        image_url = 'https:' + image_url

    if image_url.startswith('data:'):
        return image_url.startswith('data:image/')

    if not _is_url(image_url):
        return False

    retry = 2

    while retry > 0:
        a = time()
        try:
            network.set_timeout_for_thread(10.0, time())
            r = network.get(image_url, timeout=10.0, allow_redirects=True, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                'Accept-Encoding': 'gzip, deflate, br',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Sec-GPC': '1',
                'Cache-Control': 'max-age=0'
            })
            if r.headers["content-type"].startswith('image/'):
                return True
            return False
        except httpx.TimeoutException:
            logger.error('Timeout for %s: %i', image_url, int(time() - a))
            retry -= 1
        except httpx.HTTPError:
            logger.exception('Exception for %s', image_url)
            return False

    # retries exhausted: return an explicit bool instead of falling through to None
    return False
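Hedged usage notes for ``_is_url_image``: non-string values and ``data:`` URLs are decided locally, everything else costs up to two GET requests with a 10 s timeout each. The last URL is a placeholder:

print(_is_url_image(None))                               # False, not a str
print(_is_url_image('data:image/png;base64,iVBORw0='))   # True, no request made
print(_is_url_image('//example.org/logo.png'))           # issues a real GET for https://example.org/logo.png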
def load_engine(engine_data):
    if "_" in engine_data["name"]:
        logger.error('Engine name contains underscore: "{}"'.format(engine_data["name"]))
        sys.exit(1)

    engine_module = engine_data["engine"]
    try:
        engine = load_module(engine_module + ".py")
    except Exception:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == "engine":
            continue
        if param_name == "categories":
            if engine_data["categories"] == "none":
                engine.categories = []
            else:
                engine.categories = map(str.strip, engine_data["categories"].split(","))
            continue
        setattr(engine, param_name, engine_data[param_name])

    for arg_name, arg_value in engine_default_args.iteritems():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith("_"):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        "result_count": 0,
        "search_count": 0,
        "page_load_time": 0,
        "score_count": 0,
        "errors": 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error("Engine config error: ambiguous shortcut: {0}".format(engine.shortcut))
        sys.exit(1)
    engine_shortcuts[engine.shortcut] = engine.name

    return engine
def search_one_http_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False
    suspended_time = None

    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time, page_load_time)
            with threading.RLock():
                engine.stats["engine_time"] += engine_time
                engine.stats["engine_time_count"] += 1
                # update stats with the total HTTP time
                engine.stats["page_load_time"] += page_load_time
                engine.stats["page_load_count"] += 1

    except Exception as e:
        record_exception(engine_name, e)

        # Timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # Record the errors
        with threading.RLock():
            engine.stats["errors"] += 1

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(engine_name, "HTTP timeout")
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, engine_time, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(engine_name, "HTTP error")
            # other requests exception
            logger.exception("engine {0} : requests exception "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, engine_time, timeout_limit, e))
            requests_exception = True
        elif issubclass(e.__class__, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(engine_name, "CAPTCHA required")
            logger.exception("engine {0} : CAPTCHA".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineTooManyRequestsException):
            result_container.add_unresponsive_engine(engine_name, "too many requests")
            logger.exception("engine {0} : Too many requests".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineAccessDeniedException):
            result_container.add_unresponsive_engine(engine_name, "blocked")
            logger.exception("engine {0} : Searx is blocked".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        else:
            result_container.add_unresponsive_engine(engine_name, "unexpected crash")
            # other errors
            logger.exception("engine {0} : exception : {1}".format(engine_name, e))
    else:
        if getattr(threading.current_thread(), "_timeout", False):
            record_error(engine_name, "Timeout")

    # suspend the engine if there is an HTTP error
    # or suspended_time is defined
    with threading.RLock():
        if requests_exception or suspended_time:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            if suspended_time is None:
                suspended_time = min(
                    settings["search"]["max_ban_time_on_fail"],
                    engine.continuous_errors * settings["search"]["ban_time_on_fail"],
                )
            engine.suspend_end_time = time() + suspended_time
        else:
            # reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
def search(self, query, params, result_container, start_time, timeout_limit):
    # set timeout for all HTTP requests
    poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    poolrequests.reset_time_for_thread()
    # enable HTTP only if explicitly enabled
    poolrequests.set_enable_http_protocol(self.engine.enable_http)

    # suppose everything will be alright
    requests_exception = False
    suspended_time = None

    try:
        # send requests and parse the results
        search_results = self._search_basic(query, params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(self.engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = poolrequests.get_time_for_thread()
            result_container.add_timing(self.engine_name, engine_time, page_load_time)
            with threading.RLock():
                self.engine.stats['engine_time'] += engine_time
                self.engine.stats['engine_time_count'] += 1
                # update stats with the total HTTP time
                self.engine.stats['page_load_time'] += page_load_time
                self.engine.stats['page_load_count'] += 1

    except Exception as e:
        record_exception(self.engine_name, e)

        # Timing
        engine_time = time() - start_time
        page_load_time = poolrequests.get_time_for_thread()
        result_container.add_timing(self.engine_name, engine_time, page_load_time)

        # Record the errors
        with threading.RLock():
            self.engine.stats['errors'] += 1

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(self.engine_name, engine_time, timeout_limit,
                                 e.__class__.__name__))
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')
            # other requests exception
            logger.exception("engine {0} : requests exception "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(self.engine_name, engine_time, timeout_limit, e))
            requests_exception = True
        elif issubclass(e.__class__, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
            logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineTooManyRequestsException):
            result_container.add_unresponsive_engine(self.engine_name, 'too many requests')
            logger.exception('engine {0} : Too many requests'.format(self.engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif issubclass(e.__class__, SearxEngineAccessDeniedException):
            result_container.add_unresponsive_engine(self.engine_name, 'blocked')
            logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        else:
            result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash')
            # other errors
            logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
    else:
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(self.engine_name, 'Timeout')

    # suspend the engine if there is an HTTP error
    # or suspended_time is defined
    with threading.RLock():
        if requests_exception or suspended_time:
            # update continuous_errors / suspend_end_time
            self.engine.continuous_errors += 1
            if suspended_time is None:
                suspended_time = min(settings['search']['max_ban_time_on_fail'],
                                     self.engine.continuous_errors * settings['search']['ban_time_on_fail'])
            self.engine.suspend_end_time = time() + suspended_time
        else:
            # reset the suspend variables
            self.engine.continuous_errors = 0
            self.engine.suspend_end_time = 0
            engine['percentage'] = 0

    for engine in errors:
        if max_errors:
            engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
        else:
            engine['percentage'] = 0

    return [
        (gettext('Page loads (sec)'), sorted(pageloads, key=itemgetter('avg'))),
        (gettext('Number of results'), sorted(results, key=itemgetter('avg'), reverse=True)),
        (gettext('Scores'), sorted(scores, key=itemgetter('avg'), reverse=True)),
        (gettext('Scores per result'), sorted(scores_per_result, key=itemgetter('avg'), reverse=True)),
        (gettext('Errors'), sorted(errors, key=itemgetter('avg'), reverse=True)),
    ]


if 'engines' not in settings or not settings['engines']:
    logger.error('No engines found. Edit your settings.yml')
    exit(2)

for engine_data in settings['engines']:
    engine = load_engine(engine_data)
    if engine is not None:
        engines[engine.name] = engine
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not find 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0]
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})
    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not find 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath(result, title_xpath)
            if not title_tag:
                # this is not one of the common google results *sections*
                logger.debug('ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag[0])
            url = eval_xpath(result, href_xpath)[0]
            content = extract_text_from_dom(result, content_xpath)
            results.append({
                'url': url,
                'title': title,
                'content': content
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False

    try:
        # send requests and parse the results
        search_results = search_one_request(engine, query, request_params)

        # add results
        result_container.extend(engine_name, search_results)

        # update engine time when there is no exception
        with threading.RLock():
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1
            # update stats with the total HTTP time
            engine.stats['page_load_time'] += requests_lib.get_time_for_thread()
            engine.stats['page_load_count'] += 1

    except Exception as e:
        search_duration = time() - start_time

        with threading.RLock():
            engine.stats['errors'] += 1

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine((engine_name, gettext('timeout')))
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, search_duration, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine((engine_name, gettext('request exception')))
            # other requests exception
            logger.exception("engine {0} : requests exception "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, search_duration, timeout_limit, e))
            requests_exception = True
        else:
            result_container.add_unresponsive_engine((
                engine_name,
                u'{0}: {1}'.format(gettext('unexpected crash'), e),
            ))
            # other errors
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))

    # suspend or not the engine if there are HTTP errors
    with threading.RLock():
        if requests_exception:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60, engine.continuous_errors)
        else:
            # no HTTP error (perhaps an engine error)
            # anyway, reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
from searx.autocomplete import searx_bang, backends as autocomplete_backends
from searx.plugins import plugins
from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
from searx.answerers import answerers
from searx.poolrequests import get_global_proxies
from searx.metrology.error_recorder import errors_per_engines

# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))

# check secret_key
if not searx_debug and settings['server']['secret_key'] == 'ultrasecretkey':
    logger.error('server.secret_key is not changed. Please use something else instead of ultrasecretkey.')
    exit(1)

# about static
static_path = get_resources_directory(searx_dir, 'static', settings['ui']['static_path'])
logger.debug('static directory is %s', static_path)
static_files = get_static_files(static_path)

# about templates
default_theme = settings['ui']['default_theme']
templates_path = get_resources_directory(searx_dir, 'templates', settings['ui']['templates_path'])
logger.debug('templates directory is %s', templates_path)
themes = get_themes(templates_path)
result_templates = get_result_templates(templates_path)
global_favicons = []
for indice, theme in enumerate(themes):
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not find 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})
    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not find 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug('ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0)
            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def load_engine(engine_data):
    if '_' in engine_data['name']:
        logger.error('Engine name contains underscore: "{}"'.format(engine_data['name']))
        sys.exit(1)

    engine_module = engine_data['engine']
    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except Exception:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(engine, 'fetch_supported_languages',
                lambda: engine._fetch_supported_languages(get(engine.supported_languages_url)))

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'page_load_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)
    engine_shortcuts[engine.shortcut] = engine.name

    return engine
def response(resp):
    """Get response from google's search request"""

    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath_getindex(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is given
    # first link per image-div contains a <img> with the data-iid for base64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):

        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

        img_base64_id = eval_xpath(img_node, '@data-iid')
        if img_base64_id:
            img_base64_id = img_base64_id[0]
            thumbnail_src = img_bas64_map[img_base64_id]
        else:
            thumbnail_src = eval_xpath(img_node, '@src')
            if not thumbnail_src:
                thumbnail_src = eval_xpath(img_node, '@data-src')
            if thumbnail_src:
                thumbnail_src = thumbnail_src[0]
            else:
                thumbnail_src = ''

        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
        url = eval_xpath_getindex(link_node, '@href', 0)

        pub_nodes = eval_xpath(link_node, './div/div')
        pub_descr = img_alt
        pub_source = ''
        if pub_nodes:
            pub_descr = extract_text(pub_nodes[0])
            pub_source = extract_text(pub_nodes[1])

        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
        src_url = scrap_img_by_id(img_src_script, img_src_id)
        if not src_url:
            src_url = thumbnail_src

        results.append({
            'url': url,
            'title': img_alt,
            'content': pub_descr,
            'source': pub_source,
            'img_src': src_url,
            # 'img_format': img_format,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    return results
def load_engine(engine_data):
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']
    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
        logger.exception('Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except BaseException:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(engine, 'fetch_supported_languages',
                lambda: engine._fetch_supported_languages(get(engine.supported_languages_url)))

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    if not engine.offline:
        engine.stats['page_load_time'] = 0
        engine.stats['page_load_count'] = 0

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)
    engine_shortcuts[engine.shortcut] = engine.name

    return engine
def search_one_http_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False

    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time, page_load_time)
            with threading.RLock():
                engine.stats['engine_time'] += engine_time
                engine.stats['engine_time_count'] += 1
                # update stats with the total HTTP time
                engine.stats['page_load_time'] += page_load_time
                engine.stats['page_load_count'] += 1

    except Exception as e:
        record_exception(engine_name, e)

        # Timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # Record the errors
        with threading.RLock():
            engine.stats['errors'] += 1

        if issubclass(e.__class__, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(engine_name, 'HTTP timeout')
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout "
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, engine_time, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif issubclass(e.__class__, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(engine_name, 'HTTP error')
            # other requests exception
            logger.exception("engine {0} : requests exception "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, engine_time, timeout_limit, e))
            requests_exception = True
        elif issubclass(e.__class__, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
            logger.exception('engine {0} : CAPTCHA'.format(engine_name))
        else:
            result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
            # other errors
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
    else:
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(engine_name, 'Timeout')

    # suspend or not the engine if there are HTTP errors
    with threading.RLock():
        if requests_exception:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(
                settings['search']['max_ban_time_on_fail'],
                engine.continuous_errors * settings['search']['ban_time_on_fail'])
        else:
            # no HTTP error (perhaps an engine error)
            # anyway, reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == "sorry.google.com" or resp_url.path == "/sorry/IndexRedirect":
        raise SearxEngineCaptchaException()

    if resp_url.path.startswith("/sorry"):
        raise SearxEngineCaptchaException()

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is given
    # first link per image-div contains a <img> with the data-iid for base64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, "@alt")[0]

            img_base64_id = eval_xpath(img_node, "@data-iid")
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, "@src")
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, "@data-src")
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ""

            link_node = eval_xpath(img_node, "../../../a[2]")[0]
            url = eval_xpath(link_node, "@href")[0]

            pub_nodes = eval_xpath(link_node, "./div/div")
            pub_descr = img_alt
            pub_source = ""
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            img_src_id = eval_xpath(img_node, "../../../@data-id")[0]
            src_url = scrap_img_by_id(img_src_script, img_src_id)
            if not src_url:
                src_url = thumbnail_src

            results.append({
                "url": url,
                "title": img_alt,
                "content": pub_descr,
                "source": pub_source,
                "img_src": src_url,
                "img_format": {
                    "width": int(eval_xpath(img_node, "@width")[0]),
                    "height": int(eval_xpath(img_node, "@height")[0]),
                },
                "thumbnail_src": thumbnail_src,
                "template": "images.html",
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
    for engine in scores_per_result:
        if max_score_per_result:
            engine["percentage"] = int(engine["avg"] / max_score_per_result * 100)
        else:
            engine["percentage"] = 0

    for engine in errors:
        if max_errors:
            engine["percentage"] = int(float(engine["avg"]) / max_errors * 100)
        else:
            engine["percentage"] = 0

    return [
        (gettext("Page loads (sec)"), sorted(pageloads, key=itemgetter("avg"))),
        (gettext("Number of results"), sorted(results, key=itemgetter("avg"), reverse=True)),
        (gettext("Scores"), sorted(scores, key=itemgetter("avg"), reverse=True)),
        (gettext("Scores per result"), sorted(scores_per_result, key=itemgetter("avg"), reverse=True)),
        (gettext("Errors"), sorted(errors, key=itemgetter("avg"), reverse=True)),
    ]


if "engines" not in settings or not settings["engines"]:
    logger.error("No engines found. Edit your settings.yml")
    exit(2)

for engine_data in settings["engines"]:
    engine = load_engine(engine_data)
    if engine is not None:
        engines[engine.name] = engine
def load_engine(engine_data):
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']
    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
        logger.exception('Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except BaseException:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name, param_value in engine_data.items():
        if param_name == 'engine':
            pass
        elif param_name == 'categories':
            if param_value == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, param_value.split(',')))
        else:
            setattr(engine, param_name, param_value)

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in ENGINES_LANGUAGES:
        setattr(engine, 'supported_languages', ENGINES_LANGUAGES[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # language_support
    setattr(engine, 'language_support', len(getattr(engine, 'supported_languages', [])) > 0)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        headers = {
            'User-Agent': gen_useragent(),
            'Accept-Language': 'ja-JP,ja;q=0.8,en-US;q=0.5,en;q=0.3',  # bing needs a non-English language
        }
        setattr(engine, 'fetch_supported_languages',
                lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers)))

    # tor related settings
    if settings['outgoing'].get('using_tor_proxy'):
        # use onion url if using tor.
        if hasattr(engine, 'onion_url'):
            engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
    elif 'onions' in engine.categories:
        # exclude onion engines if not using tor.
        return None

    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)
    engine_shortcuts[engine.shortcut] = engine.name

    return engine
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is given
    #     (we do not blow out the link there, you could still implement that)
    # first link per image-div contains a <img> with the data-iid for base64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
        (
            gettext('Page loads (sec)'),
            sorted(pageloads, key=itemgetter('avg'))
        ),
        (
            gettext('Number of results'),
            sorted(results, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores'),
            sorted(scores, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores per result'),
            sorted(scores_per_result, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Errors'),
            sorted(errors, key=itemgetter('avg'), reverse=True)
        ),
    ]


if 'engines' not in settings or not settings['engines']:
    logger.error('No engines found. Edit your settings.yml')
    exit(2)

for engine_data in settings['engines']:
    engine = load_engine(engine_data)
    engines[engine.name] = engine
def load_engine(engine_data):
    engine_name = engine_data['engine']
    engine = load_module(engine_name + '.py')

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = map(str.strip, engine_data['categories'].split(','))
            continue
        setattr(engine, param_name, engine_data[param_name])

    if not hasattr(engine, 'paging'):
        engine.paging = False

    if not hasattr(engine, 'categories'):
        engine.categories = ['general']

    if not hasattr(engine, 'language_support'):
        engine.language_support = True

    if not hasattr(engine, 'timeout'):
        engine.timeout = settings['server']['request_timeout']

    if not hasattr(engine, 'shortcut'):
        engine.shortcut = ''

    if not hasattr(engine, 'disabled'):
        engine.disabled = False

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }

    if hasattr(engine, 'categories'):
        for category_name in engine.categories:
            categories.setdefault(category_name, []).append(engine)
    else:
        categories['general'].append(engine)

    if engine.shortcut:
        if engine.shortcut in engine_shortcuts:
            logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
            sys.exit(1)
        engine_shortcuts[engine.shortcut] = engine.name

    return engine
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
        answer_list = [_.xpath("normalize-space()") for _ in answer_list]
        results.append({'answer': ' '.join(answer_list)})
    else:
        logger.debug("did not find 'answer'")

    # results --> number_of_results
    if not use_mobile_ui:
        try:
            _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
            _digit = ''.join([n for n in _txt if n.isdigit()])
            number_of_results = int(_digit)
            results.append({'number_of_results': number_of_results})
        except Exception as e:  # pylint: disable=broad-except
            logger.debug("did not find 'number_of_results'")
            logger.error(e, exc_info=True)

    # parse results
    _results_xpath = results_xpath
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug('ignoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)

            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue

            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
            if content is None:
                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                continue

            logger.debug('add link to results: %s', title)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results