Beispiel #1
0
def load_https_rules(rules_path):
    """Load all HTTPS rewrite rulesets stored in ``rules_path``.

    Every regular file directly inside ``rules_path`` whose name ends with
    ``.xml`` is handed to ``load_single_https_ruleset``; each truthy ruleset
    is appended to the module-level ``https_rules`` list.  When
    ``rules_path`` is not a directory, an error is logged and nothing is
    loaded.
    """
    if not isdir(rules_path):
        logger.error("directory not found: '" + rules_path + "'")
        return

    # collect the xml files of the rule directory (sub-directories are skipped)
    candidates = []
    for entry in listdir(rules_path):
        full_path = join(rules_path, entry)
        if isfile(full_path) and entry[-4:] == '.xml':
            candidates.append(full_path)

    for xml_path in candidates:
        parsed = load_single_https_ruleset(xml_path)
        # a falsy return value means the file produced no usable ruleset
        if parsed:
            https_rules.append(parsed)

    logger.info('{n} rules loaded'.format(n=len(https_rules)))
def search_one_request_safe(engine_name, query, request_params,
                            result_container, start_time, timeout_limit):
    """Run one engine request without letting an exception escape.

    On success the results are added to ``result_container``, the engine
    timing statistics are updated and ``True`` is returned.

    On any ``Exception`` the engine's error counter is incremented, the engine
    is reported as unresponsive to ``result_container``, the failure is logged
    and ``False`` is returned.  For ``requests`` exceptions the engine is
    additionally suspended for ``min(60, continuous_errors)`` seconds.
    """
    engine = engines[engine_name]

    try:
        # send requests, parse the results and hand them to the container
        results = search_one_request(engine, query, request_params,
                                     start_time, timeout_limit)
        result_container.extend(engine_name, results)

        # NOTE(review): threading.RLock() creates a new lock object on each
        # call, so this block is presumably not excluding other threads.
        with threading.RLock():
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1

        return True

    except Exception as e:
        engine.stats['errors'] += 1
        elapsed = time() - start_time

        timed_out = isinstance(e, requests.exceptions.Timeout)
        request_failed = timed_out or isinstance(
            e, requests.exceptions.RequestException)

        if timed_out:
            # requests timeout (connect or read)
            result_container.add_unresponsive_engine(
                (engine_name, gettext('timeout')))
            logger.error(
                "engine {0} : HTTP requests timeout"
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, elapsed, timeout_limit,
                    e.__class__.__name__))
        elif request_failed:
            # any other requests exception
            result_container.add_unresponsive_engine(
                (engine_name, gettext('request exception')))
            logger.exception(
                "engine {0} : requests exception"
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, elapsed, timeout_limit, e))
        else:
            # everything that is not a requests exception
            result_container.add_unresponsive_engine(
                (engine_name, gettext('unexpected crash')))
            logger.exception('engine {0} : exception : {1}'.format(
                engine_name, e))

        # only requests exceptions count towards the suspension of the engine
        if request_failed:
            with threading.RLock():
                engine.continuous_errors += 1
                engine.suspend_end_time = time() + min(
                    60, engine.continuous_errors)

        return False
Beispiel #3
0
def load_engine(engine_data):
    """Load engine from ``engine_data``.

    :param dict engine_data:  Attributes from YAML ``settings:engines/<engine>``
    :return: initialized namespace of the ``<engine>``.

    1. create a namespace and load module of the ``<engine>``
    2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
    3. update namespace with values from ``engine_data``

    If engine *is active*, return namespace of the engine, otherwise return
    ``None``.

    An engine name that is not lowercase is accepted: a warning is logged and
    the lowercased name is written back to ``engine_data['name']``.

    This function also returns ``None`` if initialization of the namespace fails
    for one of the following reasons:

    - engine name contains underscore
    - loading the engine module raises a non fatal exception
    - required attribute is not set :py:func:`is_missing_required_attributes`

    When loading the module raises ``SyntaxError``, ``KeyboardInterrupt``,
    ``SystemExit``, ``SystemError``, ``ImportError`` or ``RuntimeError`` the
    exception is logged and the process exits with status 1.

    """

    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error(
            'Engine name contains underscore: "{}"'.format(engine_name))
        return None

    if engine_name.lower() != engine_name:
        # ``Logger.warn`` is a deprecated alias, ``warning`` is the supported name
        logger.warning(
            'Engine name is not lowercase: "{}", converting to lowercase'.
            format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    # load_module
    engine_module = engine_data['engine']
    try:
        engine = load_module(engine_module + '.py', ENGINE_DIR)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError,
            ImportError, RuntimeError):
        # these are treated as fatal: stop the whole process
        logger.exception(
            'Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except BaseException:
        # anything else only disables this single engine
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    update_engine_attributes(engine, engine_data)
    set_language_attributes(engine)
    update_attributes_for_tor(engine)

    if not is_engine_active(engine):
        return None

    if is_missing_required_attributes(engine):
        return None

    return engine
Beispiel #4
0
def initialize(engine_list):
    """Initialize the engines and register one processor per engine.

    ``engines.initialize_engines`` is called with ``engine_list``; afterwards
    a processor is requested for every entry of ``engines.engines`` and stored
    in ``processors`` under the engine name.  Engines for which no processor
    is returned are logged and left out.
    """
    engines.initialize_engines(engine_list)
    for name, loaded_engine in engines.engines.items():
        engine_processor = get_processor(loaded_engine, name)
        if engine_processor is not None:
            processors[name] = engine_processor
            continue
        logger.error('Error get processor for engine %s', name)
Beispiel #5
0
def load_engine(engine_data):
    """Load the engine module named in ``engine_data`` and configure it.

    Every key of ``engine_data`` except ``"engine"`` is copied onto the loaded
    module.  ``"categories"`` is a comma separated string that is split into a
    list of stripped names; the special value ``"none"`` gives an empty list.
    Attributes the module does not define afterwards receive defaults.

    The engine is appended to the module-level ``categories`` mapping for each
    of its categories, and a non-empty shortcut is recorded in
    ``engine_shortcuts``.

    The process exits with status 1 when a public attribute of the engine is
    ``None`` or when the shortcut is already taken by another engine.

    :param engine_data: mapping with the engine settings
    :return: the configured engine module
    """
    engine_name = engine_data["engine"]
    engine = load_module(engine_name + ".py")

    for param_name in engine_data:
        if param_name == "engine":
            continue
        if param_name == "categories":
            if engine_data["categories"] == "none":
                engine.categories = []
            else:
                # build a real list: map() is a one-shot iterator on Python 3
                # and would be empty after the first loop over the categories
                engine.categories = [
                    name.strip() for name in engine_data["categories"].split(",")
                ]
            continue
        setattr(engine, param_name, engine_data[param_name])

    # defaults for attributes neither the module nor the settings define
    if not hasattr(engine, "paging"):
        engine.paging = False

    if not hasattr(engine, "categories"):
        engine.categories = ["general"]

    if not hasattr(engine, "language_support"):
        engine.language_support = True

    if not hasattr(engine, "safesearch"):
        engine.safesearch = False

    if not hasattr(engine, "timeout"):
        engine.timeout = settings["outgoing"]["request_timeout"]

    if not hasattr(engine, "shortcut"):
        engine.shortcut = ""

    if not hasattr(engine, "disabled"):
        engine.disabled = False

    # checking required variables: a public attribute left at None is fatal
    for engine_attr in dir(engine):
        if engine_attr.startswith("_"):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {"result_count": 0, "search_count": 0, "page_load_time": 0, "score_count": 0, "errors": 0}

    # ``engine.categories`` always exists at this point (see default above)
    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut:
        if engine.shortcut in engine_shortcuts:
            logger.error("Engine config error: ambigious shortcut: {0}".format(engine.shortcut))
            sys.exit(1)
        engine_shortcuts[engine.shortcut] = engine.name
    return engine
Beispiel #6
0
def response(resp):
    """Get response from google's search request"""
    detect_google_sorry(resp)

    # convert the text to dom and collect the thumbnail data keyed by image id
    dom = html.fromstring(resp.text)
    thumbs_by_id = scrap_out_thumbs(dom)

    results = []

    # parse results
    for node in eval_xpath_list(dom, results_xpath):

        # google *sections* are skipped
        section = eval_xpath(node, g_section_with_header)
        if extract_text(section):
            logger.debug("ingoring <g-section-with-header>")
            continue

        title = extract_text(eval_xpath_getindex(node, title_xpath, 0))
        url = eval_xpath_getindex(node, href_xpath, 0)
        content_node = eval_xpath_getindex(node, content_xpath, 0)

        # <img id="vidthumb1" ...>; results without such an image are dropped
        thumb_id = eval_xpath_getindex(
            content_node, './div[1]//a/g-img/img/@id', 0, default=None)
        if thumb_id is None:
            continue

        thumbnail = thumbs_by_id.get(thumb_id)
        if not thumbnail:
            # fall back to the @src attribute of the image
            logger.error("no vidthumb imgdata for: %s" % thumb_id)
            thumbnail = eval_xpath_getindex(
                content_node, './div[1]//a/g-img/img/@src', 0)

        length = extract_text(eval_xpath(content_node, './/div[1]//a/div[3]'))
        content = extract_text(eval_xpath(content_node, './/div[2]/span'))
        pub_info = extract_text(eval_xpath(content_node, './/div[2]/div'))

        video = {
            'url': url,
            'title': title,
            'content': content,
            'length': length,
            'author': pub_info,
            'thumbnail': thumbnail,
            'template': 'videos.html',
        }
        results.append(video)

    # parse suggestion
    results.extend(
        {'suggestion': extract_text(item)}
        for item in eval_xpath_list(dom, suggestion_xpath))

    # parse spelling correction
    results.extend(
        {'correction': extract_text(item)}
        for item in eval_xpath_list(dom, spelling_suggestion_xpath))

    return results
Beispiel #7
0
def initialize(engine_list):
    """Initialize all engines and store a processor for each engine in :py:obj:`processors`.

    Engines for which ``get_processor`` returns ``None`` are logged as an
    error and get no entry in :py:obj:`processors`.
    """
    engines.initialize_engines(engine_list)
    for name, loaded_engine in engines.engines.items():
        engine_processor = get_processor(loaded_engine, name)
        if engine_processor is not None:
            processors[name] = engine_processor
            continue
        logger.error('Error get processor for engine %s', name)
Beispiel #8
0
def initialize(engine_list):
    """Initialize all engines and store a processor for each engine in :py:obj:`PROCESSORS`.

    Entries of ``engine_list`` whose ``name`` has no (truthy) match in
    ``engines.engines`` are ignored.  When ``get_processor`` returns ``None``
    an error is logged and the engine is skipped; otherwise the processor is
    initialized and registered under the engine name.
    """
    for engine_data in engine_list:
        engine_name = engine_data['name']
        engine = engines.engines.get(engine_name)
        if not engine:
            continue

        processor = get_processor(engine, engine_name)
        if processor is None:
            # check before initializing: there is nothing to initialize
            logger.error('Error get processor for engine %s', engine_name)
            continue

        initialize_processor(processor)
        PROCESSORS[engine_name] = processor
Beispiel #9
0
def load_engine(engine_data):
    """Load the engine module named in ``engine_data`` and configure it.

    Every key of ``engine_data`` except ``'engine'`` is copied onto the loaded
    module.  ``'categories'`` is a comma separated string that is split into a
    list of stripped names; the special value ``'none'`` gives an empty list.
    Attributes still missing afterwards are filled from
    ``engine_default_args``.

    The engine is appended to the module-level ``categories`` mapping for each
    of its categories and its shortcut is recorded in ``engine_shortcuts``.

    The process exits with status 1 when a public attribute of the engine is
    ``None`` or when the shortcut is already in use.

    :param engine_data: mapping with the engine settings
    :return: the configured engine module, or ``None`` when the module could
        not be loaded
    """
    engine_name = engine_data['engine']
    try:
        engine = load_module(engine_name + '.py')
    except Exception:
        # ``Exception`` instead of a bare except: KeyboardInterrupt and
        # SystemExit must not be swallowed here
        logger.exception('Cannot load engine "{}"'.format(engine_name))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                # a real list on every Python version; map() would be a
                # one-shot iterator on Python 3
                engine.categories = [
                    name.strip()
                    for name in engine_data['categories'].split(',')
                ]
            continue
        setattr(engine, param_name, engine_data[param_name])

    # ``items()`` exists on Python 2 and 3, ``iteritems()`` only on Python 2
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables: a public attribute left at None is fatal
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(
            engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Beispiel #10
0
def is_missing_required_attributes(engine):
    """Tell whether ``engine`` lacks a value for a required attribute.

    Every attribute whose name does not start with ``_`` (underline) is
    required, and a required attribute must not be ``None``.  Each offending
    attribute is logged as an error.

    :return: ``True`` if at least one required attribute is ``None``,
        otherwise ``False``
    """
    unset = [
        attr_name for attr_name in dir(engine)
        if not attr_name.startswith('_') and getattr(engine, attr_name) is None
    ]
    for attr_name in unset:
        logger.error('Missing engine config attribute: "{0}.{1}"'.format(
            engine.name, attr_name))
    return bool(unset)
Beispiel #11
0
def load_engine(engine_data):
    """Load the engine module named in ``engine_data`` and configure it.

    Every key of ``engine_data`` except ``'engine'`` is copied onto the loaded
    module.  ``'categories'`` is a comma separated string that is split into a
    list of stripped names; the special value ``'none'`` gives an empty list.
    Attributes still missing afterwards are filled from
    ``engine_default_args``.

    The engine is appended to the module-level ``categories`` mapping for each
    of its categories and its shortcut is recorded in ``engine_shortcuts``.

    The process exits with status 1 when a public attribute of the engine is
    ``None`` or when the shortcut is already in use.

    :param engine_data: mapping with the engine settings
    :return: the configured engine module, or ``None`` when the module could
        not be loaded
    """
    engine_name = engine_data['engine']
    try:
        engine = load_module(engine_name + '.py')
    except Exception:
        # ``Exception`` instead of a bare except: KeyboardInterrupt and
        # SystemExit must not be swallowed here
        logger.exception('Cannot load engine "{}"'.format(engine_name))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                # a real list on every Python version; map() would be a
                # one-shot iterator on Python 3
                engine.categories = [
                    name.strip()
                    for name in engine_data['categories'].split(',')
                ]
            continue
        setattr(engine, param_name, engine_data[param_name])

    # ``items()`` exists on Python 2 and 3, ``iteritems()`` only on Python 2
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables: a public attribute left at None is fatal
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'
                         .format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Beispiel #12
0
def register_engine(engine):
    """Register ``engine`` in the module-level lookup tables.

    The engine is stored in ``engines`` under its name, its shortcut is
    mapped to the name in ``engine_shortcuts`` and the engine is appended to
    the list of every category it belongs to in ``categories``.

    The process exits with status 1 when the name or the shortcut is already
    registered.
    """
    name = engine.name
    if name in engines:
        logger.error('Engine config error: ambigious name: {0}'.format(name))
        sys.exit(1)
    engines[name] = engine

    shortcut = engine.shortcut
    if shortcut in engine_shortcuts:
        logger.error(
            'Engine config error: ambigious shortcut: {0}'.format(shortcut))
        sys.exit(1)
    engine_shortcuts[shortcut] = engine.name

    for category in engine.categories:
        members = categories.setdefault(category, [])
        members.append(engine)
Beispiel #13
0
def parse_album(hit):
    """Build a result dict from an album ``hit``.

    ``url``, ``title`` and ``thumbnail`` are taken from ``hit['result']``.
    ``content`` is ``'Released: <year>'`` when a truthy release year is
    available, otherwise the empty string.  A failure while reading the year
    is logged and does not abort the parsing.
    """
    album = hit['result']
    result = {
        'url': album['url'],
        'title': album['full_title'],
        'thumbnail': album['cover_art_url'],
        'content': '',
    }

    try:
        year = album['release_date_components']['year']
    except Exception as e:  # pylint: disable=broad-except
        logger.error(e, exc_info=True)
        return result

    if year:
        result['content'] = 'Released: {}'.format(year)
    return result
Beispiel #14
0
def parse_lyric(hit):
    """Build a result dict from a lyric ``hit``.

    ``content`` is the value of the first highlight; when it cannot be read
    the failure is logged and ``content`` is the empty string.  The remaining
    fields come from ``hit['result']``.  ``publishedDate`` is only present
    when ``lyrics_updated_at`` is truthy; it is that timestamp converted with
    ``datetime.fromtimestamp``.
    """
    try:
        content = hit['highlights'][0]['value']
    except Exception as e:  # pylint: disable=broad-except
        logger.error(e, exc_info=True)
        content = ''

    song = hit['result']
    updated_at = song['lyrics_updated_at']
    result = {
        'url': song['url'],
        'title': song['full_title'],
        'content': content,
        'thumbnail': song['song_art_image_thumbnail_url'],
    }
    if updated_at:
        result['publishedDate'] = datetime.fromtimestamp(updated_at)
    return result
Beispiel #15
0
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Run one engine request without letting an exception escape.

    On success the results are added to ``result_container``, the engine
    timing statistics are updated and ``True`` is returned.

    On any ``Exception`` the engine's error counter is incremented, the
    failure is logged and ``False`` is returned.  For ``requests`` exceptions
    the engine is additionally suspended for ``min(60, continuous_errors)``
    seconds.
    """
    engine = engines[engine_name]

    try:
        # send requests, parse the results and hand them to the container
        results = search_one_request(engine, query, request_params, start_time, timeout_limit)
        result_container.extend(engine_name, results)

        # NOTE(review): threading.RLock() creates a new lock object on each
        # call, so this block is presumably not excluding other threads.
        with threading.RLock():
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1

        return True

    except Exception as e:
        engine.stats['errors'] += 1
        elapsed = time() - start_time

        timed_out = isinstance(e, requests.exceptions.Timeout)
        request_failed = timed_out or isinstance(e, requests.exceptions.RequestException)

        if timed_out:
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout"
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, elapsed, timeout_limit, e.__class__.__name__))
        elif request_failed:
            # any other requests exception
            logger.exception("engine {0} : requests exception"
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, elapsed, timeout_limit, e))
        else:
            # everything that is not a requests exception
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))

        # only requests exceptions count towards the suspension of the engine
        if request_failed:
            with threading.RLock():
                engine.continuous_errors += 1
                engine.suspend_end_time = time() + min(60, engine.continuous_errors)

        return False
Beispiel #16
0
    def search(self, query, params, result_container, start_time,
               timeout_limit):
        """Run the engine search and turn every failure into a log entry.

        The per-thread network state (timeout, HTTP total time, network name)
        is set up first.  The results of ``_search_basic`` are passed to
        ``extend_container``.  Every exception is forwarded to
        ``handle_exception`` and logged; all handled cases except the final
        catch-all pass ``suspend=True``.
        """
        network = searx.network
        # set timeout for all HTTP requests
        network.set_timeout_for_thread(timeout_limit, start_time=start_time)
        # reset the HTTP total time
        network.reset_time_for_thread()
        # set the network
        network.set_context_network_name(self.engine_name)

        try:
            # send requests and parse the results
            found = self._search_basic(query, params)
            self.extend_container(result_container, start_time, found)
        except (httpx.TimeoutException, asyncio.TimeoutError) as e:
            # requests timeout (connect or read)
            self.handle_exception(result_container, e, suspend=True)
            elapsed = default_timer() - start_time
            logger.error(
                "engine {0} : HTTP requests timeout"
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    self.engine_name, elapsed, timeout_limit,
                    e.__class__.__name__))
        except (httpx.HTTPError, httpx.StreamError) as e:
            # other requests exception
            self.handle_exception(result_container, e, suspend=True)
            elapsed = default_timer() - start_time
            logger.exception(
                "engine {0} : requests exception"
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    self.engine_name, elapsed, timeout_limit, e))
        except SearxEngineCaptchaException as e:
            self.handle_exception(result_container, e, suspend=True)
            message = 'engine {0} : CAPTCHA'.format(self.engine_name)
            logger.exception(message)
        except SearxEngineTooManyRequestsException as e:
            self.handle_exception(result_container, e, suspend=True)
            message = 'engine {0} : Too many requests'.format(self.engine_name)
            logger.exception(message)
        except SearxEngineAccessDeniedException as e:
            self.handle_exception(result_container, e, suspend=True)
            message = 'engine {0} : Searx is blocked'.format(self.engine_name)
            logger.exception(message)
        except Exception as e:  # pylint: disable=broad-except
            # anything unexpected: handled without the suspend flag
            self.handle_exception(result_container, e)
            message = 'engine {0} : exception : {1}'.format(
                self.engine_name, e)
            logger.exception(message)
Beispiel #17
0
def _is_url_image(image_url):
    if not isinstance(image_url, str):
        return False

    if image_url.startswith('//'):
        image_url = 'https:' + image_url

    if image_url.startswith('data:'):
        return image_url.startswith('data:image/')

    if not _is_url(image_url):
        return False

    retry = 2

    while retry > 0:
        a = time()
        try:
            network.set_timeout_for_thread(10.0, time())
            r = network.get(
                image_url,
                timeout=10.0,
                allow_redirects=True,
                headers={
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'DNT': '1',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-GPC': '1',
                    'Cache-Control': 'max-age=0'
                })
            if r.headers["content-type"].startswith('image/'):
                return True
            return False
        except httpx.TimeoutException:
            logger.error('Timeout for %s: %i', image_url, int(time() - a))
            retry -= 1
        except httpx.HTTPError:
            logger.exception('Exception for %s', image_url)
            return False
Beispiel #18
0
def load_engine(engine_data):
    """Load the engine module named in ``engine_data`` and configure it.

    Every key of ``engine_data`` except ``"engine"`` is copied onto the loaded
    module.  ``"categories"`` is a comma separated string that is split into a
    list of stripped names; the special value ``"none"`` gives an empty list.
    Attributes still missing afterwards are filled from
    ``engine_default_args``.

    The engine is appended to the module-level ``categories`` mapping for each
    of its categories and its shortcut is recorded in ``engine_shortcuts``.

    The process exits with status 1 when the engine name contains an
    underscore, when a public attribute of the engine is ``None`` or when the
    shortcut is already in use.

    :param engine_data: mapping with the engine settings
    :return: the configured engine module, or ``None`` when the module could
        not be loaded
    """

    if "_" in engine_data["name"]:
        logger.error('Engine name conains underscore: "{}"'.format(engine_data["name"]))
        sys.exit(1)

    engine_module = engine_data["engine"]

    try:
        engine = load_module(engine_module + ".py")
    except Exception:
        # ``Exception`` instead of a bare except: KeyboardInterrupt and
        # SystemExit must not be swallowed here
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == "engine":
            continue
        if param_name == "categories":
            if engine_data["categories"] == "none":
                engine.categories = []
            else:
                # a real list on every Python version; map() would be a
                # one-shot iterator on Python 3
                engine.categories = [
                    name.strip() for name in engine_data["categories"].split(",")
                ]
            continue
        setattr(engine, param_name, engine_data[param_name])

    # ``items()`` exists on Python 2 and 3, ``iteritems()`` only on Python 2
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables: a public attribute left at None is fatal
    for engine_attr in dir(engine):
        if engine_attr.startswith("_"):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {"result_count": 0, "search_count": 0, "page_load_time": 0, "score_count": 0, "errors": 0}

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error("Engine config error: ambigious shortcut: {0}".format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Beispiel #19
0
def search_one_http_request_safe(
    engine_name, query, request_params, result_container, start_time, timeout_limit
):
    """Run one HTTP engine request and keep the engine bookkeeping up to date.

    No exception escapes from the request itself: every failure is recorded
    with ``record_exception``, reported to ``result_container`` as an
    unresponsive engine and logged.

    After the request the suspend state of the engine is updated:

    - a ``requests`` exception, or an exception providing a truthy
      ``suspended_time``, increments ``continuous_errors`` and sets
      ``suspend_end_time``
    - in every other case both values are reset to 0

    The function returns ``None``.
    """
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False
    suspended_time = None

    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time, page_load_time)
            # NOTE(review): threading.RLock() creates a new lock object on
            # each call, so these blocks presumably do not exclude other
            # threads; a shared lock would be needed for that.
            with threading.RLock():
                engine.stats["engine_time"] += engine_time
                engine.stats["engine_time_count"] += 1
                # update stats with the total HTTP time
                engine.stats["page_load_time"] += page_load_time
                engine.stats["page_load_count"] += 1
    except Exception as e:
        record_exception(engine_name, e)

        # Timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # Record the errors
        with threading.RLock():
            engine.stats["errors"] += 1

        if isinstance(e, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(engine_name, "HTTP timeout")
            # requests timeout (connect or read)
            logger.error(
                "engine {0} : HTTP requests timeout"
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, engine_time, timeout_limit, e.__class__.__name__
                )
            )
            requests_exception = True
        elif isinstance(e, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(engine_name, "HTTP error")
            # other requests exception
            logger.exception(
                "engine {0} : requests exception"
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, engine_time, timeout_limit, e
                )
            )
            requests_exception = True
        elif isinstance(e, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(engine_name, "CAPTCHA required")
            # the placeholder has to be filled, otherwise "{0}" is logged
            logger.exception("engine {0} : CAPTCHA".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif isinstance(e, SearxEngineTooManyRequestsException):
            result_container.add_unresponsive_engine(engine_name, "too many requests")
            logger.exception("engine {0} : Too many requests".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        elif isinstance(e, SearxEngineAccessDeniedException):
            result_container.add_unresponsive_engine(engine_name, "blocked")
            logger.exception("engine {0} : Searx is blocked".format(engine_name))
            suspended_time = e.suspended_time  # pylint: disable=no-member
        else:
            result_container.add_unresponsive_engine(engine_name, "unexpected crash")
            # others errors
            logger.exception("engine {0} : exception : {1}".format(engine_name, e))
    else:
        # no exception, but the thread has been flagged with ``_timeout``
        if getattr(threading.current_thread(), "_timeout", False):
            record_error(engine_name, "Timeout")

    # suspend the engine if there is an HTTP error
    # or suspended_time is defined
    with threading.RLock():
        if requests_exception or suspended_time:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            if suspended_time is None:
                # ban time grows with the number of consecutive errors, capped
                suspended_time = min(
                    settings["search"]["max_ban_time_on_fail"],
                    engine.continuous_errors * settings["search"]["ban_time_on_fail"],
                )
            engine.suspend_end_time = time() + suspended_time
        else:
            # reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
Beispiel #20
0
    def search(self, query, params, result_container, start_time,
               timeout_limit):
        """Run the engine search and keep the engine bookkeeping up to date.

        No exception escapes from the request itself: every failure is
        recorded with ``record_exception``, reported to ``result_container``
        as an unresponsive engine and logged.

        After the request the suspend state of the engine is updated: a
        ``requests`` exception, or an exception providing a truthy
        ``suspended_time``, increments ``continuous_errors`` and sets
        ``suspend_end_time``; in every other case both values are reset to 0.
        """
        # set timeout for all HTTP requests
        poolrequests.set_timeout_for_thread(timeout_limit,
                                            start_time=start_time)
        # reset the HTTP total time
        poolrequests.reset_time_for_thread()
        # enable HTTP only if explicitly enabled
        engine = self.engine
        poolrequests.set_enable_http_protocol(engine.enable_http)

        # suppose everything will be alright
        http_failure = False
        ban_seconds = None

        try:
            # send requests and parse the results
            found = self._search_basic(query, params)

            # ``None`` means the engine did not accept the request
            if found is not None:
                result_container.extend(self.engine_name, found)

                # update engine time when there is no exception
                engine_time = time() - start_time
                page_load_time = poolrequests.get_time_for_thread()
                result_container.add_timing(self.engine_name, engine_time,
                                            page_load_time)
                # NOTE(review): threading.RLock() creates a new lock object
                # on each call, so these blocks presumably do not exclude
                # other threads.
                with threading.RLock():
                    stats = engine.stats
                    stats['engine_time'] += engine_time
                    stats['engine_time_count'] += 1
                    # update stats with the total HTTP time
                    stats['page_load_time'] += page_load_time
                    stats['page_load_count'] += 1
        except Exception as e:
            record_exception(self.engine_name, e)

            # Timing
            engine_time = time() - start_time
            page_load_time = poolrequests.get_time_for_thread()
            result_container.add_timing(self.engine_name, engine_time,
                                        page_load_time)

            # Record the errors
            with threading.RLock():
                engine.stats['errors'] += 1

            if isinstance(e, requests.exceptions.Timeout):
                # requests timeout (connect or read)
                result_container.add_unresponsive_engine(
                    self.engine_name, 'HTTP timeout')
                logger.error(
                    "engine {0} : HTTP requests timeout"
                    "(search duration : {1} s, timeout: {2} s) : {3}".format(
                        self.engine_name, engine_time, timeout_limit,
                        e.__class__.__name__))
                http_failure = True
            elif isinstance(e, requests.exceptions.RequestException):
                # other requests exception
                result_container.add_unresponsive_engine(
                    self.engine_name, 'HTTP error')
                logger.exception(
                    "engine {0} : requests exception"
                    "(search duration : {1} s, timeout: {2} s) : {3}".format(
                        self.engine_name, engine_time, timeout_limit, e))
                http_failure = True
            elif isinstance(e, SearxEngineCaptchaException):
                result_container.add_unresponsive_engine(
                    self.engine_name, 'CAPTCHA required')
                logger.exception(
                    'engine {0} : CAPTCHA'.format(self.engine_name))
                ban_seconds = e.suspended_time  # pylint: disable=no-member
            elif isinstance(e, SearxEngineTooManyRequestsException):
                result_container.add_unresponsive_engine(
                    self.engine_name, 'too many requests')
                logger.exception(
                    'engine {0} : Too many requests'.format(self.engine_name))
                ban_seconds = e.suspended_time  # pylint: disable=no-member
            elif isinstance(e, SearxEngineAccessDeniedException):
                result_container.add_unresponsive_engine(
                    self.engine_name, 'blocked')
                logger.exception(
                    'engine {0} : Searx is blocked'.format(self.engine_name))
                ban_seconds = e.suspended_time  # pylint: disable=no-member
            else:
                # others errors
                result_container.add_unresponsive_engine(
                    self.engine_name, 'unexpected crash')
                logger.exception('engine {0} : exception : {1}'.format(
                    self.engine_name, e))
        else:
            # no exception, but the thread has been flagged with ``_timeout``
            if getattr(threading.current_thread(), '_timeout', False):
                record_error(self.engine_name, 'Timeout')

        # suspend the engine if there is an HTTP error
        # or a suspended time is defined
        with threading.RLock():
            if not (http_failure or ban_seconds):
                # reset the suspend variables
                engine.continuous_errors = 0
                engine.suspend_end_time = 0
            else:
                # update continuous_errors / suspend_end_time
                engine.continuous_errors += 1
                if ban_seconds is None:
                    # ban time grows with consecutive errors, capped
                    search_settings = settings['search']
                    ban_seconds = min(
                        search_settings['max_ban_time_on_fail'],
                        engine.continuous_errors *
                        search_settings['ban_time_on_fail'])
                engine.suspend_end_time = time() + ban_seconds
Beispiel #21
0
            engine['percentage'] = 0

    for engine in errors:
        if max_errors:
            engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
        else:
            engine['percentage'] = 0

    return [
        (gettext('Page loads (sec)'), sorted(pageloads,
                                             key=itemgetter('avg'))),
        (gettext('Number of results'),
         sorted(results, key=itemgetter('avg'), reverse=True)),
        (gettext('Scores'), sorted(scores, key=itemgetter('avg'),
                                   reverse=True)),
        (gettext('Scores per result'),
         sorted(scores_per_result, key=itemgetter('avg'), reverse=True)),
        (gettext('Errors'), sorted(errors, key=itemgetter('avg'),
                                   reverse=True)),
    ]


# Abort when the settings declare no engine at all (missing or empty list).
if not ('engines' in settings and settings['engines']):
    logger.error('No engines found. Edit your settings.yml')
    exit(2)

# Load every configured engine; ``load_engine`` returns ``None`` for engines
# that are to be left out, those are skipped.
for engine_data in settings['engines']:
    engine = load_engine(engine_data)
    if engine is None:
        continue
    engines[engine.name] = engine
Beispiel #22
0
def response(resp):
    """Parse a Google web-search response into a list of result dicts.

    The list can hold, in this order: one ``answer`` entry, one
    ``number_of_results`` entry, one ``url``/``title``/``content`` entry per
    regular result, then ``suggestion`` and ``correction`` entries.

    Raises ``RuntimeWarning`` when the response URL points at Google's
    "sorry" page.
    """
    # detect google sorry
    target = urlparse(resp.url)
    if target.netloc == 'sorry.google.com' or target.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')
    if target.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_parts = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if not answer_parts:
        logger.debug("did not found 'answer'")
    else:
        results.append({'answer': ' '.join(answer_parts)})

    # results --> number_of_results
    # Keep only the digits of the first "result-stats" text node; any failure
    # (missing node, no digits) is logged and otherwise ignored.
    try:
        stats_text = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0]
        digits = ''.join(char for char in stats_text if char.isdigit())
        results.append({'number_of_results': int(digits)})
    except Exception as exc:  # pylint: disable=broad-except
        logger.debug("did not 'number_of_results'")
        logger.error(exc, exc_info=True)

    # parse results
    for node in eval_xpath(dom, results_xpath):

        # google *sections* are not regular results
        if extract_text(eval_xpath(node, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_nodes = eval_xpath(node, title_xpath)
            if not title_nodes:
                # without a title this is not one of the common result sections
                logger.debug('ingoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_nodes[0])
            url = eval_xpath(node, href_xpath)[0]
            content = extract_text_from_dom(node, content_xpath)
        except Exception as exc:  # pylint: disable=broad-except
            # a broken result must not discard the remaining ones
            logger.error(exc, exc_info=True)
            continue

        results.append({'url': url, 'title': title, 'content': content})

    # parse suggestion
    results.extend(
        {'suggestion': extract_text(node)}
        for node in eval_xpath(dom, suggestion_xpath)
    )

    # parse spelling corrections
    results.extend(
        {'correction': extract_text(node)}
        for node in eval_xpath(dom, spelling_suggestion_xpath)
    )

    return results
Beispiel #23
0
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Run one engine request and absorb every exception it raises.

    Results are pushed into ``result_container``. On failure the engine is
    reported as unresponsive and its error counter is incremented. Failures
    raised by the ``requests`` library additionally suspend the engine for
    ``min(60, continuous_errors)`` seconds; any other outcome clears the
    suspension. Nothing is returned.
    """
    # one shared deadline for every HTTP request issued by this thread
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # restart the per-thread accounting of HTTP time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # becomes True only when the failure comes from the requests library
    http_failure = False

    # NOTE(review): each `with threading.RLock():` below creates a brand-new
    # lock object, so these sections are not mutually exclusive across threads.
    try:
        # send requests and parse the results
        found = search_one_request(engine, query, request_params)

        # add results
        result_container.extend(engine_name, found)

        # timing statistics are only updated for successful requests
        with threading.RLock():
            stats = engine.stats
            stats['engine_time'] += time() - start_time
            stats['engine_time_count'] += 1
            # total HTTP time spent by this thread
            stats['page_load_time'] += requests_lib.get_time_for_thread()
            stats['page_load_count'] += 1

    except Exception as exc:
        elapsed = time() - start_time

        with threading.RLock():
            engine.stats['errors'] += 1

        if isinstance(exc, requests.exceptions.Timeout):
            # requests timeout (connect or read)
            result_container.add_unresponsive_engine((engine_name, gettext('timeout')))
            logger.error("engine {0} : HTTP requests timeout"
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, elapsed, timeout_limit, exc.__class__.__name__))
            http_failure = True
        elif isinstance(exc, requests.exceptions.RequestException):
            # any other requests exception
            result_container.add_unresponsive_engine((engine_name, gettext('request exception')))
            logger.exception("engine {0} : requests exception"
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, elapsed, timeout_limit, exc))
            http_failure = True
        else:
            # the engine code itself failed
            crash_message = u'{0}: {1}'.format(gettext('unexpected crash'), exc)
            result_container.add_unresponsive_engine((engine_name, crash_message))
            logger.exception('engine {0} : exception : {1}'.format(engine_name, exc))

    # suspend or not the engine if there are HTTP errors
    with threading.RLock():
        if not http_failure:
            # no HTTP error (perhaps an engine error): reset the suspension
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
        else:
            # one more consecutive failure; the ban grows up to 60 seconds
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60, engine.continuous_errors)
Beispiel #24
0
from searx.autocomplete import searx_bang, backends as autocomplete_backends
from searx.plugins import plugins
from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
from searx.answerers import answerers
from searx.poolrequests import get_global_proxies
from searx.metrology.error_recorder import errors_per_engines


# serve pages with the HTTP protocol version configured in
# settings['server']['http_protocol_version'] ('1.0' when the key is unset)
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))

# check secret_key: outside debug mode, refuse to start while the secret key
# still has the placeholder value 'ultrasecretkey'
if not searx_debug and settings['server']['secret_key'] == 'ultrasecretkey':
    logger.error('server.secret_key is not changed. Please use something else instead of ultrasecretkey.')
    exit(1)

# about static: resolve the static directory from settings['ui']['static_path']
# and pass it to get_static_files()
static_path = get_resources_directory(searx_dir, 'static', settings['ui']['static_path'])
logger.debug('static directory is %s', static_path)
static_files = get_static_files(static_path)

# about templates: resolve the templates directory from
# settings['ui']['templates_path'], then derive the themes and result
# templates from it
default_theme = settings['ui']['default_theme']
templates_path = get_resources_directory(searx_dir, 'templates', settings['ui']['templates_path'])
logger.debug('templates directory is %s', templates_path)
themes = get_themes(templates_path)
result_templates = get_result_templates(templates_path)
global_favicons = []
for indice, theme in enumerate(themes):
Beispiel #25
0
def response(resp):
    """Get response from google's search request.

    Parses the HTML in ``resp.text`` and returns a list of dicts holding, in
    this order: an optional ``answer``, an optional ``number_of_results``, one
    ``url``/``title``/``content`` entry per regular result, then
    ``suggestion`` and ``correction`` entries.

    ``detect_google_sorry(resp)`` is called first and may raise.
    """

    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not found 'answer'")

    # results --> number_of_results
    # This runs whether or not an answer box was found: the result count is
    # independent of the answer and must not be skipped when one exists.
    try:
        _txt = eval_xpath_getindex(dom,
                                   '//div[@id="result-stats"]//text()', 0)
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})
    except Exception as e:  # pylint: disable=broad-except
        # best effort: a missing or unparsable counter is only logged
        logger.debug("did not 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this not one of the common google results *section*
                logger.debug(
                    'ingoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0)
            # content is optional: a missing node yields None instead of raising
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            # a broken result must not discard the remaining ones
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # parse spelling corrections
    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Beispiel #26
0
def load_engine(engine_data):
    """Load and configure one engine from its settings entry.

    ``engine_data`` is a mapping with at least the keys ``name`` and
    ``engine`` (the module name, without ``.py``); every other key is copied
    onto the loaded engine as an attribute.

    Returns the configured engine, or ``None`` when the module cannot be
    loaded or the engine is flagged ``inactive``. Calls ``sys.exit(1)`` when
    the name contains an underscore, when a public attribute is ``None``, or
    when the shortcut is already taken.
    """

    if '_' in engine_data['name']:
        logger.error('Engine name conains underscore: "{}"'.format(engine_data['name']))
        sys.exit(1)

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except Exception:
        # Only ordinary errors are absorbed here; KeyboardInterrupt and
        # SystemExit must propagate instead of being mistaken for a broken
        # engine module.
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    # copy the settings onto the engine ('engine' itself is only the module name)
    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            # 'none' means no category, otherwise a comma separated list
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    # fill in the defaults for everything the engine did not define
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'
                         .format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(engine, 'fetch_supported_languages',
                lambda: engine._fetch_supported_languages(get(engine.supported_languages_url)))

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'page_load_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Beispiel #27
0
def response(resp):
    """Turn a Google image-search response into a list of image results.

    Every entry uses the ``images.html`` template and carries ``url``,
    ``title``, ``content``, ``source``, ``img_src`` and ``thumbnail_src``.
    An empty list is returned when the ``islmp`` root element is missing.

    Page layout this parser relies on::

        <div id="islmp" ..>                          root element
          <div jsmodel=".." data-id="...">           one div per image
            <a><img class="rg_i .." data-iid="0"></a>    thumbnail
            <a class=".." href="https://...">            target link
              <div><div>description</div><div>publisher</div></div>
            </a>

    ``data-iid`` is looked up in the map built by ``scrap_out_thumbs``;
    ``data-id`` is the key handed to ``scrap_img_by_id`` together with the
    text of the second ``AF_initDataCallback`` script to find the original
    image URL.
    """
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath_getindex(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

    root_nodes = eval_xpath(dom, '//div[@id="islmp"]')
    if not root_nodes:
        logger.error("did not find root element id='islmp'")
        return results

    for img_node in eval_xpath_list(root_nodes[0], './/img[contains(@class, "rg_i")]'):

        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

        # thumbnail: the base64 map if the node has an id, else @src / @data-src
        base64_ids = eval_xpath(img_node, '@data-iid')
        if base64_ids:
            thumbnail_src = img_bas64_map[base64_ids[0]]
        else:
            candidates = eval_xpath(img_node, '@src') or eval_xpath(img_node, '@data-src')
            thumbnail_src = candidates[0] if candidates else ''

        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
        url = eval_xpath_getindex(link_node, '@href', 0)

        # description / publisher default to the alt text and an empty source
        pub_nodes = eval_xpath(link_node, './div/div')
        if pub_nodes:
            pub_descr = extract_text(pub_nodes[0])
            pub_source = extract_text(pub_nodes[1])
        else:
            pub_descr, pub_source = img_alt, ''

        # original image URL, falling back to the thumbnail
        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
        src_url = scrap_img_by_id(img_src_script, img_src_id) or thumbnail_src

        results.append({
            'url': url,
            'title': img_alt,
            'content': pub_descr,
            'source': pub_source,
            'img_src': src_url,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    return results
Beispiel #28
0
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Execute a single engine request without letting exceptions escape.

    On success the results are added to ``result_container`` and the
    engine's timing statistics are updated. On failure the engine is
    registered as unresponsive and its ``errors`` counter grows. Only
    failures from the ``requests`` library suspend the engine, for
    ``min(60, continuous_errors)`` seconds; every other outcome resets the
    suspension. The function returns nothing.
    """
    # set the deadline shared by all HTTP requests of this thread
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # start the HTTP time accounting from zero
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # stays False unless the requests library raised
    had_requests_error = False

    # NOTE(review): `threading.RLock()` is instantiated at each `with`, so the
    # lock is never shared and does not serialise concurrent updates.
    try:
        # send requests and parse the results
        engine_results = search_one_request(engine, query, request_params)

        # add results
        result_container.extend(engine_name, engine_results)

        # statistics are updated only when nothing went wrong
        with threading.RLock():
            counters = engine.stats
            counters['engine_time'] += time() - start_time
            counters['engine_time_count'] += 1
            # add the total HTTP time of this thread
            counters['page_load_time'] += requests_lib.get_time_for_thread()
            counters['page_load_count'] += 1

    except Exception as error:
        duration = time() - start_time

        with threading.RLock():
            engine.stats['errors'] += 1

        if isinstance(error, requests.exceptions.Timeout):
            # connect or read timeout
            result_container.add_unresponsive_engine((engine_name, gettext('timeout')))
            logger.error("engine {0} : HTTP requests timeout"
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, duration, timeout_limit, error.__class__.__name__))
            had_requests_error = True
        elif isinstance(error, requests.exceptions.RequestException):
            # remaining failures of the requests library
            result_container.add_unresponsive_engine((engine_name, gettext('request exception')))
            logger.exception("engine {0} : requests exception"
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, duration, timeout_limit, error))
            had_requests_error = True
        else:
            # anything else is treated as a crash of the engine code
            reason = u'{0}: {1}'.format(gettext('unexpected crash'), error)
            result_container.add_unresponsive_engine((engine_name, reason))
            logger.exception('engine {0} : exception : {1}'.format(engine_name, error))

    # suspend or not the engine if there are HTTP errors
    with threading.RLock():
        if not had_requests_error:
            # no HTTP error (perhaps an engine error): clear the suspension
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
        else:
            # another consecutive failure; suspension is capped at 60 seconds
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60, engine.continuous_errors)
Beispiel #29
0
def load_engine(engine_data):
    """Load and configure one engine from its settings entry.

    ``engine_data`` is a mapping with at least the keys ``name`` and
    ``engine`` (the module name, without ``.py``); every other key is copied
    onto the loaded engine as an attribute. A name that is not lowercase is
    lowercased, and ``engine_data['name']`` is updated in place.

    Returns the configured engine, or ``None`` when loading fails with a
    non-fatal error or the engine is flagged ``inactive``. Calls
    ``sys.exit(1)`` when the name contains an underscore, when loading raises
    one of the exceptions listed as fatal below, when a public attribute is
    ``None``, or when the shortcut is already taken.
    """
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error(
            'Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        # Logger.warn is a deprecated alias that newer Python releases no
        # longer provide; warning() is the supported spelling.
        logger.warning(
            'Engine name is not lowercase: "{}", converting to lowercase'.
            format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError,
            ImportError, RuntimeError):
        # these make the whole start-up fail
        logger.exception(
            'Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except Exception:
        # any other error only disables this engine
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    # copy the settings onto the engine ('engine' itself is only the module name)
    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            # 'none' means no category, otherwise a comma separated list
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = list(
                    map(str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    # fill in the defaults for everything the engine did not define
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        # extend the aliases the engine already declares, if any
        language_aliases = getattr(engine, 'language_aliases', {})
        supported_languages = engine.supported_languages

        for engine_lang in supported_languages:
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            # alias only when the matched code really differs from the
            # engine's own code and is not itself supported by the engine
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in supported_languages:
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(
            engine, 'fetch_supported_languages',
            lambda: engine._fetch_supported_languages(
                get(engine.supported_languages_url)))

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    # page load statistics are only kept for engines that are not offline
    if not engine.offline:
        engine.stats['page_load_time'] = 0
        engine.stats['page_load_count'] = 0

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(
            engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Beispiel #30
0
def search_one_http_request_safe(engine_name, query, request_params,
                                 result_container, start_time, timeout_limit):
    """Run one HTTP engine request and absorb every exception it raises.

    When ``search_one_http_request`` returns results (not ``None``) they are
    added to ``result_container`` together with the timings, and the engine
    statistics are updated. On an exception the error is recorded, the
    timings are still reported, and the engine is registered as unresponsive.

    Only failures from the ``requests`` library suspend the engine, for
    ``continuous_errors * ban_time_on_fail`` seconds capped at
    ``max_ban_time_on_fail`` (both read from ``settings['search']``); every
    other outcome resets the suspension. Nothing is returned.
    """
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False

    # NOTE(review): each `with threading.RLock():` below creates a brand-new
    # lock object, so these sections are not mutually exclusive across threads.
    try:
        # send requests and parse the results
        search_results = search_one_http_request(engine, query, request_params)

        # check if the engine accepted the request
        if search_results is not None:
            # yes, so add results
            result_container.extend(engine_name, search_results)

            # update engine time when there is no exception
            engine_time = time() - start_time
            page_load_time = requests_lib.get_time_for_thread()
            result_container.add_timing(engine_name, engine_time,
                                        page_load_time)
            with threading.RLock():
                engine.stats['engine_time'] += engine_time
                engine.stats['engine_time_count'] += 1
                # update stats with the total HTTP time
                engine.stats['page_load_time'] += page_load_time
                engine.stats['page_load_count'] += 1
    except Exception as e:
        record_exception(engine_name, e)

        # Timing
        engine_time = time() - start_time
        page_load_time = requests_lib.get_time_for_thread()
        result_container.add_timing(engine_name, engine_time, page_load_time)

        # Record the errors
        with threading.RLock():
            engine.stats['errors'] += 1

        if isinstance(e, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine(engine_name,
                                                     'HTTP timeout')
            # requests timeout (connect or read)
            logger.error(
                "engine {0} : HTTP requests timeout"
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, engine_time, timeout_limit,
                    e.__class__.__name__))
            requests_exception = True
        elif isinstance(e, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine(engine_name, 'HTTP error')
            # other requests exception
            logger.exception(
                "engine {0} : requests exception"
                "(search duration : {1} s, timeout: {2} s) : {3}".format(
                    engine_name, engine_time, timeout_limit, e))
            requests_exception = True
        elif isinstance(e, SearxEngineCaptchaException):
            result_container.add_unresponsive_engine(engine_name,
                                                     'CAPTCHA required')
            # the placeholder has to be filled in, otherwise the log shows a
            # literal "{0}" instead of the engine name
            logger.exception('engine {0} : CAPTCHA'.format(engine_name))
        else:
            result_container.add_unresponsive_engine(engine_name,
                                                     'unexpected crash')
            # others errors
            logger.exception('engine {0} : exception : {1}'.format(
                engine_name, e))
    else:
        # no exception, but the current thread carries a truthy `_timeout`
        # attribute: record it as a timeout
        if getattr(threading.current_thread(), '_timeout', False):
            record_error(engine_name, 'Timeout')

    # suspend or not the engine if there are HTTP errors
    with threading.RLock():
        if requests_exception:
            # update continuous_errors / suspend_end_time
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(
                settings['search']['max_ban_time_on_fail'],
                engine.continuous_errors *
                settings['search']['ban_time_on_fail'])
        else:
            # no HTTP error (perhaps an engine error)
            # anyway, reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
Beispiel #31
0
def response(resp):
    """Turn a Google image-search response into a list of image results.

    Every entry uses the ``images.html`` template and carries ``url``,
    ``title``, ``content``, ``source``, ``img_src``, ``img_format`` (integer
    ``width`` / ``height``) and ``thumbnail_src``. An image that cannot be
    parsed is logged and skipped. An empty list is returned when the
    ``islmp`` root element is missing.

    Raises ``SearxEngineCaptchaException`` when the response URL points at
    Google's "sorry" page.

    Page layout this parser relies on::

        <div id="islmp" ..>                          root element
          <div jsmodel=".." data-id="...">           one div per image
            <a><img class="rg_i .." data-iid="0"></a>    thumbnail
            <a class=".." href="https://...">            target link
              <div><div>description</div><div>publisher</div></div>
            </a>

    ``data-iid`` is looked up in the map built by ``scrap_out_thumbs``;
    ``data-id`` is the key handed to ``scrap_img_by_id`` together with the
    text of the second ``AF_initDataCallback`` script to find the original
    image URL.
    """
    results = []

    # detect google sorry: the dedicated host or any path below /sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == "sorry.google.com" or resp_url.path.startswith("/sorry"):
        raise SearxEngineCaptchaException()

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text

    root_nodes = eval_xpath(dom, '//div[@id="islmp"]')
    if not root_nodes:
        logger.error("did not find root element id='islmp'")
        return results

    for img_node in eval_xpath(root_nodes[0], './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, "@alt")[0]

            # thumbnail: the base64 map if the node has an id, else @src / @data-src
            base64_ids = eval_xpath(img_node, "@data-iid")
            if base64_ids:
                thumbnail_src = img_bas64_map[base64_ids[0]]
            else:
                candidates = eval_xpath(img_node, "@src") or eval_xpath(img_node, "@data-src")
                thumbnail_src = candidates[0] if candidates else ""

            link_node = eval_xpath(img_node, "../../../a[2]")[0]
            url = eval_xpath(link_node, "@href")[0]

            # description / publisher default to the alt text and an empty source
            pub_nodes = eval_xpath(link_node, "./div/div")
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])
            else:
                pub_descr, pub_source = img_alt, ""

            # original image URL, falling back to the thumbnail
            img_src_id = eval_xpath(img_node, "../../../@data-id")[0]
            src_url = scrap_img_by_id(img_src_script, img_src_id) or thumbnail_src

            results.append({
                "url": url,
                "title": img_alt,
                "content": pub_descr,
                "source": pub_source,
                "img_src": src_url,
                "img_format": {
                    "width": int(eval_xpath(img_node, "@width")[0]),
                    "height": int(eval_xpath(img_node, "@height")[0]),
                },
                "thumbnail_src": thumbnail_src,
                "template": "images.html",
            })
        except Exception as exc:  # pylint: disable=broad-except
            # a broken image node must not discard the remaining ones
            logger.error(exc, exc_info=True)
            continue

    return results
Beispiel #32
0
    for engine in scores_per_result:
        if max_score_per_result:
            engine["percentage"] = int(engine["avg"] / max_score_per_result * 100)
        else:
            engine["percentage"] = 0

    for engine in errors:
        if max_errors:
            engine["percentage"] = int(float(engine["avg"]) / max_errors * 100)
        else:
            engine["percentage"] = 0

    return [
        (gettext("Page loads (sec)"), sorted(pageloads, key=itemgetter("avg"))),
        (gettext("Number of results"), sorted(results, key=itemgetter("avg"), reverse=True)),
        (gettext("Scores"), sorted(scores, key=itemgetter("avg"), reverse=True)),
        (gettext("Scores per result"), sorted(scores_per_result, key=itemgetter("avg"), reverse=True)),
        (gettext("Errors"), sorted(errors, key=itemgetter("avg"), reverse=True)),
    ]


# Refuse to start when the configuration defines no engines at all
# (the "engines" key is missing or its value is empty).
# NOTE(review): `exit` is the helper injected by the `site` module; `sys.exit`
# is the usual choice in programs -- confirm `sys` is imported before switching.
if "engines" not in settings or not settings["engines"]:
    logger.error("No engines found. Edit your settings.yml")
    exit(2)

# Load each configured engine and register it under its name; entries for
# which load_engine() returns None are left out of the registry.
for engine_data in settings["engines"]:
    engine = load_engine(engine_data)
    if engine is not None:
        engines[engine.name] = engine
Beispiel #33
0
def load_engine(engine_data):
    """Load, configure and register the engine described by ``engine_data``.

    ``engine_data`` is one engine entry of the settings.  It must contain the
    keys ``name`` and ``engine``; every other key is copied onto the loaded
    module as an attribute (``categories`` is split on commas, the literal
    value ``'none'`` meaning "no category").  A name that is not lowercase is
    lowercased and written back into ``engine_data``.

    Returns the configured engine module, or ``None`` when the engine is
    skipped: the module could not be loaded, it is flagged ``inactive``, or it
    belongs to the ``onions`` category while no tor proxy is configured.

    Calls ``sys.exit(1)`` on configuration errors: an underscore in the engine
    name, a fatal error while loading the module, a public attribute that is
    ``None``, or a shortcut that is already registered.

    Side effects: the engine is appended to ``categories`` for each of its
    categories and its shortcut is recorded in ``engine_shortcuts``.
    """
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error(
            'Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        # Logger.warn() is a deprecated alias of Logger.warning()
        logger.warning(
            'Engine name is not lowercase: "{}", converting to lowercase'.
            format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError,
            ImportError, RuntimeError):
        logger.exception(
            'Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except Exception:  # pylint: disable=broad-except
        # any other failure only disables this one engine
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    # copy the settings onto the module
    for param_name, param_value in engine_data.items():
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if param_value == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip,
                                             param_value.split(',')))
        else:
            setattr(engine, param_name, param_value)

    # fill in defaults for everything the module / settings did not define
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_name in ENGINES_LANGUAGES:
        engine.supported_languages = ENGINES_LANGUAGES[engine_name]

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        language_aliases = getattr(engine, 'language_aliases', {})
        supported_languages = engine.supported_languages

        for engine_lang in supported_languages:
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if (iso_lang
                    and iso_lang != engine_lang
                    and not engine_lang.startswith(iso_lang)
                    and iso_lang not in supported_languages):
                language_aliases[iso_lang] = engine_lang

        engine.language_aliases = language_aliases

    # language_support
    engine.language_support = len(
        getattr(engine, 'supported_languages', [])) > 0

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        headers = {
            'User-Agent': gen_useragent(),
            'Accept-Language':
            'ja-JP,ja;q=0.8,en-US;q=0.5,en;q=0.3',  # bing needs a non-English language
        }
        # ``engine`` and ``headers`` are locals of this call, so every engine
        # gets a closure bound to its own module
        engine.fetch_supported_languages = (
            lambda: engine._fetch_supported_languages(
                get(engine.supported_languages_url, headers=headers)))

    # tor related settings
    if settings['outgoing'].get('using_tor_proxy'):
        # use onion url if using tor.
        if hasattr(engine, 'onion_url'):
            engine.search_url = engine.onion_url + getattr(
                engine, 'search_path', '')
    elif 'onions' in engine.categories:
        # exclude onion engines if not using tor.
        return None

    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(
            engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Beispiel #34
0
def response(resp):
    """Parse the HTML of google's image search response.

    Raises ``RuntimeWarning`` when the response URL points to one of google's
    *sorry* pages.  Otherwise returns a list of result dicts using the
    ``images.html`` template; the list is empty when the root element
    ``<div id="islmp">`` is missing.  Image nodes that fail to parse are
    logged and left out.
    """
    results = []

    # detect google sorry
    location = urlparse(resp.url)
    if location.netloc == 'sorry.google.com' or location.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')
    if location.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)

    # Layout of the page this parser relies on:
    #
    # - root element: <div id="islmp" ..>
    # - one result div per image:
    #       <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #   The data-id matches an item of a json structure found in
    #       <script nonce="...">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #   That structure holds the link to the original PNG, JPG or whatever
    #   (it is not extracted here, this could still be implemented).
    # - the first link of an image div contains an <img> whose data-iid refers
    #   to the base64 encoded image data:
    #       <img class="rg_i Q4LuWd" data-iid="0"
    # - the second link of an image div is the target link:
    #       <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    #   and contains two div tags with the *description* and the *publisher*:
    #       <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #       <div class="fxgdke">en.wikipedia.org</div>

    root_nodes = eval_xpath(dom, '//div[@id="islmp"]')
    if not root_nodes:
        logger.error("did not find root element id='islmp'")
        return results

    for img_node in eval_xpath(root_nodes[0], './/img[contains(@class, "rg_i")]'):
        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            # thumbnail: base64 data referenced by data-iid, else @src, else
            # @data-src, else an empty string
            iid = eval_xpath(img_node, '@data-iid')
            if iid:
                thumbnail_src = img_bas64_map[iid[0]]
            else:
                src_candidates = (eval_xpath(img_node, '@src')
                                  or eval_xpath(img_node, '@data-src'))
                thumbnail_src = src_candidates[0] if src_candidates else ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            # description / publisher fall back to the alt text / nothing
            description, publisher = img_alt, ''
            pub_nodes = eval_xpath(link_node, './div/div')
            if pub_nodes:
                description = extract_text(pub_nodes[0])
                publisher = extract_text(pub_nodes[1])

            item = {
                'url': url,
                'title': img_alt,
                'content': description,
                'source': publisher,
                'img_src': url,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html',
            }
            results.append(item)
        except Exception as e:  # pylint: disable=broad-except
            # one broken node must not discard the remaining results
            logger.error(e, exc_info=True)

    return results
Beispiel #35
0
        (
            gettext('Page loads (sec)'),
            sorted(pageloads, key=itemgetter('avg'))
        ),
        (
            gettext('Number of results'),
            sorted(results, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores'),
            sorted(scores, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores per result'),
            sorted(scores_per_result, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Errors'),
            sorted(errors, key=itemgetter('avg'), reverse=True)
        ),
    ]


# Refuse to start when the settings do not declare a single engine.
if 'engines' not in settings or not settings['engines']:
    logger.error('No engines found. Edit your settings.yml')
    # exit() is a helper injected by the ``site`` module for interactive
    # sessions; raising SystemExit directly yields the same exit status and
    # also works when ``site`` is not loaded.
    raise SystemExit(2)

# Load each configured engine and register it under its name.
for engine_data in settings['engines']:
    engine = load_engine(engine_data)
    engines[engine.name] = engine
Beispiel #36
0
def load_engine(engine_data):
    """Load the engine module named in ``engine_data`` and register it.

    ``engine_data['engine']`` is the module name (``.py`` is appended before
    it is handed to ``load_module``).  Every other key is copied onto the
    module as an attribute; ``categories`` is split on commas, with the
    literal value ``'none'`` meaning "no category".  Attributes missing
    afterwards (``paging``, ``categories``, ``language_support``,
    ``timeout``, ``shortcut``, ``disabled``) receive defaults and the
    ``stats`` counters are reset to zero.

    Returns the configured module.  Calls ``sys.exit(1)`` when a public
    attribute is ``None`` or when the shortcut is already registered.

    Side effects: appends the engine to ``categories`` and records a
    non-empty shortcut in ``engine_shortcuts``.
    """
    engine_name = engine_data['engine']
    engine = load_module(engine_name + '.py')

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                # build a real list: on Python 3 a bare map() is a one-shot
                # iterator that the registration loop below would exhaust
                engine.categories = list(map(
                    str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    if not hasattr(engine, 'paging'):
        engine.paging = False

    if not hasattr(engine, 'categories'):
        engine.categories = ['general']

    if not hasattr(engine, 'language_support'):
        engine.language_support = True

    if not hasattr(engine, 'timeout'):
        engine.timeout = settings['server']['request_timeout']

    if not hasattr(engine, 'shortcut'):
        engine.shortcut = ''

    if not hasattr(engine, 'disabled'):
        engine.disabled = False

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'
                         .format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }

    # ``categories`` always exists at this point (see the default above)
    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut:
        if engine.shortcut in engine_shortcuts:
            logger.error('Engine config error: ambigious shortcut: {0}'
                         .format(engine.shortcut))
            sys.exit(1)
        engine_shortcuts[engine.shortcut] = engine.name
    return engine
Beispiel #37
0
def load_engine(engine_data):
    """Load, configure and register one engine module.

    The module is located through ``engine_data['engine']`` (with ``.py``
    appended) and loaded with ``load_module``.  All remaining keys of
    ``engine_data`` become attributes of the module; the ``categories`` value
    is a comma separated string, or ``'none'`` for an empty category list.
    Defaults are applied for ``paging``, ``categories``,
    ``language_support``, ``timeout``, ``shortcut`` and ``disabled`` when
    they are still missing, and ``stats`` is initialised with zeroed
    counters.

    Returns the configured module.  Calls ``sys.exit(1)`` if any public
    attribute is ``None`` or if the shortcut is already in use.

    Side effects: the engine is added to ``categories`` under each of its
    categories and a non-empty shortcut is stored in ``engine_shortcuts``.
    """
    engine_name = engine_data['engine']
    engine = load_module(engine_name + '.py')

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                # list(), not a bare map(): the map iterator of Python 3
                # would be empty after the first loop over the categories
                engine.categories = list(map(
                    str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    if not hasattr(engine, 'paging'):
        engine.paging = False

    if not hasattr(engine, 'categories'):
        engine.categories = ['general']

    if not hasattr(engine, 'language_support'):
        engine.language_support = True

    if not hasattr(engine, 'timeout'):
        engine.timeout = settings['server']['request_timeout']

    if not hasattr(engine, 'shortcut'):
        engine.shortcut = ''

    if not hasattr(engine, 'disabled'):
        engine.disabled = False

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }

    # the default assigned above guarantees that ``categories`` is set here
    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut:
        if engine.shortcut in engine_shortcuts:
            logger.error('Engine config error: ambigious shortcut: {0}'.format(
                engine.shortcut))
            sys.exit(1)
        engine_shortcuts[engine.shortcut] = engine.name
    return engine
Beispiel #38
0
def response(resp):
    """Parse the HTML of google's web search response into a result list.

    ``detect_google_sorry`` is called on the response first.  The returned
    list may contain an ``answer`` entry, a ``number_of_results`` entry (only
    looked up when no answer was found and the mobile UI is not used), one
    ``url`` / ``title`` / ``content`` dict per regular result, and
    ``suggestion`` / ``correction`` entries.
    """
    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_nodes = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_nodes:
        answer_texts = [node.xpath("normalize-space()") for node in answer_nodes]
        results.append({'answer': ' '.join(answer_texts)})
    else:
        logger.debug("did not find 'answer'")

        # results --> number_of_results
        if not use_mobile_ui:
            try:
                stats_text = eval_xpath_getindex(
                    dom, '//div[@id="result-stats"]//text()', 0)
                # keep the digits only before converting
                digits = ''.join(char for char in stats_text if char.isdigit())
                results.append({'number_of_results': int(digits)})
            except Exception as e:  # pylint: disable=broad-except
                logger.debug("did not 'number_of_results'")
                logger.error(e, exc_info=True)

    # parse results

    _results_xpath = results_xpath
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
                # this not one of the common google results *section*
                logger.debug(
                    'ingoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)

            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue

            content_node = eval_xpath_getindex(
                result, content_xpath, 0, default=None)
            content = extract_text(content_node, allow_none=True)
            if content is None:
                logger.debug(
                    'ingoring item from the result_xpath list: missing content of title "%s"',
                    title)
                continue

            logger.debug('add link to results: %s', title)
            results.append({'url': url, 'title': title, 'content': content})

        except Exception as e:  # pylint: disable=broad-except
            # a result that fails to parse is logged and left out
            logger.error(e, exc_info=True)

    # parse suggestion
    for suggestion_node in eval_xpath_list(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion_node)})

    # parse spelling correction
    for correction_node in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction_node)})

    return results