Example #1
0
def search_one_offline_request_safe(engine_name, query, request_params,
                                    result_container, start_time,
                                    timeout_limit):
    """Run one offline engine search without letting ``Exception`` escape.

    Non-empty results are added to ``result_container`` together with the
    time elapsed since ``start_time``, and the engine's ``engine_time``
    statistics are updated.  On failure the error statistics are recorded
    and the exception is logged; anything other than ``ValueError`` also
    marks the engine as unresponsive ('unexpected crash').

    ``timeout_limit`` is accepted but not used by this function.
    """
    engine = engines[engine_name]

    try:
        found = search_one_offline_request(engine, query, request_params)

        if found:
            result_container.extend(engine_name, found)

            elapsed = time() - start_time
            result_container.add_timing(engine_name, elapsed, elapsed)
            # NOTE(review): threading.RLock() builds a brand-new lock on each
            # call, so this ``with`` does not exclude other threads.
            with threading.RLock():
                engine.stats['engine_time'] += elapsed
                engine.stats['engine_time_count'] += 1

    except Exception as exc:
        record_offline_engine_stats_on_error(engine, result_container,
                                             start_time)
        if isinstance(exc, ValueError):
            # invalid input: logged only, the engine is not flagged
            template = 'engine {0} : invalid input : {1}'
        else:
            result_container.add_unresponsive_engine(engine_name,
                                                     'unexpected crash',
                                                     str(exc))
            template = 'engine {0} : exception : {1}'
        logger.exception(template.format(engine_name, exc))
Example #2
0
def search_one_offline_request_safe(
    engine_name, query, request_params, result_container, start_time, timeout_limit
):
    """Run one offline engine search without letting ``Exception`` escape.

    Non-empty results and their timing go into ``result_container`` and the
    engine's ``engine_time`` statistics are updated.  Failures are passed to
    ``record_exception``, counted in the error statistics and logged; any
    failure other than ``ValueError`` also marks the engine as unresponsive.
    When the search raised nothing and the current thread carries a truthy
    ``_timeout`` attribute, a "Timeout" error is recorded.

    ``timeout_limit`` is accepted but not used by this function.
    """
    engine = engines[engine_name]

    try:
        found = search_one_offline_request(engine, query, request_params)

        if found:
            result_container.extend(engine_name, found)

            elapsed = time() - start_time
            result_container.add_timing(engine_name, elapsed, elapsed)
            # NOTE(review): a new RLock is created per call, so this block
            # is not serialized against other threads.
            with threading.RLock():
                engine.stats["engine_time"] += elapsed
                engine.stats["engine_time_count"] += 1

    except Exception as exc:
        record_exception(engine_name, exc)
        record_offline_engine_stats_on_error(engine, result_container, start_time)
        if isinstance(exc, ValueError):
            template = "engine {0} : invalid input : {1}"
        else:
            result_container.add_unresponsive_engine(
                engine_name, "unexpected crash", str(exc)
            )
            template = "engine {0} : exception : {1}"
        logger.exception(template.format(engine_name, exc))
    else:
        timed_out = getattr(threading.current_thread(), "_timeout", False)
        if timed_out:
            record_error(engine_name, "Timeout")
Example #3
0
def pre_request():
    """Prepare ``request`` before a view runs.

    Sets ``request.errors``, ``request.preferences``, a merged
    ``request.form`` (POST values take precedence over GET values) and
    ``request.user_plugins``.  Preference parsing failures are reported
    through ``request.errors`` instead of being raised.
    """
    request.errors = []

    prefs = Preferences(themes, list(categories.keys()), engines, plugins)
    request.preferences = prefs
    try:
        prefs.parse_dict(request.cookies)
    except Exception:
        request.errors.append(gettext('Invalid settings, please edit your preferences'))

    # Merge GET variables into the form data; keys already in the form win.
    request.form = dict(request.form.items())
    for key, value in request.args.items():
        request.form.setdefault(key, value)

    encoded_prefs = request.form.get('preferences')
    if encoded_prefs:
        prefs.parse_encoded_data(encoded_prefs)
    else:
        try:
            prefs.parse_dict(request.form)
        except Exception:
            logger.exception('invalid settings')
            request.errors.append(gettext('Invalid settings'))

    # A plugin is active when it is on by default and not disabled by the
    # user, or when the user enabled it explicitly.
    request.user_plugins = []
    enabled_ids = prefs.plugins.get_enabled()
    disabled_ids = prefs.plugins.get_disabled()
    for plugin in plugins:
        on_by_default = plugin.default_on and plugin.id not in disabled_ids
        if on_by_default or plugin.id in enabled_ids:
            request.user_plugins.append(plugin)
Example #4
0
def load_engine(engine_data):
    """Load, configure and register the engine described by ``engine_data``.

    :param engine_data: mapping with at least ``name`` and ``engine`` (the
        module name, without ``.py``); every other key is copied onto the
        loaded module as an attribute.  ``categories`` is a comma separated
        string, or ``'none'`` for no categories.
    :return: the configured engine module, or ``None`` when the module
        cannot be loaded.

    The process is terminated with ``sys.exit(1)`` when the engine name
    contains an underscore, when a public attribute of the engine is
    ``None`` or when the engine shortcut is already registered.
    """

    if '_' in engine_data['name']:
        logger.error('Engine name conains underscore: "{}"'.format(engine_data['name']))
        sys.exit(1)

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except Exception:
        # ``Exception`` (not a bare except) so that SystemExit and
        # KeyboardInterrupt still propagate.
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                # A real list: it is iterated below and must stay reusable
                # afterwards (``map`` is a one-shot iterator on Python 3).
                engine.categories = [
                    name.strip()
                    for name in engine_data['categories'].split(',')
                ]
            continue
        setattr(engine, param_name, engine_data[param_name])

    # fill in defaults for everything the engine did not define itself
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'
                         .format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'page_load_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Example #5
0
 def engine_init(engine_name, init_fn):
     """Call ``init_fn`` with the settings of ``engine_name`` and log the outcome.

     Exceptions derived from ``Exception`` are logged and not propagated.
     """
     try:
         engine_settings = get_engine_from_settings(engine_name)
         init_fn(engine_settings)
     except Exception:
         logger.exception('%s engine: Fail to initialize', engine_name)
     else:
         # reached only when neither the lookup nor the init call raised
         logger.debug('%s engine: Initialized', engine_name)
Example #6
0
def pre_request():
    """Prepare ``request`` before a view runs.

    Sets ``request.start_time``, ``request.timings``, ``request.errors``,
    ``request.preferences`` and a merged ``request.form`` in which POST
    values take precedence over GET values.  Preference parsing failures
    are reported through ``request.errors`` instead of being raised.
    """
    request.start_time = time()
    request.timings = []
    request.errors = []

    preferences = Preferences(themes, list(categories.keys()), engines)
    request.preferences = preferences
    try:
        preferences.parse_dict(request.cookies)
    except Exception:
        # ``Exception`` rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed here.
        request.errors.append(gettext('Invalid settings, please edit your preferences'))

    # merge GET, POST vars
    # request.form
    request.form = dict(request.form.items())
    for k, v in request.args.items():
        if k not in request.form:
            request.form[k] = v

    if request.form.get('preferences'):
        preferences.parse_encoded_data(request.form['preferences'])
    else:
        try:
            preferences.parse_dict(request.form)
        except Exception:
            logger.exception('invalid settings')
            request.errors.append(gettext('Invalid settings'))
Example #7
0
def run():
    """Check every processor once and publish the outcome via ``_set_result``.

    Returns immediately when ``running`` cannot be acquired without
    blocking.  If anything raises ``Exception`` during the check, a result
    with status ``'error'`` is published instead.  ``running`` is released
    in every case once it has been acquired.
    """
    acquired = running.acquire(blocking=False)
    if not acquired:
        return
    try:
        logger.info('Starting checker')
        per_engine = {}
        report = {'status': 'ok', 'engines': per_engine}
        for name, processor in processors.items():
            logger.debug('Checking %s engine', name)
            checker = Checker(processor)
            checker.run()
            if not checker.test_results.succesfull:
                per_engine[name] = {
                    'success': False,
                    'errors': checker.test_results.errors,
                }
            else:
                per_engine[name] = {'success': True}

        _set_result(report)
        logger.info('Check done')
    except Exception:
        _set_result({'status': 'error'})
        logger.exception('Error while running the checker')
    finally:
        running.release()
Example #8
0
def pre_request():
    """Prepare ``request`` before a view runs.

    Sets ``request.errors``, ``request.preferences``, a merged
    ``request.form`` (POST values take precedence over GET values) and
    ``request.user_plugins``.  Preference parsing failures are reported
    through ``request.errors`` instead of being raised.
    """
    request.errors = []

    preferences = Preferences(themes, list(categories.keys()), engines, plugins)
    request.preferences = preferences
    try:
        preferences.parse_dict(request.cookies)
    except Exception:
        # ``Exception`` rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed here.
        request.errors.append(gettext('Invalid settings, please edit your preferences'))

    # merge GET, POST vars
    # request.form
    request.form = dict(request.form.items())
    for k, v in request.args.items():
        if k not in request.form:
            request.form[k] = v

    if request.form.get('preferences'):
        preferences.parse_encoded_data(request.form['preferences'])
    else:
        try:
            preferences.parse_dict(request.form)
        except Exception:
            logger.exception('invalid settings')
            request.errors.append(gettext('Invalid settings'))

    # request.user_plugins: plugins that are on by default and not disabled
    # by the user, plus the plugins the user enabled explicitly
    request.user_plugins = []
    allowed_plugins = preferences.plugins.get_enabled()
    disabled_plugins = preferences.plugins.get_disabled()
    for plugin in plugins:
        if ((plugin.default_on and plugin.id not in disabled_plugins)
                or plugin.id in allowed_plugins):
            request.user_plugins.append(plugin)
def search_one_request_safe(engine_name, query, request_params,
                            result_container, start_time, timeout_limit):
    """Run one engine search and report success as a boolean.

    On success the results are added to ``result_container``, the engine's
    ``engine_time`` statistics are updated and ``True`` is returned.  On any
    ``Exception`` the error counter is incremented, the engine is added to
    the unresponsive engines, the failure is logged and ``False`` is
    returned.  Failures raised by ``requests`` additionally increase
    ``continuous_errors`` and push ``suspend_end_time`` forward.
    """
    engine = engines[engine_name]

    try:
        # send requests and parse the results
        found = search_one_request(engine, query, request_params,
                                   start_time, timeout_limit)
        result_container.extend(engine_name, found)

        # update engine time when there is no exception
        # NOTE(review): a new RLock is created per call, so this block is
        # not serialized against other threads.
        with threading.RLock():
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1

        return True

    except Exception as exc:
        engine.stats['errors'] += 1

        elapsed = time() - start_time
        timing_suffix = "(search duration : {1} s, timeout: {2} s) : {3}"

        if isinstance(exc, requests.exceptions.Timeout):
            # requests timeout (connect or read)
            result_container.add_unresponsive_engine(
                (engine_name, gettext('timeout')))
            logger.error(
                ("engine {0} : HTTP requests timeout" + timing_suffix).format(
                    engine_name, elapsed, timeout_limit,
                    exc.__class__.__name__))
            network_failure = True
        elif isinstance(exc, requests.exceptions.RequestException):
            # other requests exception
            result_container.add_unresponsive_engine(
                (engine_name, gettext('request exception')))
            logger.exception(
                ("engine {0} : requests exception" + timing_suffix).format(
                    engine_name, elapsed, timeout_limit, exc))
            network_failure = True
        else:
            # others errors
            result_container.add_unresponsive_engine(
                (engine_name, gettext('unexpected crash')))
            logger.exception('engine {0} : exception : {1}'.format(
                engine_name, exc))
            network_failure = False

        # update continuous_errors / suspend_end_time
        if network_failure:
            with threading.RLock():
                engine.continuous_errors += 1
                engine.suspend_end_time = time() + min(
                    60, engine.continuous_errors)

        return False
Example #10
0
    def search(self, query, params, result_container, start_time,
               timeout_limit):
        """Run ``_search_basic`` and store its results without raising.

        Non-empty results and their timing go into ``result_container`` and
        the engine's ``engine_time`` statistics are updated.  Failures are
        passed to ``record_exception``, counted via
        ``_record_stats_on_error`` and logged; any failure other than
        ``ValueError`` also marks the engine as unresponsive.  When nothing
        was raised and the current thread carries a truthy ``_timeout``
        attribute, a 'Timeout' error is recorded.

        ``timeout_limit`` is accepted but not used by this method.
        """
        try:
            found = self._search_basic(query, params)

            if found:
                result_container.extend(self.engine_name, found)

                elapsed = time() - start_time
                result_container.add_timing(self.engine_name, elapsed,
                                            elapsed)
                # NOTE(review): a new RLock is created per call, so this
                # block is not serialized against other threads.
                with threading.RLock():
                    self.engine.stats['engine_time'] += elapsed
                    self.engine.stats['engine_time_count'] += 1

        except Exception as exc:
            record_exception(self.engine_name, exc)
            self._record_stats_on_error(result_container, start_time)
            if isinstance(exc, ValueError):
                template = 'engine {0} : invalid input : {1}'
            else:
                result_container.add_unresponsive_engine(self.engine_name,
                                                         'unexpected crash',
                                                         str(exc))
                template = 'engine {0} : exception : {1}'
            logger.exception(template.format(self.engine_name, exc))
        else:
            timed_out = getattr(threading.current_thread(), '_timeout', False)
            if timed_out:
                record_error(self.engine_name, 'Timeout')
Example #11
0
def load_engine(engine_data):
    """Load engine from ``engine_data``.

    :param dict engine_data:  Attributes from YAML ``settings:engines/<engine>``
    :return: initialized namespace of the ``<engine>``.

    1. create a namespace and load module of the ``<engine>``
    2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
    3. update namespace with values from ``engine_data``

    If engine *is active*, return namespace of the engine, otherwise return
    ``None``.

    This function also returns ``None`` if initialization of the namespace fails
    for one of the following reasons:

    - engine name contains underscore
    - loading the engine module raises an exception that is not treated as
      fatal (see below)
    - required attribute is not set :py:func:`is_missing_required_attributes`

    An engine name that is not lowercase is *not* rejected: a warning is
    logged and the name is converted to lowercase, both locally and in
    ``engine_data['name']``.

    If loading the module raises ``SyntaxError``, ``KeyboardInterrupt``,
    ``SystemExit``, ``SystemError``, ``ImportError`` or ``RuntimeError``, the
    error is logged and the process is terminated with ``sys.exit(1)``.

    """

    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error(
            'Engine name contains underscore: "{}"'.format(engine_name))
        return None

    if engine_name.lower() != engine_name:
        # ``Logger.warn`` is a deprecated alias of ``Logger.warning``
        logger.warning(
            'Engine name is not lowercase: "{}", converting to lowercase'.
            format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    # load_module
    engine_module = engine_data['engine']
    try:
        engine = load_module(engine_module + '.py', ENGINE_DIR)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError,
            ImportError, RuntimeError):
        logger.exception(
            'Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except BaseException:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    update_engine_attributes(engine, engine_data)
    set_language_attributes(engine)
    update_attributes_for_tor(engine)

    if not is_engine_active(engine):
        return None

    if is_missing_required_attributes(engine):
        return None

    return engine
Example #12
0
def load_engine(engine_data):
    """Load, configure and register the engine described by ``engine_data``.

    :param engine_data: mapping with at least ``name`` and ``engine`` (the
        module name, without ``.py``); every other key is copied onto the
        loaded module as an attribute.  ``categories`` is a comma separated
        string, or ``'none'`` for no categories.
    :return: the configured engine module, or ``None`` when the module
        cannot be loaded.

    The process is terminated with ``sys.exit(1)`` when the engine name
    contains an underscore, when a public attribute of the engine is
    ``None`` or when the engine shortcut is already registered.
    """

    if '_' in engine_data['name']:
        logger.error('Engine name conains underscore: "{}"'.format(
            engine_data['name']))
        sys.exit(1)

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py')
    except Exception:
        # ``Exception`` (not a bare except) so that SystemExit and
        # KeyboardInterrupt still propagate.
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                # A real list: it is iterated below and must stay reusable
                # afterwards (``map`` is a one-shot iterator on Python 3).
                engine.categories = [
                    name.strip()
                    for name in engine_data['categories'].split(',')
                ]
            continue
        setattr(engine, param_name, engine_data[param_name])

    # fill in defaults for everything the engine did not define itself
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(
            engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Example #13
0
File: search.py Project: wflk/searx
def send_http_request(engine, request_params, timeout_limit):
    """Send the HTTP request described by ``request_params`` for ``engine``.

    :param engine: engine whose ``stats``, ``continuous_errors`` and
        ``suspend_end_time`` are updated.
    :param request_params: mapping providing ``headers``, ``cookies``,
        ``verify``, ``method``, ``url``, ``started`` and, for non-GET
        requests, ``data``.
    :param timeout_limit: timeout handed to the HTTP call; also the limit
        (plus a 0.2 s overhead) for the whole search duration.
    :return: the response on success; ``False`` when the request raised or
        when the search duration exceeded the limit.
    """
    try:
        # create dictionary which contain all
        # informations about the request
        request_args = dict(headers=request_params['headers'],
                            cookies=request_params['cookies'],
                            timeout=timeout_limit,
                            verify=request_params['verify'])
        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # for page_load_time stats
        time_before_request = time()

        # send the request
        response = req(request_params['url'], **request_args)

        # NOTE(review): threading.RLock() creates a new lock on each call,
        # so these ``with`` blocks do not exclude other threads.
        with threading.RLock():
            # no error : reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
            # update stats with current page-load-time
            # only the HTTP request
            engine.stats['page_load_time'] += time() - time_before_request
            engine.stats['page_load_count'] += 1

        # is there a timeout (no parsing in this case)
        timeout_overhead = 0.2  # seconds
        search_duration = time() - request_params['started']
        if search_duration > timeout_limit + timeout_overhead:
            # No exception is being handled here, so use ``error`` instead
            # of ``exception``; the duration is in seconds, not ms.
            logger.error(
                'engine timeout on HTTP request:'
                '{0} (search duration : {1} s, time-out: {2} )'.format(
                    engine.name, search_duration, timeout_limit))
            with threading.RLock():
                engine.stats['errors'] += 1
            return False

        # everything is ok : return the response
        return response

    except Exception:
        # ``Exception`` rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed.
        # increase errors stats
        with threading.RLock():
            engine.stats['errors'] += 1
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60,
                                                   engine.continuous_errors)

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine.name))
        return False
Example #14
0
def send_http_request(engine, request_params, timeout_limit):
    """Send the HTTP request described by ``request_params`` for ``engine``.

    :param engine: engine whose ``stats``, ``continuous_errors`` and
        ``suspend_end_time`` are updated.
    :param request_params: mapping providing ``headers``, ``cookies``,
        ``verify``, ``method``, ``url``, ``started`` and, for non-GET
        requests, ``data``.
    :param timeout_limit: timeout handed to the HTTP call; also the limit
        (plus a 0.2 s overhead) for the whole search duration.
    :return: the response on success; ``False`` when the request raised or
        when the search duration exceeded the limit.
    """
    try:
        # create dictionary which contain all
        # informations about the request
        request_args = dict(
            headers=request_params['headers'],
            cookies=request_params['cookies'],
            timeout=timeout_limit,
            verify=request_params['verify']
        )
        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # for page_load_time stats
        time_before_request = time()

        # send the request
        response = req(request_params['url'], **request_args)

        # NOTE(review): threading.RLock() creates a new lock on each call,
        # so these ``with`` blocks do not exclude other threads.
        with threading.RLock():
            # no error : reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
            # update stats with current page-load-time
            # only the HTTP request
            engine.stats['page_load_time'] += time() - time_before_request
            engine.stats['page_load_count'] += 1

        # is there a timeout (no parsing in this case)
        timeout_overhead = 0.2  # seconds
        search_duration = time() - request_params['started']
        if search_duration > timeout_limit + timeout_overhead:
            # No exception is being handled here, so use ``error`` instead
            # of ``exception``; the duration is in seconds, not ms.
            logger.error('engine timeout on HTTP request:'
                         '{0} (search duration : {1} s, time-out: {2} )'
                         .format(engine.name, search_duration, timeout_limit))
            with threading.RLock():
                engine.stats['errors'] += 1
            return False

        # everything is ok : return the response
        return response

    except Exception:
        # ``Exception`` rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed.
        # increase errors stats
        with threading.RLock():
            engine.stats['errors'] += 1
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60, engine.continuous_errors)

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine.name))
        return False
Example #15
0
def search_request_wrapper(fn, url, engine_name, **kwargs):
    """Call ``fn(url, **kwargs)`` and return its result.

    If the call raises ``Exception``, the error counter of the engine named
    ``engine_name`` is incremented, the failure is logged and ``None`` is
    returned.  SystemExit and KeyboardInterrupt are not intercepted.
    """
    try:
        return fn(url, **kwargs)
    except Exception:
        # increase errors stats
        engines[engine_name].stats['errors'] += 1

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
        return None
Example #16
0
 def engine_init(engine_name, init_fn):
     """Call ``init_fn`` with the settings of ``engine_name`` and log the outcome.

     A ``SearxEngineResponseException`` is logged as a warning without a
     traceback; any other ``Exception`` is logged with its traceback.
     Neither is propagated.
     """
     try:
         init_fn(get_engine_from_settings(engine_name))
     except SearxEngineResponseException as exc:
         # ``Logger.warn`` is a deprecated alias of ``Logger.warning``
         logger.warning('%s engine: Fail to initialize // %s', engine_name,
                        exc)
     except Exception:
         logger.exception('%s engine: Fail to initialize', engine_name)
     else:
         logger.debug('%s engine: Initialized', engine_name)
Example #17
0
 def initialize(self):
     """Call the engine's ``init`` with its settings and log the outcome.

     A ``SearxEngineResponseException`` is logged as a warning without a
     traceback; any other ``Exception`` is logged with its traceback.
     Neither is propagated.
     """
     try:
         self.engine.init(get_engine_from_settings(self.engine_name))
     except SearxEngineResponseException as exc:
         # ``Logger.warn`` is a deprecated alias of ``Logger.warning``
         logger.warning('%s engine: Fail to initialize // %s',
                        self.engine_name, exc)
     except Exception:  # pylint: disable=broad-except
         logger.exception('%s engine: Fail to initialize', self.engine_name)
     else:
         logger.debug('%s engine: Initialized', self.engine_name)
Example #18
0
def search_request_wrapper(fn, url, engine_name, **kwargs):
    """Call ``fn(url, **kwargs)`` and return its result.

    If the call raises ``Exception``, the error counter of the engine named
    ``engine_name`` is incremented, the failure is logged and ``None`` is
    returned.  SystemExit and KeyboardInterrupt are not intercepted.
    """
    try:
        return fn(url, **kwargs)
    except Exception:
        # increase errors stats
        engines[engine_name].stats['errors'] += 1

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
        return None
Example #19
0
def pre_request():
    """Prepare ``request`` before a view runs.

    Sets the timing fields, ``request.errors``, ``request.preferences``, a
    merged ``request.form`` (POST values take precedence over GET values)
    and ``request.user_plugins``.  Preference parsing failures are logged
    and reported through ``request.errors`` instead of being raised.
    Missing ``language`` / ``locale`` preferences are filled in afterwards.
    """
    # pylint: disable=assigning-non-slot
    request.start_time = default_timer()
    request.render_time = 0
    request.timings = []
    request.errors = []

    preferences = Preferences(  # pylint: disable=redefined-outer-name
        themes, list(categories.keys()), engines, plugins)
    browser = request.headers.get('User-Agent', '').lower()
    if 'webkit' in browser and 'android' in browser:
        # force the GET method for user agents naming both webkit and android
        preferences.key_value_settings['method'].value = 'GET'
    request.preferences = preferences

    try:
        preferences.parse_dict(request.cookies)
    except Exception as e:  # pylint: disable=broad-except
        logger.exception(e, exc_info=True)
        request.errors.append(
            gettext('Invalid settings, please edit your preferences'))

    # Merge GET variables into the form data; keys already in the form win.
    request.form = dict(request.form.items())
    for key, value in request.args.items():
        request.form.setdefault(key, value)

    encoded_prefs = request.form.get('preferences')
    if encoded_prefs:
        preferences.parse_encoded_data(encoded_prefs)
    else:
        try:
            preferences.parse_dict(request.form)
        except Exception as e:  # pylint: disable=broad-except
            logger.exception(e, exc_info=True)
            request.errors.append(gettext('Invalid settings'))

    # init search language and locale
    if not preferences.get_value("language"):
        fallback_language = _get_browser_or_settings_language(
            request, LANGUAGE_CODES)
        preferences.parse_dict({"language": fallback_language})
    if not preferences.get_value("locale"):
        preferences.parse_dict({"locale": get_locale()})

    # A plugin is active when it is on by default and not disabled by the
    # user, or when the user enabled it explicitly.
    request.user_plugins = []
    enabled_ids = preferences.plugins.get_enabled()
    disabled_ids = preferences.plugins.get_disabled()
    for plugin in plugins:
        on_by_default = plugin.default_on and plugin.id not in disabled_ids
        if on_by_default or plugin.id in enabled_ids:
            request.user_plugins.append(plugin)
Example #20
0
 def search(self, query, params, result_container, start_time,
            timeout_limit):
     """Run ``_search_basic`` and hand the results to ``extend_container``.

     Exceptions derived from ``Exception`` are logged and not propagated.
     A ``ValueError`` is only logged; every other exception is first passed
     to ``handle_exception``.

     ``timeout_limit`` is accepted but not used by this method.
     """
     try:
         found = self._search_basic(query, params)
         self.extend_container(result_container, start_time, found)
     except Exception as exc:  # pylint: disable=broad-except
         if isinstance(exc, ValueError):
             # do not record the error
             template = 'engine {0} : invalid input : {1}'
         else:
             self.handle_exception(result_container, exc)
             template = 'engine {0} : exception : {1}'
         logger.exception(template.format(self.engine_name, exc))
Example #21
0
def pre_request():
    """Prepare ``request`` before a view runs.

    Sets the timing fields, ``request.errors``, ``request.preferences``, a
    merged ``request.form`` (POST values take precedence over GET values)
    and ``request.user_plugins``.  Preference parsing failures are reported
    through ``request.errors`` instead of being raised.  Missing
    ``language`` / ``locale`` preferences are filled in afterwards.
    """
    request.start_time = time()
    request.timings = []
    request.errors = []

    preferences = Preferences(themes, list(categories.keys()), engines,
                              plugins)
    user_agent = request.headers.get("User-Agent", "").lower()
    if "webkit" in user_agent and "android" in user_agent:
        # force the GET method for user agents naming both webkit and android
        preferences.key_value_settings["method"].value = "GET"
    request.preferences = preferences
    try:
        preferences.parse_dict(request.cookies)
    except Exception:
        # ``Exception`` rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed here.
        request.errors.append(
            gettext("Invalid settings, please edit your preferences"))

    # merge GET, POST vars
    # request.form
    request.form = dict(request.form.items())
    for k, v in request.args.items():
        if k not in request.form:
            request.form[k] = v

    if request.form.get("preferences"):
        preferences.parse_encoded_data(request.form["preferences"])
    else:
        try:
            preferences.parse_dict(request.form)
        except Exception:
            logger.exception("invalid settings")
            request.errors.append(gettext("Invalid settings"))

    # init search language and locale
    if not preferences.get_value("language"):
        preferences.parse_dict({
            "language":
            _get_browser_or_settings_language(request, LANGUAGE_CODES)
        })
    if not preferences.get_value("locale"):
        preferences.parse_dict({"locale": get_locale()})

    # request.user_plugins: plugins that are on by default and not disabled
    # by the user, plus the plugins the user enabled explicitly
    request.user_plugins = []
    allowed_plugins = preferences.plugins.get_enabled()
    disabled_plugins = preferences.plugins.get_disabled()
    for plugin in plugins:
        if ((plugin.default_on and plugin.id not in disabled_plugins)
                or plugin.id in allowed_plugins):
            request.user_plugins.append(plugin)
Example #22
0
def load_engine(engine_data):
    """Load, configure and register the engine described by ``engine_data``.

    :param engine_data: mapping with at least ``name`` and ``engine`` (the
        module name, without ``.py``); every other key is copied onto the
        loaded module as an attribute.  ``categories`` is a comma separated
        string, or ``"none"`` for no categories.
    :return: the configured engine module, or ``None`` when the module
        cannot be loaded.

    The process is terminated with ``sys.exit(1)`` when the engine name
    contains an underscore, when a public attribute of the engine is
    ``None`` or when the engine shortcut is already registered.
    """

    if "_" in engine_data["name"]:
        logger.error('Engine name conains underscore: "{}"'.format(engine_data["name"]))
        sys.exit(1)

    engine_module = engine_data["engine"]

    try:
        engine = load_module(engine_module + ".py")
    except Exception:
        # ``Exception`` (not a bare except) so that SystemExit and
        # KeyboardInterrupt still propagate.
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == "engine":
            continue
        if param_name == "categories":
            if engine_data["categories"] == "none":
                engine.categories = []
            else:
                # A real list: it is iterated below and must stay reusable
                # afterwards (``map`` is a one-shot iterator on Python 3).
                engine.categories = [name.strip() for name in engine_data["categories"].split(",")]
            continue
        setattr(engine, param_name, engine_data[param_name])

    # fill in defaults for everything the engine did not define itself
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith("_"):
            continue
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    engine.stats = {"result_count": 0, "search_count": 0, "page_load_time": 0, "score_count": 0, "errors": 0}

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error("Engine config error: ambigious shortcut: {0}".format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Example #23
0
def code_highlighter(codelines, language=None):
    """Render ``(line_number, code)`` pairs as highlighted HTML.

    Consecutive line numbers are grouped into one code part; a gap in the
    numbering starts a new part whose line numbering begins at the first
    line of that part.

    :param codelines: iterable of ``(line_number, code)`` pairs.
    :param language: lexer name; a falsy value, or a name for which the
        lexer lookup fails, falls back to the ``'text'`` lexer.
    :return: the concatenated HTML of all code parts, or ``''`` when
        ``codelines`` is empty.
    """
    if not language:
        language = 'text'

    try:
        # find lexer by programing language
        lexer = get_lexer_by_name(language, stripall=True)

    except Exception as e:  # pylint: disable=broad-except
        logger.exception(e, exc_info=True)
        # if lexer is not found, using default one
        lexer = get_lexer_by_name('text', stripall=True)

    html_code = ''
    tmp_code = ''
    last_line = None
    line_code_start = None

    # parse lines
    for line, code in codelines:
        if last_line is None:
            # First line of the input.  Compare against None explicitly so
            # that a line number of 0 is not mistaken for "no previous line".
            line_code_start = line
        elif last_line + 1 != line:
            # new codeblock is detected: highlight last codepart
            formatter = HtmlFormatter(linenos='inline',
                                      linenostart=line_code_start,
                                      cssclass="code-highlight")
            html_code = html_code + highlight(tmp_code, lexer, formatter)

            # reset conditions for next codepart
            tmp_code = ''
            line_code_start = line

        # add codepart
        tmp_code += code + '\n'

        # update line
        last_line = line

    if last_line is None:
        # no input lines: nothing to highlight and no start line to number from
        return html_code

    # highlight last codepart
    formatter = HtmlFormatter(linenos='inline',
                              linenostart=line_code_start,
                              cssclass="code-highlight")
    html_code = html_code + highlight(tmp_code, lexer, formatter)

    return html_code
Example #24
0
def response(resp):
    """Extract search results from the HTML in ``resp.text``.

    :param resp: response object whose ``text`` is parsed as HTML.
    :return: list of dicts with the keys ``url``, ``title`` and ``content``.

    A result node that cannot be parsed (for example because it yields no
    URL) is logged and skipped.
    """
    dom = html.fromstring(resp.text)
    results = []

    for result in dom.xpath(results_xpath):
        try:
            res = {'url': result.xpath(url_xpath)[0],
                   'title': ''.join(result.xpath(title_xpath)),
                   'content': ''.join(result.xpath(content_xpath))}
        except Exception:
            # ``Exception`` rather than a bare except, so that SystemExit
            # and KeyboardInterrupt are not swallowed inside the loop.
            logger.exception('yandex parse crash')
            continue

        results.append(res)

    return results
Example #25
0
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Run one engine request and never let an exception escape.

    On success the results are added to ``result_container`` and the engine
    timing statistics are updated.  On failure the error counter is
    incremented, the exception is logged, and for ``requests`` exceptions
    the engine's ``continuous_errors`` / ``suspend_end_time`` are updated.

    :returns: ``True`` when the request and the bookkeeping succeeded,
        ``False`` when any exception was raised.
    """
    engine = engines[engine_name]

    try:
        # send the request, parse it and store the results
        results = search_one_request(engine, query, request_params, start_time, timeout_limit)
        result_container.extend(engine_name, results)

        # timing is only recorded when nothing above raised
        # NOTE(review): RLock() creates a new lock object on every call, so
        # this block is not serialized against other threads -- confirm
        # whether a shared lock was intended.
        with threading.RLock():
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1

        return True

    except Exception as exc:
        engine.stats['errors'] += 1

        elapsed = time() - start_time
        is_timeout = isinstance(exc, requests.exceptions.Timeout)
        is_requests_error = is_timeout or isinstance(exc, requests.exceptions.RequestException)

        if is_timeout:
            # requests timeout (connect or read): no traceback, class name only
            logger.error("engine {0} : HTTP requests timeout"
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, elapsed, timeout_limit, exc.__class__.__name__))
        elif is_requests_error:
            # any other requests exception
            logger.exception("engine {0} : requests exception"
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, elapsed, timeout_limit, exc))
        else:
            # everything else
            logger.exception('engine {0} : exception : {1}'.format(engine_name, exc))

        # only requests-level failures count towards suspending the engine;
        # the suspension grows with consecutive errors and is capped at 60 s
        if is_requests_error:
            with threading.RLock():
                engine.continuous_errors += 1
                engine.suspend_end_time = time() + min(60, engine.continuous_errors)

        return False
Example #26
0
    def search(self, query, params, result_container, start_time,
               timeout_limit):
        """Run the search for this engine and record results or the failure.

        The per-thread network state (timeout, accumulated HTTP time, network
        name) is set up first.  Results of ``_search_basic`` are passed to
        ``extend_container``.  Every exception is handed to
        ``handle_exception`` and logged; none is re-raised.  All handlers
        except the generic one pass ``suspend=True``.
        """
        # per-thread network setup: timeout, HTTP total time, network name
        searx.network.set_timeout_for_thread(timeout_limit,
                                             start_time=start_time)
        searx.network.reset_time_for_thread()
        searx.network.set_context_network_name(self.engine_name)

        try:
            # send requests and parse the results
            results = self._search_basic(query, params)
            self.extend_container(result_container, start_time, results)
        except (httpx.TimeoutException, asyncio.TimeoutError) as exc:
            # requests timeout (connect or read); must stay ahead of the
            # httpx.HTTPError handler below
            self.handle_exception(result_container, exc, suspend=True)
            elapsed = default_timer() - start_time
            message = ("engine {0} : HTTP requests timeout"
                       "(search duration : {1} s, timeout: {2} s) : {3}")
            logger.error(message.format(self.engine_name, elapsed,
                                        timeout_limit,
                                        exc.__class__.__name__))
        except (httpx.HTTPError, httpx.StreamError) as exc:
            # other requests exception
            self.handle_exception(result_container, exc, suspend=True)
            elapsed = default_timer() - start_time
            message = ("engine {0} : requests exception"
                       "(search duration : {1} s, timeout: {2} s) : {3}")
            logger.exception(message.format(self.engine_name, elapsed,
                                            timeout_limit, exc))
        except SearxEngineCaptchaException as exc:
            self.handle_exception(result_container, exc, suspend=True)
            logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
        except SearxEngineTooManyRequestsException as exc:
            self.handle_exception(result_container, exc, suspend=True)
            logger.exception(
                'engine {0} : Too many requests'.format(self.engine_name))
        except SearxEngineAccessDeniedException as exc:
            self.handle_exception(result_container, exc, suspend=True)
            logger.exception(
                'engine {0} : Searx is blocked'.format(self.engine_name))
        except Exception as exc:  # pylint: disable=broad-except
            # anything else: recorded without the suspend flag
            self.handle_exception(result_container, exc)
            logger.exception(
                'engine {0} : exception : {1}'.format(self.engine_name, exc))
Example #27
0
File: search.py Project: wflk/searx
def search_one_request(engine_name, query, request_params, result_container,
                       timeout_limit):
    """Send one request to an engine and store its parsed results.

    :param engine_name: key into ``engines``.
    :param query: search query handed to ``engine.request``.
    :param request_params: request parameters; ``engine.request`` fills them
        in, ``'url'`` and ``'started'`` are read here.
    :param result_container: receives the results via ``extend``.
    :param timeout_limit: passed on to ``send_http_request``.
    :returns: ``True`` when a truthy response was received, ``False`` when
        the engine produced no url or the response was falsy.
    """
    engine = engines[engine_name]

    # update request parameters dependent on
    # search-engine (contained in engines folder)
    engine.request(query, request_params)

    # TODO add support of offline engines
    # a missing (None) or empty url means there is nothing to request
    if not request_params['url']:
        return False

    # send request
    response = send_http_request(engine, request_params, timeout_limit)

    # parse response
    success = bool(response)
    if success:
        response.search_params = request_params
        try:
            search_results = engine.response(response)
        except Exception:  # pylint: disable=broad-except
            # a crashing parser yields no results; KeyboardInterrupt and
            # SystemExit are no longer swallowed
            logger.exception('engine crash: {0}'.format(engine.name))
            search_results = []

        # add results
        for result in search_results:
            result['engine'] = engine.name

        result_container.extend(engine.name, search_results)

    # NOTE(review): RLock() creates a new lock on every call, so this block
    # is not serialized against other threads -- confirm the intent.
    with threading.RLock():
        # update stats : total time (recorded for failed requests as well)
        engine.stats['engine_time'] += time() - request_params['started']
        engine.stats['engine_time_count'] += 1

    return success
Example #28
0
def pre_request():
    """Prepare the per-request state stored on ``request``.

    Sets ``start_time``, ``timings``, ``errors``, ``preferences``, a merged
    ``form`` (POST values win over GET values) and ``user_plugins``.
    Invalid preference data is reported through ``request.errors`` instead
    of failing the request.
    """
    request.start_time = time()
    request.timings = []
    request.errors = []

    preferences = Preferences(themes, list(categories.keys()), engines,
                              plugins)
    request.preferences = preferences
    try:
        preferences.parse_dict(request.cookies)
    except Exception:  # pylint: disable=broad-except
        # bad cookie values must not break the request; Exception (not a
        # bare except) lets KeyboardInterrupt / SystemExit propagate
        request.errors.append(
            gettext('Invalid settings, please edit your preferences'))

    # merge GET, POST vars
    # request.form
    request.form = dict(request.form.items())
    for k, v in request.args.items():
        if k not in request.form:
            request.form[k] = v

    if request.form.get('preferences'):
        preferences.parse_encoded_data(request.form['preferences'])
    else:
        try:
            preferences.parse_dict(request.form)
        except Exception:  # pylint: disable=broad-except
            logger.exception('invalid settings')
            request.errors.append(gettext('Invalid settings'))

    # init search language and locale
    if not preferences.get_value("language"):
        preferences.parse_dict(
            {"language": _get_browser_language(request, LANGUAGE_CODES)})
    if not preferences.get_value("locale"):
        preferences.parse_dict({"locale": get_locale()})

    # request.user_plugins: default-on plugins the user did not disable,
    # plus every plugin the user explicitly enabled
    request.user_plugins = []
    allowed_plugins = preferences.plugins.get_enabled()
    disabled_plugins = preferences.plugins.get_disabled()
    request.user_plugins.extend(
        plugin for plugin in plugins
        if (plugin.default_on and plugin.id not in disabled_plugins)
        or plugin.id in allowed_plugins)
Example #29
0
def search_one_request(engine_name, query, request_params, result_container, timeout_limit):
    """Query a single engine and add its parsed results to the container.

    ``engine.request`` fills in ``request_params``.  When it leaves ``'url'``
    as ``None`` or empty, nothing is sent and ``False`` is returned.
    Otherwise the HTTP request is sent, a truthy response is parsed with
    ``engine.response`` and every result is tagged with the engine name.
    The engine timing statistics are updated whether or not a response
    arrived.

    :returns: ``True`` when a truthy response was received, else ``False``.
    """
    engine = engines[engine_name]

    # update request parameters dependent on
    # search-engine (contained in engines folder)
    engine.request(query, request_params)

    # TODO add support of offline engines
    # one emptiness test covers both the None url and the empty url
    if not request_params['url']:
        return False

    # send request
    response = send_http_request(engine, request_params, timeout_limit)

    success = bool(response)
    if success:
        # parse the response
        response.search_params = request_params
        try:
            search_results = engine.response(response)
        except Exception:  # pylint: disable=broad-except
            # engine parser failed: log it and go on with no results
            # (a bare except would also trap KeyboardInterrupt / SystemExit)
            logger.exception('engine crash: {0}'.format(engine.name))
            search_results = []

        # add results
        for result in search_results:
            result['engine'] = engine.name

        result_container.extend(engine.name, search_results)

    # NOTE(review): a fresh RLock per call gives no mutual exclusion between
    # threads -- confirm whether a shared lock was intended.
    with threading.RLock():
        # update stats : total time
        engine.stats['engine_time'] += time() - request_params['started']
        engine.stats['engine_time_count'] += 1

    return success
Example #30
0
def _is_url_image(image_url):
    """Tell whether ``image_url`` points to an image.

    ``data:`` URLs are judged by their prefix alone.  Other URLs are fetched
    and judged by the ``content-type`` header of the response.  A timeout is
    retried once (two attempts in total).

    :param image_url: candidate URL; non-string values are rejected.
    :returns: ``True`` for an image, ``False`` otherwise -- including after
        repeated timeouts, HTTP errors and responses without a content type.
    """
    if not isinstance(image_url, str):
        return False

    # protocol-relative URL: assume https
    if image_url.startswith('//'):
        image_url = 'https:' + image_url

    if image_url.startswith('data:'):
        return image_url.startswith('data:image/')

    if not _is_url(image_url):
        return False

    # browser-like request headers, built once for all attempts
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-GPC': '1',
        'Cache-Control': 'max-age=0'
    }

    for _attempt in range(2):
        started = time()
        try:
            network.set_timeout_for_thread(10.0, time())
            r = network.get(image_url,
                            timeout=10.0,
                            allow_redirects=True,
                            headers=headers)
            # a missing content-type header counts as "not an image"
            # instead of raising KeyError
            return r.headers.get('content-type', '').startswith('image/')
        except httpx.TimeoutException:
            # only timeouts are retried
            logger.error('Timeout for %s: %i', image_url,
                         int(time() - started))
        except httpx.HTTPError:
            logger.exception('Exception for %s', image_url)
            return False

    # every attempt timed out: explicit False instead of an implicit None
    return False
Example #31
0
def search_request_wrapper(fn, url, engine_name, **kwargs):
    """Call ``fn(url, **kwargs)`` and keep the engine's error bookkeeping.

    On success the engine's ``continuous_errors`` and ``suspend_end_time``
    are reset to 0.  On failure the error counters are incremented, the
    engine is suspended for ``min(60, continuous_errors)`` seconds and the
    exception is logged.

    :returns: the return value of ``fn``, or ``None`` when it raised.
    """
    ret = None
    engine = engines[engine_name]
    try:
        ret = fn(url, **kwargs)
        # NOTE(review): RLock() creates a new lock on every call, so these
        # blocks are not serialized against other threads -- confirm intent.
        with threading.RLock():
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
    except Exception:  # pylint: disable=broad-except
        # Exception instead of a bare except, so KeyboardInterrupt and
        # SystemExit are not counted as engine errors and swallowed
        # increase errors stats
        with threading.RLock():
            engine.stats['errors'] += 1
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60, engine.continuous_errors)

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
    return ret
Example #32
0
def response(resp):
    """Parse the result page in ``resp.text`` into a list of result dicts.

    Each node selected by ``results_xpath`` contributes one dict with
    ``url`` (first match of ``url_xpath``), ``title`` and ``content``
    (joined matches of ``title_xpath`` / ``content_xpath``).  Nodes that fail
    to parse are logged and left out.
    """
    dom = html.fromstring(resp.text)
    results = []

    for result in dom.xpath(results_xpath):
        try:
            res = {
                'url': result.xpath(url_xpath)[0],
                'title': ''.join(result.xpath(title_xpath)),
                'content': ''.join(result.xpath(content_xpath))
            }
        except Exception:  # pylint: disable=broad-except
            # skip the broken node only; a bare except would also swallow
            # KeyboardInterrupt / SystemExit
            logger.exception('yandex parse crash')
            continue

        results.append(res)

    return results
Example #33
0
def search_request_wrapper(fn, url, engine_name, **kwargs):
    """Run ``fn(url, **kwargs)`` for an engine, tracking consecutive errors.

    A successful call clears ``continuous_errors`` and ``suspend_end_time``.
    A failing call is logged, counted in ``stats['errors']`` and
    ``continuous_errors``, and suspends the engine until
    ``time() + min(60, continuous_errors)``.

    :param fn: callable performing the request.
    :param url: first positional argument for ``fn``.
    :param engine_name: key into ``engines``.
    :returns: whatever ``fn`` returns, ``None`` if it raised.
    """
    ret = None
    engine = engines[engine_name]
    try:
        ret = fn(url, **kwargs)
        # NOTE(review): a new RLock per call provides no mutual exclusion
        # across threads -- confirm whether a shared lock was intended.
        with threading.RLock():
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
    except Exception:  # pylint: disable=broad-except
        # KeyboardInterrupt / SystemExit are deliberately not caught here
        # increase errors stats
        with threading.RLock():
            engine.stats['errors'] += 1
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60,
                                                   engine.continuous_errors)

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
    return ret
Example #34
0
def pre_request():
    """Prepare the per-request state stored on ``request``.

    Sets ``start_time``, ``timings``, ``errors``, ``preferences``, a merged
    ``form`` (POST values win over GET values) and ``user_plugins``.  It
    also updates the keyword counter and, when ``research`` is ``'0'`` and
    the form is already known to ``kr``, stores the stored result under
    ``request.form['Result']``.
    """
    request.start_time = time()
    request.timings = []
    request.errors = []

    preferences = Preferences(themes, list(categories.keys()), engines, plugins)
    request.preferences = preferences
    try:
        preferences.parse_dict(request.cookies)
    except Exception:  # pylint: disable=broad-except
        # bad cookie values must not break the request; Exception (not a
        # bare except) lets KeyboardInterrupt / SystemExit propagate
        request.errors.append(gettext('Invalid settings, please edit your preferences'))

    # merge GET, POST vars
    # request.form
    request.form = dict(request.form.items())
    for k, v in request.args.items():
        if k not in request.form:
            request.form[k] = v

    # TODO: caching mechanism
    # increment the cumulative count of this keyword; requests without a
    # query (no 'q' field) are skipped instead of raising KeyError
    query = request.form.get('q')
    if query is not None:
        kr.keyNum(query)
    # check whether this form is already stored; if so attach the stored
    # result so that it can be returned directly
    if request.form.get('research') == '0':
        form_key = json.dumps(request.form)
        if kr.existsForm(form_key):
            request.form['Result'] = kr.getResult(form_key)

    if request.form.get('preferences'):
        preferences.parse_encoded_data(request.form['preferences'])
    else:
        try:
            preferences.parse_dict(request.form)
        except Exception:  # pylint: disable=broad-except
            logger.exception('invalid settings')
            request.errors.append(gettext('Invalid settings'))

    # request.user_plugins: default-on plugins the user did not disable,
    # plus every plugin the user explicitly enabled
    request.user_plugins = []
    allowed_plugins = preferences.plugins.get_enabled()
    disabled_plugins = preferences.plugins.get_disabled()
    request.user_plugins.extend(
        plugin for plugin in plugins
        if (plugin.default_on and plugin.id not in disabled_plugins)
        or plugin.id in allowed_plugins)
Example #35
0
def response(resp):
    """Build up to 10 results from the HTML in ``resp.text``.

    ``url_xpath``, ``title_xpath`` and ``content_xpath`` are evaluated once
    against the whole document and combined by position.  Positions missing
    from any of the three lists produce no result.

    :returns: list of dicts with the keys ``url``, ``title``, ``content``.
    """
    max_results = 10
    dom = html.fromstring(resp.text)
    results = []

    try:
        # evaluate each query once instead of once per result index
        rows = list(zip(dom.xpath(url_xpath)[:max_results],
                        dom.xpath(title_xpath)[:max_results],
                        dom.xpath(content_xpath)[:max_results]))
    except Exception:  # pylint: disable=broad-except
        logger.exception('bidb parse crash')
        return results

    for url, title, content in rows:
        try:
            res = {
                'url': url,
                'title': ''.join(title),
                'content': ''.join(content)
            }
        except Exception:  # pylint: disable=broad-except
            # skip only the entry that cannot be joined into text
            logger.exception('bidb parse crash')
            continue

        results.append(res)

    return results
Example #36
0
def response(resp):
    """Parse a result page, raising when the CAPTCHA page was served.

    :param resp: response object; ``url`` and ``text`` are read.
    :returns: list of dicts with the keys ``url``, ``title``, ``content``.
    :raises SearxEngineCaptchaException: when the response URL path starts
        with ``/showcaptcha``.
    """
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/showcaptcha'):
        raise SearxEngineCaptchaException()

    dom = html.fromstring(resp.text)
    results = []

    for result in dom.xpath(results_xpath):
        try:
            res = {'url': result.xpath(url_xpath)[0],
                   'title': ''.join(result.xpath(title_xpath)),
                   'content': ''.join(result.xpath(content_xpath))}
        except Exception:  # pylint: disable=broad-except
            # a node that fails to parse is skipped; Exception rather than a
            # bare except so KeyboardInterrupt / SystemExit still propagate
            logger.exception('yandex parse crash')
            continue

        results.append(res)

    return results
Example #37
0
def load_engine(engine_data):
    """Load, configure and register one engine module.

    :param engine_data: engine settings mapping.  ``name`` and ``engine`` are
        read directly; every key except ``engine`` is copied onto the module
        as an attribute (``categories`` is split on commas, ``'none'`` means
        no categories).
    :returns: the configured engine module, or ``None`` when the module
        cannot be loaded or its ``inactive`` attribute is ``True``.

    Calls ``sys.exit(1)`` when the name contains an underscore, when a public
    attribute of the engine is still ``None`` or when the shortcut is already
    taken.
    """
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except Exception:  # pylint: disable=broad-except
        # a broken engine is skipped; Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit through
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    # copy the settings onto the module
    for param_name, param_value in engine_data.items():
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if param_value == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, param_value.split(',')))
            continue
        setattr(engine, param_name, param_value)

    # fill in defaults for everything the engine did not define
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        attr_value = getattr(engine, engine_attr)
        if engine_attr == 'inactive' and attr_value is True:
            return None
        if attr_value is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'
                         .format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in languages:
        engine.supported_languages = languages[engine_data['name']]

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        engine.fetch_supported_languages = \
            lambda: engine._fetch_supported_languages(get(engine.supported_languages_url))

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'page_load_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Example #38
0
def load_engine(engine_data):
    """Load, configure and register one engine module.

    :param engine_data: engine settings mapping.  ``name`` and ``engine`` are
        read directly; ``name`` is rewritten in place when it is not
        lowercase.  Every key except ``engine`` is copied onto the module as
        an attribute (``categories`` is split on commas, ``'none'`` means no
        categories).
    :returns: the configured engine module, or ``None`` when the module
        fails to load with a non-fatal error, is marked ``inactive``, or is
        in the ``onions`` category while no tor proxy is configured.

    Calls ``sys.exit(1)`` on an underscore in the name, on a fatal error
    while loading the module, on a public attribute that is still ``None``
    and on an already used shortcut.
    """
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error(
            'Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning(
            'Engine name is not lowercase: "{}", converting to lowercase'.
            format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError,
            ImportError, RuntimeError):
        logger.exception(
            'Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except Exception:  # pylint: disable=broad-except
        # any other error only disables this engine
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    # copy the settings onto the module
    for param_name, param_value in engine_data.items():
        if param_name == 'engine':
            pass
        elif param_name == 'categories':
            if param_value == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip,
                                             param_value.split(',')))
        else:
            setattr(engine, param_name, param_value)

    # fill in defaults for everything the engine did not define
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        attr_value = getattr(engine, engine_attr)
        if engine_attr == 'inactive' and attr_value is True:
            return None
        if attr_value is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in ENGINES_LANGUAGES:
        engine.supported_languages = ENGINES_LANGUAGES[engine_data['name']]

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        language_aliases = getattr(engine, 'language_aliases', {})
        supported_languages = engine.supported_languages

        for engine_lang in supported_languages:
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            # alias only when the matched code really differs from the
            # engine's code and is not itself supported by the engine
            if (iso_lang and iso_lang != engine_lang
                    and not engine_lang.startswith(iso_lang)
                    and iso_lang not in supported_languages):
                language_aliases[iso_lang] = engine_lang

        engine.language_aliases = language_aliases

    # language_support
    engine.language_support = \
        len(getattr(engine, 'supported_languages', [])) > 0

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        headers = {
            'User-Agent': gen_useragent(),
            'Accept-Language':
            'ja-JP,ja;q=0.8,en-US;q=0.5,en;q=0.3',  # bing needs a non-English language
        }
        engine.fetch_supported_languages = \
            lambda: engine._fetch_supported_languages(
                get(engine.supported_languages_url, headers=headers))

    # tor related settings
    if settings['outgoing'].get('using_tor_proxy'):
        # use onion url if using tor.
        if hasattr(engine, 'onion_url'):
            engine.search_url = engine.onion_url + getattr(
                engine, 'search_path', '')
    elif 'onions' in engine.categories:
        # exclude onion engines if not using tor.
        return None

    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(
            engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Example #39
0
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.

    Without a ``q`` form value the start page is rendered (html) or a
    400 error is returned (other formats).  Otherwise the search is run,
    every result is post-processed for the chosen output format and the
    response is built for that format.  A search failure yields a 400 for
    ``SearxParameterException`` and a 500 for anything else.
    """

    # output_format
    # unknown format values fall back to html
    output_format = request.form.get('format', 'html')
    if output_format not in ['html', 'csv', 'json', 'rss']:
        output_format = 'html'

    # check if there is query
    if request.form.get('q') is None:
        if output_format == 'html':
            return render(
                'index.html',
            )
        else:
            return index_error(output_format, 'No query'), 400

    # search
    search_query = None
    result_container = None
    try:
        search_query = get_search_query_from_webapp(request.preferences, request.form)
        # search = Search(search_query) #  without plugins
        search = SearchWithPlugins(search_query, request.user_plugins, request)
        result_container = search.search()
    except Exception as e:
        # log exception
        logger.exception('search error')

        # is it an invalid input parameter or something else ?
        if (issubclass(e.__class__, SearxParameterException)):
            return index_error(output_format, e.message), 400
        else:
            return index_error(output_format, gettext('search error')), 500

    # results
    results = result_container.get_ordered_results()
    number_of_results = result_container.results_number()
    # a reported total below results_length() is replaced by 0
    if number_of_results < result_container.results_length():
        number_of_results = 0

    # UI
    advanced_search = request.form.get('advanced_search', None)

    # output
    for result in results:
        if output_format == 'html':
            # html: escape the text, then highlight the query in it;
            # content is cut to 1024 characters first
            if 'content' in result and result['content']:
                result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
            result['title'] = highlight_content(escape(result['title'] or u''), search_query.query)
        else:
            # other formats: convert html to plain text
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(html_to_text(result['title']).strip().split())

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
            except ValueError:
                result['publishedDate'] = None
            else:
                # dates from the last day are shown as "... ago", older ones
                # go through format_date; tzinfo is dropped for the comparison
                if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
                    timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)
                    else:
                        result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
                else:
                    result['publishedDate'] = format_date(result['publishedDate'])

    if output_format == 'json':
        # the default= hook turns sets into lists so json.dumps accepts them
        return Response(json.dumps({'query': search_query.query.decode('utf-8'),
                                    'number_of_results': number_of_results,
                                    'results': results,
                                    'answers': list(result_container.answers),
                                    'corrections': list(result_container.corrections),
                                    'infoboxes': result_container.infoboxes,
                                    'suggestions': list(result_container.suggestions),
                                    'unresponsive_engines': list(result_container.unresponsive_engines)},
                                   default=lambda item: list(item) if isinstance(item, set) else item),
                        mimetype='application/json')
    elif output_format == 'csv':
        # header row first, then one row per result; sent as an attachment
        csv = UnicodeWriter(StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        csv.writerow(keys)
        for row in results:
            row['host'] = row['parsed_url'].netloc
            csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif output_format == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            q=request.form['q'],
            number_of_results=number_of_results,
            base_url=get_base_url(),
            override_theme='__common__',
        )
        return Response(response_rss, mimetype='text/xml')

    # html output
    return render(
        'results.html',
        results=results,
        q=request.form['q'],
        selected_categories=search_query.categories,
        pageno=search_query.pageno,
        time_range=search_query.time_range,
        number_of_results=format_decimal(number_of_results),
        advanced_search=advanced_search,
        suggestions=result_container.suggestions,
        answers=result_container.answers,
        corrections=result_container.corrections,
        infoboxes=result_container.infoboxes,
        paging=result_container.paging,
        unresponsive_engines=result_container.unresponsive_engines,
        current_language=match_language(search_query.lang,
                                        LANGUAGE_CODES,
                                        fallback=settings['search']['language']),
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())]
    )
Example #40
0
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.

    Without a ``q`` form value, or when the search raises, the start page
    is rendered (a failure is also added to ``request.errors`` and logged).
    Otherwise every result is post-processed for the chosen output format
    and the response is built for that format.
    """

    if request.form.get('q') is None:
        return render(
            'index.html',
        )

    # search
    search_query = None
    result_container = None
    try:
        search_query = get_search_query_from_webapp(request.preferences, request.form)
        # search = Search(search_query) #  without plugins
        search = SearchWithPlugins(search_query, request)
        result_container = search.search()
    # NOTE(review): the bare except also catches KeyboardInterrupt and
    # SystemExit -- confirm whether ``except Exception`` was intended
    except:
        request.errors.append(gettext('search error'))
        logger.exception('search error')
        return render(
            'index.html',
        )

    results = result_container.get_ordered_results()

    # UI
    advanced_search = request.form.get('advanced_search', None)
    # unknown format values fall back to html
    output_format = request.form.get('format', 'html')
    if output_format not in ['html', 'csv', 'json', 'rss']:
        output_format = 'html'

    # output
    for result in results:
        if output_format == 'html':
            # html: highlight the query; content is cut to 1024 characters
            if 'content' in result and result['content']:
                result['content'] = highlight_content(result['content'][:1024], search_query.query.encode('utf-8'))
            result['title'] = highlight_content(result['title'], search_query.query.encode('utf-8'))
        else:
            # other formats: convert html to plain text
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(html_to_text(result['title']).strip().split())

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
            except ValueError:
                result['publishedDate'] = None
            else:
                # dates from the last day are shown as "... ago", older ones
                # go through format_date; tzinfo is dropped for the comparison
                if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
                    timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)
                    else:
                        result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
                else:
                    result['publishedDate'] = format_date(result['publishedDate'])

    number_of_results = result_container.results_number()
    # a reported total below results_length() is replaced by 0
    if number_of_results < result_container.results_length():
        number_of_results = 0

    if output_format == 'json':
        return Response(json.dumps({'query': search_query.query,
                                    'number_of_results': number_of_results,
                                    'results': results}),
                        mimetype='application/json')
    elif output_format == 'csv':
        # header row first, then one row per result; sent as an attachment
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        csv.writerow(keys)
        for row in results:
            row['host'] = row['parsed_url'].netloc
            csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query.encode('utf-8'))
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif output_format == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            q=request.form['q'],
            number_of_results=number_of_results,
            base_url=get_base_url()
        )
        return Response(response_rss, mimetype='text/xml')

    # html output
    return render(
        'results.html',
        results=results,
        q=request.form['q'],
        selected_categories=search_query.categories,
        pageno=search_query.pageno,
        time_range=search_query.time_range,
        number_of_results=format_decimal(number_of_results),
        advanced_search=advanced_search,
        suggestions=result_container.suggestions,
        answers=result_container.answers,
        infoboxes=result_container.infoboxes,
        paging=result_container.paging,
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())]
    )
Example #41
0
File: webapp.py Project: kvch/searx
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.

    The query and all options are read from ``request.form``.  When no ``q``
    field is present the plain index page is rendered.  When building or
    running the search raises, the error is logged, a translated
    "search error" message is appended to ``request.errors`` and the plain
    index page is rendered as well.

    Every result is post-processed in place before output: for HTML the
    title and (truncated) content are escaped and highlighted, for the other
    formats they are converted to plain text; ``pretty_url`` is added, and
    ``publishedDate`` is replaced by a display string (``pubdate`` keeps the
    strftime form).

    Returns:
        The value of ``render(...)`` for html output, otherwise a
        ``Response`` carrying the json, csv or rss payload.
    """

    if request.form.get("q") is None:
        return render("index.html")

    # search
    search_query = None
    result_container = None
    try:
        search_query = get_search_query_from_webapp(request.preferences, request.form)
        # search = Search(search_query) #  without plugins
        search = SearchWithPlugins(search_query, request)
        result_container = search.search()
    except Exception:
        # Not a bare ``except:``: SystemExit and KeyboardInterrupt must keep
        # propagating instead of being reported to the user as a search error.
        request.errors.append(gettext("search error"))
        logger.exception("search error")
        return render("index.html")

    results = result_container.get_ordered_results()

    # UI
    advanced_search = request.form.get("advanced_search", None)
    output_format = request.form.get("format", "html")
    if output_format not in ["html", "csv", "json", "rss"]:
        # unknown formats silently fall back to html
        output_format = "html"

    # output
    for result in results:
        if output_format == "html":
            if "content" in result and result["content"]:
                # only the first 1024 characters of the content are shown
                result["content"] = highlight_content(
                    escape(result["content"][:1024]), search_query.query.encode("utf-8")
                )
            result["title"] = highlight_content(escape(result["title"] or u""), search_query.query.encode("utf-8"))
        else:
            if result.get("content"):
                result["content"] = html_to_text(result["content"]).strip()
            # removing html content and whitespace duplications
            # (a None title is treated as empty, exactly like the html branch does)
            result["title"] = " ".join(html_to_text(result["title"] or u"").strip().split())

        result["pretty_url"] = prettify_url(result["url"])

        # TODO, check if timezone is calculated right
        if "publishedDate" in result:
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result["pubdate"] = result["publishedDate"].strftime("%Y-%m-%d %H:%M:%S%z")
            except ValueError:
                result["publishedDate"] = None
            else:
                # Sample the clock once so that the "younger than one day"
                # test and the age computed below refer to the same instant.
                now = datetime.now()
                published = result["publishedDate"].replace(tzinfo=None)
                if published >= now - timedelta(days=1):
                    # NOTE(review): a date in the future also lands here and
                    # yields a misleading age, because ``.seconds`` of a
                    # negative timedelta is normalised to a positive value.
                    timedifference = now - published
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result["publishedDate"] = gettext(u"{minutes} minute(s) ago").format(minutes=minutes)
                    else:
                        result["publishedDate"] = gettext(u"{hours} hour(s), {minutes} minute(s) ago").format(
                            hours=hours, minutes=minutes
                        )  # noqa
                else:
                    result["publishedDate"] = format_date(result["publishedDate"])

    # A reported total smaller than the number of results actually held is
    # not shown; 0 is passed on instead.
    number_of_results = result_container.results_number()
    if number_of_results < result_container.results_length():
        number_of_results = 0

    if output_format == "json":
        return Response(
            json.dumps(
                {
                    "query": search_query.query,
                    "number_of_results": number_of_results,
                    "results": results,
                    "answers": list(result_container.answers),
                    "infoboxes": result_container.infoboxes,
                    "suggestions": list(result_container.suggestions),
                }
            ),
            mimetype="application/json",
        )
    elif output_format == "csv":
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ("title", "url", "content", "host", "engine", "score")
        csv.writerow(keys)
        for row in results:
            row["host"] = row["parsed_url"].netloc
            csv.writerow([row.get(key, "") for key in keys])
        # rewind the buffer so the whole document can be read back
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype="application/csv")
        cont_disp = "attachment;Filename=searx_-_{0}.csv".format(search_query.query.encode("utf-8"))
        response.headers.add("Content-Disposition", cont_disp)
        return response
    elif output_format == "rss":
        response_rss = render(
            "opensearch_response_rss.xml",
            results=results,
            q=request.form["q"],
            number_of_results=number_of_results,
            base_url=get_base_url(),
        )
        return Response(response_rss, mimetype="text/xml")

    # html output
    return render(
        "results.html",
        results=results,
        q=request.form["q"],
        selected_categories=search_query.categories,
        pageno=search_query.pageno,
        time_range=search_query.time_range,
        number_of_results=format_decimal(number_of_results),
        advanced_search=advanced_search,
        suggestions=result_container.suggestions,
        answers=result_container.answers,
        infoboxes=result_container.infoboxes,
        paging=result_container.paging,
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())],
    )
Example #42
0
# One lock shared by every thread that runs search_one_request_safe().  A lock
# only excludes other threads when they all acquire the *same* object, so it is
# created once here instead of inside each ``with`` statement.
_ENGINE_STATS_LOCK = threading.RLock()


def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Run one engine request and record its outcome.

    Exceptions raised while searching or while storing the results are caught
    here: they are counted in ``engine.stats['errors']``, reported through
    ``result_container.add_unresponsive_engine`` and logged, but not re-raised.

    Args:
        engine_name: key into ``engines``; also used in log messages and when
            reporting results or failures to ``result_container``.
        query: forwarded unchanged to ``search_one_request``.
        request_params: forwarded unchanged to ``search_one_request``.
        result_container: receives the results (``extend``) or the failure
            (``add_unresponsive_engine``).
        start_time: reference value compared against ``time()`` to compute
            the engine time and the search duration.
        timeout_limit: handed to ``requests_lib.set_timeout_for_thread`` and
            echoed in the log messages.
    """
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False

    try:
        # send requests and parse the results
        search_results = search_one_request(engine, query, request_params)

        # add results
        result_container.extend(engine_name, search_results)

        # update engine time when there is no exception
        with _ENGINE_STATS_LOCK:
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1
            # update stats with the total HTTP time
            engine.stats['page_load_time'] += requests_lib.get_time_for_thread()
            engine.stats['page_load_count'] += 1

    except Exception as e:
        search_duration = time() - start_time

        with _ENGINE_STATS_LOCK:
            engine.stats['errors'] += 1

        # Timeout must be tested before RequestException: the generic
        # branch below would otherwise handle timeouts as well.
        if isinstance(e, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine((engine_name, gettext('timeout')))
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout"
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, search_duration, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif isinstance(e, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine((engine_name, gettext('request exception')))
            # other requests exception
            logger.exception("engine {0} : requests exception"
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, search_duration, timeout_limit, e))
            requests_exception = True
        else:
            result_container.add_unresponsive_engine((
                engine_name,
                u'{0}: {1}'.format(gettext('unexpected crash'), e),
            ))
            # others errors
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))

    # suspend or not the engine if there are HTTP errors
    with _ENGINE_STATS_LOCK:
        if requests_exception:
            # update continuous_errors / suspend_end_time:
            # the offset added to time() grows with every consecutive HTTP
            # failure and is capped at 60
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60, engine.continuous_errors)
        else:
            # no HTTP error (perhaps an engine error)
            # anyway, reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0
Example #43
0
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.

    The query and all options are read from ``request.form``; an unknown
    ``format`` falls back to html.  Without a ``q`` field the plain index
    page is rendered for html, while the other formats get
    ``index_error(output_format, 'No query')`` paired with 400.  When the
    search raises, the error is logged and ``index_error`` is returned paired
    with 400 for a ``SearxParameterException`` and with 500 for anything else.

    Every result is post-processed in place before output: for html the
    title and (truncated) content are escaped and highlighted, for the other
    formats they are converted to plain text; ``pretty_url`` is added when a
    ``url`` is present, and ``publishedDate`` is replaced by a display string
    (``pubdate`` keeps the strftime form).

    Returns:
        The value of ``render(...)`` for html output, a ``Response`` for
        json, csv and rss output, or an ``(index_error(...), status)`` tuple
        on failure.
    """

    # output_format
    output_format = request.form.get('format', 'html')
    if output_format not in ['html', 'csv', 'json', 'rss']:
        output_format = 'html'

    # check if there is query
    if request.form.get('q') is None:
        if output_format == 'html':
            return render(
                'index.html',
            )
        return index_error(output_format, 'No query'), 400

    # search
    search_query = None
    raw_text_query = None
    result_container = None
    try:
        search_query, raw_text_query = get_search_query_from_webapp(request.preferences, request.form)
        # search = Search(search_query) #  without plugins
        search = SearchWithPlugins(search_query, request.user_plugins, request)
        result_container = search.search()
    except Exception as e:
        # log exception
        logger.exception('search error')

        # is it an invalid input parameter or something else ?
        if isinstance(e, SearxParameterException):
            return index_error(output_format, e.message), 400
        return index_error(output_format, gettext('search error')), 500

    # results
    results = result_container.get_ordered_results()
    # A reported total smaller than the number of results actually held is
    # not shown; 0 is passed on instead.
    number_of_results = result_container.results_number()
    if number_of_results < result_container.results_length():
        number_of_results = 0

    # UI
    advanced_search = request.form.get('advanced_search', None)

    # Server-Timing header
    request.timings = result_container.get_timings()

    # output
    for result in results:
        if output_format == 'html':
            if result.get('content'):
                # only the first 1024 characters of the content are shown
                result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
            if result.get('title'):
                result['title'] = highlight_content(escape(result['title']), search_query.query)
        else:
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            # (skipped for a missing or empty title, which the html branch
            # tolerates as well)
            if result.get('title'):
                result['title'] = ' '.join(html_to_text(result['title']).strip().split())

        if 'url' in result:
            result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
            except ValueError:
                result['publishedDate'] = None
            else:
                # Sample the clock once so that the "younger than one day"
                # test and the age computed below refer to the same instant.
                now = datetime.now()
                published = result['publishedDate'].replace(tzinfo=None)
                if published >= now - timedelta(days=1):
                    # NOTE(review): a date in the future also lands here and
                    # yields a misleading age, because ``.seconds`` of a
                    # negative timedelta is normalised to a positive value.
                    timedifference = now - published
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)
                    else:
                        result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
                else:
                    result['publishedDate'] = format_date(result['publishedDate'])

    if output_format == 'json':
        return Response(json.dumps({'query': search_query.query.decode('utf-8'),
                                    'number_of_results': number_of_results,
                                    'results': results,
                                    'answers': list(result_container.answers),
                                    'corrections': list(result_container.corrections),
                                    'infoboxes': result_container.infoboxes,
                                    'suggestions': list(result_container.suggestions),
                                    'unresponsive_engines': __get_translated_errors(result_container.unresponsive_engines)},  # noqa
                                   # sets nested inside the payload are serialised as lists
                                   default=lambda item: list(item) if isinstance(item, set) else item),
                        mimetype='application/json')
    elif output_format == 'csv':
        csv = UnicodeWriter(StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
        csv.writerow(keys)
        for row in results:
            row['host'] = row['parsed_url'].netloc
            row['type'] = 'result'
            csv.writerow([row.get(key, '') for key in keys])
        # answers, suggestions and corrections become rows that only carry a
        # title and a type; every other column is left empty
        extra_rows = (
            ('answer', result_container.answers),
            ('suggestion', result_container.suggestions),
            ('correction', result_container.corrections),
        )
        for row_type, titles in extra_rows:
            for title in titles:
                row = {'title': title, 'type': row_type}
                csv.writerow([row.get(key, '') for key in keys])
        # rewind the buffer so the whole document can be read back
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query.decode('utf-8'))
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif output_format == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            answers=result_container.answers,
            corrections=result_container.corrections,
            suggestions=result_container.suggestions,
            q=request.form['q'],
            number_of_results=number_of_results,
            base_url=get_base_url(),
            override_theme='__common__',
        )
        return Response(response_rss, mimetype='text/xml')

    # HTML output format

    # suggestions: use RawTextQuery to get the suggestion URLs with the same bang
    suggestion_urls = [
        {
            'url': raw_text_query.changeSearchQuery(suggestion).getFullQuery(),
            'title': suggestion,
        }
        for suggestion in result_container.suggestions
    ]

    # corrections get their URLs the same way
    correction_urls = [
        {
            'url': raw_text_query.changeSearchQuery(correction).getFullQuery(),
            'title': correction,
        }
        for correction in result_container.corrections
    ]

    return render(
        'results.html',
        results=results,
        q=request.form['q'],
        selected_categories=search_query.categories,
        pageno=search_query.pageno,
        time_range=search_query.time_range,
        number_of_results=format_decimal(number_of_results),
        advanced_search=advanced_search,
        suggestions=suggestion_urls,
        answers=result_container.answers,
        corrections=correction_urls,
        infoboxes=result_container.infoboxes,
        paging=result_container.paging,
        unresponsive_engines=__get_translated_errors(result_container.unresponsive_engines),
        current_language=match_language(search_query.lang,
                                        LANGUAGE_CODES,
                                        fallback=request.preferences.get_value("language")),
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())],
        timeout_limit=request.form.get('timeout_limit', None)
    )
Example #44
0
# One lock shared by every thread that runs search_one_request_safe().  A lock
# only excludes other threads when they all acquire the *same* object, so it is
# created once here instead of inside each ``with`` statement.
_ENGINE_STATS_LOCK = threading.RLock()


def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
    """Run one engine request and record its outcome.

    Exceptions raised while searching or while storing the results are caught
    here: they are counted in ``engine.stats['errors']``, reported through
    ``result_container.add_unresponsive_engine`` and logged, but not re-raised.

    Args:
        engine_name: key into ``engines``; also used in log messages and when
            reporting results or failures to ``result_container``.
        query: forwarded unchanged to ``search_one_request``.
        request_params: forwarded unchanged to ``search_one_request``.
        result_container: receives the results (``extend``) or the failure
            (``add_unresponsive_engine``).
        start_time: reference value compared against ``time()`` to compute
            the engine time and the search duration.
        timeout_limit: handed to ``requests_lib.set_timeout_for_thread`` and
            echoed in the log messages.
    """
    # set timeout for all HTTP requests
    requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
    # reset the HTTP total time
    requests_lib.reset_time_for_thread()

    engine = engines[engine_name]

    # suppose everything will be alright
    requests_exception = False

    try:
        # send requests and parse the results
        search_results = search_one_request(engine, query, request_params)

        # add results
        result_container.extend(engine_name, search_results)

        # update engine time when there is no exception
        with _ENGINE_STATS_LOCK:
            engine.stats['engine_time'] += time() - start_time
            engine.stats['engine_time_count'] += 1
            # update stats with the total HTTP time
            engine.stats['page_load_time'] += requests_lib.get_time_for_thread()
            engine.stats['page_load_count'] += 1

    except Exception as e:
        search_duration = time() - start_time

        with _ENGINE_STATS_LOCK:
            engine.stats['errors'] += 1

        # Timeout must be tested before RequestException: the generic
        # branch below would otherwise handle timeouts as well.
        if isinstance(e, requests.exceptions.Timeout):
            result_container.add_unresponsive_engine((engine_name, gettext('timeout')))
            # requests timeout (connect or read)
            logger.error("engine {0} : HTTP requests timeout"
                         "(search duration : {1} s, timeout: {2} s) : {3}"
                         .format(engine_name, search_duration, timeout_limit, e.__class__.__name__))
            requests_exception = True
        elif isinstance(e, requests.exceptions.RequestException):
            result_container.add_unresponsive_engine((engine_name, gettext('request exception')))
            # other requests exception
            logger.exception("engine {0} : requests exception"
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(engine_name, search_duration, timeout_limit, e))
            requests_exception = True
        else:
            result_container.add_unresponsive_engine((
                engine_name,
                u'{0}: {1}'.format(gettext('unexpected crash'), e),
            ))
            # others errors
            logger.exception('engine {0} : exception : {1}'.format(engine_name, e))

    # suspend or not the engine if there are HTTP errors
    with _ENGINE_STATS_LOCK:
        if requests_exception:
            # update continuous_errors / suspend_end_time:
            # the offset added to time() grows with every consecutive HTTP
            # failure and is capped at 60
            engine.continuous_errors += 1
            engine.suspend_end_time = time() + min(60, engine.continuous_errors)
        else:
            # no HTTP error (perhaps an engine error)
            # anyway, reset the suspend variables
            engine.continuous_errors = 0
            engine.suspend_end_time = 0