Example 1
def search(query, request, selected_engines):
    global engines, categories, number_of_searches
    requests = []
    results = {}
    suggestions = set()
    number_of_searches += 1
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    for selected_engine in selected_engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = datetime.now()
        request_params = engine.request(query, request_params)

        callback = make_callback(
            selected_engine['name'],
            results,
            suggestions,
            engine.response,
            request_params
        )

        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=settings['server']['request_timeout']
        )

        if request_params['method'] == 'GET':
            req = grequests.get
        else:
            req = grequests.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        requests.append(req(request_params['url'], **request_args))
    grequests.map(requests)
    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)

    results = score_results(results)

    for result in results:
        for res_engine in result['engines']:
            engines[res_engine].stats['score_count'] += result['score']

    return results, suggestions
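The make_callback helper is not shown in these excerpts. Judging from this call site, it wraps the engine's response parser in a requests response hook that fills the shared results dict and suggestions set. A minimal sketch under those assumptions (the body below is illustrative, not the project's actual implementation):

def make_callback(engine_name, results, suggestions, response_parser, request_params):
    # the returned hook is invoked by (g)requests once the response arrives
    def process_callback(response, **kwargs):
        response.search_params = request_params
        # let the engine module turn the raw response into result dicts
        for result in response_parser(response):
            if 'suggestion' in result:
                suggestions.add(result['suggestion'])
            else:
                results.setdefault(engine_name, []).append(result)
        return response
    return process_callback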
Example 2
def image_proxy():
    url = request.args.get('url')

    if not url:
        return '', 400

    url = url.encode('utf-8')

    h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest()

    if h != request.args.get('h'):
        return '', 400

    headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = requests.get(url,
                        stream=True,
                        timeout=settings['outgoing']['request_timeout'],
                        headers=headers,
                        proxies=outgoing_proxies)

    if resp.status_code == 304:
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
        return '', 400

    img = b''  # iter_content() yields bytes
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
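All of the image_proxy variants rely on dict_subset to forward a whitelist of headers. The helper is presumably nothing more than a filtered copy; a minimal sketch:

def dict_subset(dictionary, properties):
    # keep only the whitelisted keys that actually exist in the source mapping
    return {key: dictionary[key] for key in properties if key in dictionary}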
Example 3
def image_proxy():
    url = request.args.get('url')

    if not url:
        return '', 400

    url = url.encode('utf-8')

    h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest()

    if h != request.args.get('h'):
        return '', 400

    headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = http_get(url,
                    stream=True,
                    timeout=settings['server'].get('request_timeout', 2),
                    headers=headers)

    if resp.status_code == 304:
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
        return '', 400

    img = b''  # iter_content() yields bytes
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
Example 4
def image_proxy():
    url = request.args.get("url").encode("utf-8")

    if not url:
        return "", 400

    h = hashlib.sha256(url + settings["server"]["secret_key"].encode("utf-8")).hexdigest()

    if h != request.args.get("h"):
        return "", 400

    headers = dict_subset(request.headers, {"If-Modified-Since", "If-None-Match"})
    headers["User-Agent"] = gen_useragent()

    resp = requests.get(
        url, stream=True, timeout=settings["outgoing"]["request_timeout"], headers=headers, proxies=outgoing_proxies
    )

    if resp.status_code == 304:
        return "", resp.status_code

    if resp.status_code != 200:
        logger.debug("image-proxy: wrong response code: {0}".format(resp.status_code))
        if resp.status_code >= 400:
            return "", resp.status_code
        return "", 400

    if not resp.headers.get("content-type", "").startswith("image/"):
        logger.debug("image-proxy: wrong content-type: {0}".format(resp.headers.get("content-type")))
        return "", 400

    img = ""
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return "", 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {"Content-Length", "Length", "Date", "Last-Modified", "Expires", "Etag"})

    return Response(img, mimetype=resp.headers["content-type"], headers=headers)
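For the h check above to ever succeed, the server must sign each image URL with the same secret when it renders the results page. A hypothetical image_proxify helper sketching that signing step (the helper name and route are assumptions; only the sha256 construction is taken from the example):

import hashlib
from urllib.parse import urlencode

def image_proxify(url, secret_key):
    # sign the target URL with the shared secret; image_proxy recomputes
    # and compares this digest before fetching anything
    h = hashlib.sha256(url.encode("utf-8") + secret_key.encode("utf-8")).hexdigest()
    return "/image_proxy?" + urlencode(dict(url=url, h=h))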
Example 5
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        lang = 'EN'
    else:
        lang = match_language(params['language'], supported_languages,
                              language_aliases)

    query = u'language:{} {}'.format(
        lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')

    search_path = search_string.format(query=urlencode({'q': query}),
                                       offset=offset)

    params['url'] = base_url + search_path

    params['headers']['User-Agent'] = gen_useragent('Windows NT 6.3; WOW64')

    return params
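Here gen_useragent receives an OS string so the engine can impersonate a specific platform. The generator itself presumably just fills a browser user-agent template; a minimal sketch with illustrative data (template and version numbers are assumptions, chosen to be consistent with the 'Mozilla' prefix asserted by the test in Example 7):

from random import choice

def gen_useragent(os_string=None):
    # illustrative values; real data would typically be loaded from a file
    os_strings = ('Windows NT 10.0; Win64; x64', 'X11; Linux x86_64')
    versions = ('109.0', '115.0')
    template = 'Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}'
    return template.format(os=os_string or choice(os_strings), version=choice(versions))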
Example 6
    def search(self):
        global number_of_searches

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # start a search request for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # skip suspended engines
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s', selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if search_query.lang != 'all' and not engine.language_support:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = search_query.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(search_query.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                engine.response,
                request_params,
                self.result_container)

            # create a dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return self.result_container
        # send all search-request
        threaded_requests(requests)
        start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
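threaded_requests consumes the (req, url, request_args, engine_name) tuples built above. A minimal sketch of such a dispatcher, assuming one worker thread per engine and relying on the registered response hooks to deliver results (a real version would also enforce a shared timeout and catch per-engine exceptions):

import threading

def threaded_requests(requests):
    threads = []
    for fn, url, request_args, engine_name in requests:
        # fn is requests_lib.get or requests_lib.post; the response hook
        # registered in request_args processes whatever comes back
        th = threading.Thread(target=fn, args=(url,), kwargs=request_args)
        th.start()
        threads.append(th)
    for th in threads:
        th.join()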
Example 7
 def test_gen_useragent(self):
     self.assertIsInstance(utils.gen_useragent(), str)
     self.assertIsNotNone(utils.gen_useragent())
     self.assertTrue(utils.gen_useragent().startswith('Mozilla'))
Example 8
    def search(self, request):
        global number_of_searches

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start a search request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = request.preferences.get_value(
                'safesearch')

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(selected_engine['name'], engine.response,
                                     request_params, self.result_container)

            # create a dictionary which contains all
            # information about the request
            request_args = dict(headers=request_params['headers'],
                                hooks=dict(response=callback),
                                cookies=request_params['cookies'],
                                timeout=engine.timeout,
                                verify=request_params['verify'])

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'], request_args,
                             selected_engine['name']))

        if not requests:
            return self
        # send all search-request
        threaded_requests(requests)

        # return results, suggestions, answers and infoboxes
        return self
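default_request_params() seeds every key the loop above reads or overwrites. A sketch of the skeleton these examples appear to assume (the exact default values are assumptions):

def default_request_params():
    return {
        'method': 'GET',   # overridden by engines that need POST
        'headers': {},     # filled with the generated User-Agent
        'data': {},        # POST payload, if any
        'url': '',         # set by the engine's request() function
        'cookies': {},
        'verify': True,    # TLS certificate verification
    }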
Example 9
    def search(self, task):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        results = {}

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start a search request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = settings['search']['safe_search']

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(task['query'].encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                engine.response,
                request_params)

            # create a dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return self
        # send all search-request
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
            # move suggestions, answers and infoboxes out of the raw results
            for x in list(engine_results):
                if 'suggestion' in x:
                    self.suggestions.append(x['suggestion'])
                    engine_results.remove(x)
                elif 'answer' in x:
                    self.answers.append(x['answer'])
                    engine_results.remove(x)
                elif 'infobox' in x:
                    self.infoboxes.append(x)
                    engine_results.remove(x)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplicates
        self.results = score_results(results)

        # merge infoboxes according to their ids
        self.infoboxes = merge_infoboxes(self.infoboxes)

        # update engine stats, using calculated score
        for result in self.results:
            plugins.callAPI('on_result', self.plugins, locals())

            for res_engine in result['engines']:
                engines[res_engine].stats['score_count'] += result['score']

            result['pretty_url'] = prettify_url(result['url'])

            # TODO, check if timezone is calculated right
            if 'publishedDate' in result:
                result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')

            if not self.paging and engines[result['engine']].paging:
                self.paging = True

            if 'content' in result:
                result['content_html'] = highlight_content(result['content'],
                                                           self.query.encode('utf-8'))  # noqa
            result['title_html'] = highlight_content(result['title'],
                                                     self.query.encode('utf-8'))

            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # remove html content and collapse duplicated whitespace
            result['title'] = ' '.join(html_to_text(result['title']).strip().split())

        # return results, suggestions, answers and infoboxes
        return self
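merge_infoboxes collapses infoboxes that several engines return for the same entity, matching them by their 'id' field. A minimal sketch that only deduplicates; a real implementation would also merge the attributes and URLs of the duplicates:

def merge_infoboxes(infoboxes):
    merged = []
    seen_ids = set()
    for infobox in infoboxes:
        infobox_id = infobox.get('id')
        if infobox_id is not None:
            if infobox_id in seen_ids:
                continue  # duplicate entity; a full merge would combine fields
            seen_ids.add(infobox_id)
        merged.append(infobox)
    return merged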
Example 10
    def search(self, request):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        results = {}
        suggestions = set()
        answers = set()
        infoboxes = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start a search request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno
            request_params['language'] = self.lang

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                engine.response,
                request_params)

            # create a dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'], request_args, selected_engine['name']))

        if not requests:
            return results, suggestions, answers, infoboxes
        # send all search-request
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
            # move suggestions, answers and infoboxes out of the raw results
            for x in list(engine_results):
                if 'suggestion' in x:
                    suggestions.add(x['suggestion'])
                    engine_results.remove(x)
                elif 'answer' in x:
                    answers.add(x['answer'])
                    engine_results.remove(x)
                elif 'infobox' in x:
                    infoboxes.append(x)
                    engine_results.remove(x)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplicates
        results = score_results(results)

        # merge infoboxes according to their ids
        infoboxes = merge_infoboxes(infoboxes)

        # update engine stats, using calculated score
        for result in results:
            for res_engine in result['engines']:
                engines[res_engine].stats['score_count'] += result['score']

        # return results, suggestions, answers and infoboxes
        return results, suggestions, answers, infoboxes
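score_results turns the per-engine result lists into the single scored list consumed above: it deduplicates by URL, records every engine that returned a result in result['engines'], and accumulates a score. A minimal sketch under those assumptions (real scoring would also normalize URLs, merge content fields and apply per-engine weights):

def score_results(results):
    merged = {}
    for engine_name, engine_results in results.items():
        for position, result in enumerate(engine_results, start=1):
            url = result['url']
            score = 1.0 / position  # earlier positions score higher
            if url in merged:
                merged[url]['score'] += score
                merged[url]['engines'].add(engine_name)
            else:
                result['engine'] = engine_name     # first engine that returned it
                result['engines'] = {engine_name}  # every engine that returned it
                result['score'] = score
                merged[url] = result
    return sorted(merged.values(), key=lambda r: r['score'], reverse=True)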
Example 11
def image_proxy():
    url = request.args.get('url')

    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())

    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024

    try:
        headers = dict_subset(request.headers,
                              {'If-Modified-Since', 'If-None-Match'})
        headers['User-Agent'] = gen_useragent()
        stream = http_stream(method='GET',
                             url=url,
                             headers=headers,
                             timeout=settings['outgoing']['request_timeout'],
                             follow_redirects=True,
                             max_redirects=20)

        resp = next(stream)
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit() \
           and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code == 304:
            return '', resp.status_code

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(
                resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('content-type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: {0}'.format(
                resp.headers.get('content-type')))
            return '', 400

        headers = dict_subset(resp.headers, {
            'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires',
            'Etag'
        })

        total_length = 0

        def forward_chunk():
            nonlocal total_length
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(),
                        mimetype=resp.headers['Content-Type'],
                        headers=headers)
    except httpx.HTTPError:
        return '', 400
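Unlike the plain sha256(url + secret) of the earlier examples, this variant signs the URL with new_hmac, i.e. a keyed HMAC; that is the safer construction, since a keyed digest cannot be forged via hash length-extension. A minimal sketch consistent with the call site (secret first, URL bytes second):

import hashlib
import hmac

def new_hmac(secret_key, url_bytes):
    # keyed digest: unforgeable without the secret
    return hmac.new(secret_key.encode(), url_bytes, hashlib.sha256).hexdigest()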
Example 12
    def search(self, request):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        results = {}

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start a search request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            if hasattr(engine, 'language'):
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            try:
                # 0 = None, 1 = Moderate, 2 = Strict
                request_params['safesearch'] = int(
                    request.cookies.get('safesearch', 1))
            except ValueError:
                request_params['safesearch'] = 1

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(selected_engine['name'], results_queue,
                                     engine.response, request_params)

            # create a dictionary which contains all
            # information about the request
            request_args = dict(headers=request_params['headers'],
                                hooks=dict(response=callback),
                                cookies=request_params['cookies'],
                                timeout=engine.timeout,
                                verify=request_params['verify'])

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'], request_args,
                             selected_engine['name']))

        if not requests:
            return self
        # send all search-request
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
            # move suggestions, answers and infoboxes out of the raw results
            for x in list(engine_results):
                if 'suggestion' in x:
                    self.suggestions.add(x['suggestion'])
                    engine_results.remove(x)
                elif 'answer' in x:
                    self.answers.add(x['answer'])
                    engine_results.remove(x)
                elif 'infobox' in x:
                    self.infoboxes.append(x)
                    engine_results.remove(x)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplicates
        self.results = score_results(results)

        # merge infoboxes according to their ids
        self.infoboxes = merge_infoboxes(self.infoboxes)

        # update engine stats, using calculated score
        for result in self.results:
            for res_engine in result['engines']:
                engines[res_engine].stats['score_count'] += result['score']

        # return results, suggestions, answers and infoboxes
        return self
Example 13
def image_proxy():
    # pylint: disable=too-many-return-statements, too-many-branches

    url = request.args.get('url')

    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())

    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024
    forward_resp = False
    resp = None
    try:
        request_headers = {
            'User-Agent': gen_useragent(),
            'Accept': 'image/webp,*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Sec-GPC': '1',
            'DNT': '1',
        }
        set_context_network_name('image_proxy')
        stream = http_stream(method='GET',
                             url=url,
                             headers=request_headers,
                             timeout=settings['outgoing']['request_timeout'],
                             follow_redirects=True,
                             max_redirects=20)

        resp = next(stream)
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit() \
           and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(
                resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('Content-Type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: %s',
                         resp.headers.get('Content-Type', ''))
            return '', 400

        forward_resp = True
    except httpx.HTTPError:
        logger.exception('HTTP error')
        return '', 400
    finally:
        if resp and not forward_resp:
            # the code is about to return an HTTP 400 error to the browser
            # we make sure to close the response between searxng and the HTTP server
            try:
                resp.close()
            except httpx.HTTPError:
                logger.exception('HTTP error on closing')

    try:
        headers = dict_subset(
            resp.headers,
            {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'})

        def forward_chunk():
            total_length = 0
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(),
                        mimetype=resp.headers['Content-Type'],
                        headers=headers)
    except httpx.HTTPError:
        return '', 400
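http_stream is consumed in two steps here: next(stream) yields the response object with status and headers but no body read, and further iteration yields the body chunks. A sketch of a generator with that shape built on httpx (the internals are an assumption; a recent httpx with follow_redirects is presumed):

import httpx

def http_stream(method, url, headers=None, timeout=None,
                follow_redirects=True, max_redirects=20):
    client = httpx.Client(follow_redirects=follow_redirects,
                          max_redirects=max_redirects)
    try:
        with client.stream(method, url, headers=headers, timeout=timeout) as resp:
            # first item: the response itself, so the caller can inspect
            # status code and headers before any body bytes are pulled
            yield resp
            # subsequent items: raw body chunks
            yield from resp.iter_raw(65536)
    finally:
        client.close()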
Example 14
def load_engine(engine_data):
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error(
            'Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warning('Engine name is not lowercase: "{}",'
                       ' converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError,
            ImportError, RuntimeError):
        logger.exception(
            'Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except Exception:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name, param_value in engine_data.items():
        if param_name == 'engine':
            pass
        elif param_name == 'categories':
            if param_value == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip,
                                             param_value.split(',')))
        else:
            setattr(engine, param_name, param_value)

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in ENGINES_LANGUAGES:
        setattr(engine, 'supported_languages',
                ENGINES_LANGUAGES[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # language_support
    setattr(engine, 'language_support',
            len(getattr(engine, 'supported_languages', [])) > 0)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        headers = {
            'User-Agent': gen_useragent(),
            # bing needs a non-English Accept-Language
            'Accept-Language': 'ja-JP,ja;q=0.8,en-US;q=0.5,en;q=0.3',
        }
        setattr(
            engine, 'fetch_supported_languages',
            lambda: engine._fetch_supported_languages(
                get(engine.supported_languages_url, headers=headers)))

    # tor related settings
    if settings['outgoing'].get('using_tor_proxy'):
        # use onion url if using tor.
        if hasattr(engine, 'onion_url'):
            engine.search_url = engine.onion_url + getattr(
                engine, 'search_path', '')
    elif 'onions' in engine.categories:
        # exclude onion engines if not using tor.
        return None

    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(
            engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
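load_module imports an engine source file by path at runtime. A minimal sketch using importlib (the project's helper may add caching or extra error handling):

import importlib.util
from os.path import join, splitext

def load_module(filename, module_dir):
    modname = splitext(filename)[0]
    spec = importlib.util.spec_from_file_location(modname, join(module_dir, filename))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # runs the engine file's top-level code
    return module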
Example 15
    def search(self):
        global number_of_searches

        # start time
        start_time = time()

        # ask the answerers first
        answerers_results = ask(self.search_query)

        if answerers_results:
            for results in answerers_results:
                self.result_container.extend('answer', results)
            return self.result_container

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # max of all selected engine timeout
        timeout_limit = 0

        # start a search request for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # skip suspended engines
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s', selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['pageno'] = search_query.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

            # append request to list
            requests.append((selected_engine['name'], search_query.query, request_params))

            # update timeout_limit
            timeout_limit = max(timeout_limit, engine.timeout)

        if requests:
            # send all search-request
            search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
            start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
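search_multiple_requests replaces the per-request response hooks of the earlier variants: each engine runs in its own thread and writes into the shared result_container, and the caller waits no longer than timeout_limit measured from start_time. A minimal sketch; search_one_request below is a hypothetical worker standing in for the code that calls engine.request/engine.response and fills the container:

import threading
from time import time

def search_one_request(engine_name, query, request_params, result_container):
    # hypothetical worker: would perform the HTTP request for this engine,
    # parse the response and push results into result_container
    ...

def search_multiple_requests(requests, result_container, start_time, timeout_limit):
    threads = []
    for engine_name, query, request_params in requests:
        th = threading.Thread(  # one worker thread per selected engine
            target=search_one_request,
            args=(engine_name, query, request_params, result_container))
        th.start()
        threads.append(th)
    for th in threads:
        # all threads share one deadline instead of timeout_limit each
        remaining = timeout_limit - (time() - start_time)
        th.join(max(0.0, remaining))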
Example 16
    def search(self):
        global number_of_searches

        # start time
        start_time = time()

        # ask the answerers first
        answerers_results = ask(self.search_query)

        if answerers_results:
            for results in answerers_results:
                self.result_container.extend('answer', results)
            return self.result_container

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # max of all selected engine timeout
        timeout_limit = 0

        # start a search request for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # skip suspended engines
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s', selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['pageno'] = search_query.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

            # append request to list
            requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))

            # update timeout_limit
            timeout_limit = max(timeout_limit, engine.timeout)

        if requests:
            # send all search-request
            search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
            start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
Example 17
    def search(self):
        global number_of_searches

        # Check if there is an external bang; if so, the search can stop here because the request will be redirected.
        if self.search_query.external_bang:
            self.result_container.redirect_url = get_bang_url(
                self.search_query)

            # This means there was a valid bang and the
            # rest of the search does not need to be continued
            if isinstance(self.result_container.redirect_url, str):
                return self.result_container
        # start time
        start_time = time()

        # ask the answerers first
        answerers_results = ask(self.search_query)

        if answerers_results:
            for results in answerers_results:
                self.result_container.extend('answer', results)
            return self.result_container

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # max of all selected engine timeout
        default_timeout = 0

        # start a search request for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            if not search_query.preferences.validate_token(engine):
                continue

            # skip suspended engines
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s',
                             selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = {}
            if not engine.offline:
                request_params = default_request_params()
                request_params['headers']['User-Agent'] = user_agent

                if hasattr(engine, 'language') and engine.language:
                    request_params['language'] = engine.language
                else:
                    request_params['language'] = search_query.lang

                request_params['safesearch'] = search_query.safesearch
                request_params['time_range'] = search_query.time_range

            request_params['category'] = selected_engine['category']
            request_params['pageno'] = search_query.pageno

            # append request to list
            requests.append(
                (selected_engine['name'], search_query.query, request_params))

            # update default_timeout
            default_timeout = max(default_timeout, engine.timeout)

        # adjust timeout
        self.actual_timeout = default_timeout
        query_timeout = self.search_query.timeout_limit

        if max_request_timeout is None and query_timeout is None:
            # No max, no user query: default_timeout
            pass
        elif max_request_timeout is None and query_timeout is not None:
            # No max, but user query: From user query except if above default
            self.actual_timeout = min(default_timeout, query_timeout)
        elif max_request_timeout is not None and query_timeout is None:
            # Max, no user query: Default except if above max
            self.actual_timeout = min(default_timeout, max_request_timeout)
        elif max_request_timeout is not None and query_timeout is not None:
            # Max & user query: From user query except if above max
            self.actual_timeout = min(query_timeout, max_request_timeout)

        logger.debug(
            "actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})"
            .format(self.actual_timeout, default_timeout, query_timeout,
                    max_request_timeout))

        # send all search-request
        if requests:
            search_multiple_requests(requests, self.result_container,
                                     start_time, self.actual_timeout)
            start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
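The four-branch timeout cascade above reduces to two cases, depending on whether the user supplied a per-query timeout. An equivalent, more compact formulation for reference:

def adjust_timeout(default_timeout, query_timeout, max_request_timeout):
    # a user-supplied timeout wins when present, but is capped by
    # max_request_timeout; otherwise the engines' default applies, capped too
    if query_timeout is None:
        if max_request_timeout is None:
            return default_timeout
        return min(default_timeout, max_request_timeout)
    if max_request_timeout is None:
        return min(query_timeout, default_timeout)
    return min(query_timeout, max_request_timeout)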
Example 18
    def search(self, request):
        global number_of_searches
        requests = []
        results = {}
        suggestions = set()
        number_of_searches += 1
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            if self.pageno > 1 and not engine.paging:
                continue

            if self.lang != 'all' and not engine.language_support:
                continue

            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = datetime.now()
            request_params['pageno'] = self.pageno
            request_params['language'] = self.lang
            request_params = engine.request(self.query.encode('utf-8'),
                                            request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            callback = make_callback(
                selected_engine['name'],
                results,
                suggestions,
                engine.response,
                request_params
            )

            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout
            )

            if request_params['method'] == 'GET':
                req = grequests.get
            else:
                req = grequests.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            requests.append(req(request_params['url'], **request_args))
        grequests.map(requests)
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        results = score_results(results)

        for result in results:
            for res_engine in result['engines']:
                engines[res_engine].stats['score_count'] += result['score']

        return results, suggestions