Ejemplo n.º 1
0
def get_time(claims, propertyName, locale, defaultValue=None):
    propValue = claims.get(propertyName, {})
    if len(propValue) == 0:
        return defaultValue

    result = []
    for e in propValue:
        mainsnak = e.get('mainsnak', {})

        datavalue = mainsnak.get('datavalue', {})
        if datavalue is not None:
            value = datavalue.get('value', '')
            result.append(value.get('time', ''))

    if len(result) == 0:
        date_string = defaultValue
    else:
        date_string = ', '.join(result)

    try:
        parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
    except:
        if date_string.startswith('-'):
            return date_string.split('T')[0]
        try:
            parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
        except:
            logger.debug('could not parse date %s', date_string)
            return date_string.split('T')[0]

    return format_date_by_locale(parsed_date, locale)
Ejemplo n.º 2
0
    def process_callback(response, **kwargs):
        # check if redirect comparing to the True value,
        # because resp can be a Mock object, and any attribut name returns something.
        if response.is_redirect is True:
            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
            return

        response.search_params = params

        timeout_overhead = 0.2  # seconds
        search_duration = time() - params['started']
        timeout_limit = engines[engine_name].timeout + timeout_overhead
        if search_duration > timeout_limit:
            engines[engine_name].stats['page_load_time'] += timeout_limit
            engines[engine_name].stats['errors'] += 1
            return

        # callback
        search_results = callback(response)

        # add results
        for result in search_results:
            result['engine'] = engine_name

        results_queue.put_nowait((engine_name, search_results))

        # update stats with current page-load-time
        engines[engine_name].stats['page_load_time'] += search_duration
Ejemplo n.º 3
0
def run():
    logger.debug('starting webserver on %s:%s', settings['server']['port'], settings['server']['bind_address'])
    app.run(
        debug=searx_debug,
        use_debugger=searx_debug,
        port=settings['server']['port'],
        host=settings['server']['bind_address'],
        threaded=True
    )
Ejemplo n.º 4
0
def initialize_engines(engine_list):
    load_engines(engine_list)
    for engine in engines.items():
        if hasattr(engine, 'init'):
            init_fn = getattr(engine, engine_attr)

            def engine_init():
                init_fn()
                logger.debug('%s engine initialized', engine_data['name'])
            logger.debug('Starting background initialization of %s engine', engine_data['name'])
            threading.Thread(target=engine_init).start()
Ejemplo n.º 5
0
def image_proxy():
    url = request.args.get('url').encode('utf-8')

    if not url:
        return '', 400

    h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest()

    if h != request.args.get('h'):
        return '', 400

    headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = requests.get(url,
                        stream=True,
                        timeout=settings['outgoing']['request_timeout'],
                        headers=headers,
                        proxies=outgoing_proxies)

    if resp.status_code == 304:
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
        return '', 400

    img = ''
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
Ejemplo n.º 6
0
def code_highlighter(codelines, language=None):
    if not language:
        language = 'text'

    try:
        # find lexer by programing language
        lexer = get_lexer_by_name(language, stripall=True)
    except:
        # if lexer is not found, using default one
        logger.debug('highlighter cannot find lexer for {0}'.format(language))
        lexer = get_lexer_by_name('text', stripall=True)

    html_code = ''
    tmp_code = ''
    last_line = None

    # parse lines
    for line, code in codelines:
        if not last_line:
            line_code_start = line

        # new codeblock is detected
        if last_line is not None and\
           last_line + 1 != line:

            # highlight last codepart
            formatter = HtmlFormatter(linenos='inline',
                                      linenostart=line_code_start)
            html_code = html_code + highlight(tmp_code, lexer, formatter)

            # reset conditions for next codepart
            tmp_code = ''
            line_code_start = line

        # add codepart
        tmp_code += code + '\n'

        # update line
        last_line = line

    # highlight last codepart
    formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start)
    html_code = html_code + highlight(tmp_code, lexer, formatter)

    return html_code
Ejemplo n.º 7
0
def image_proxy():
    url = request.args.get("url").encode("utf-8")

    if not url:
        return "", 400

    h = hashlib.sha256(url + settings["server"]["secret_key"].encode("utf-8")).hexdigest()

    if h != request.args.get("h"):
        return "", 400

    headers = dict_subset(request.headers, {"If-Modified-Since", "If-None-Match"})
    headers["User-Agent"] = gen_useragent()

    resp = requests.get(
        url, stream=True, timeout=settings["outgoing"]["request_timeout"], headers=headers, proxies=outgoing_proxies
    )

    if resp.status_code == 304:
        return "", resp.status_code

    if resp.status_code != 200:
        logger.debug("image-proxy: wrong response code: {0}".format(resp.status_code))
        if resp.status_code >= 400:
            return "", resp.status_code
        return "", 400

    if not resp.headers.get("content-type", "").startswith("image/"):
        logger.debug("image-proxy: wrong content-type: {0}".format(resp.headers.get("content-type")))
        return "", 400

    img = ""
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return "", 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {"Content-Length", "Length", "Date", "Last-Modified", "Expires", "Etag"})

    return Response(img, mimetype=resp.headers["content-type"], headers=headers)
Ejemplo n.º 8
0
def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        number_of_results_string = re.sub('[^0-9]', '', dom.xpath(
            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
        )

        results.append({'number_of_results': int(number_of_results_string)})

    except:
        logger.debug("Couldn't read number of results.")
        pass

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        try:
            logger.debug("running for %s" % str(result))
            link = result.xpath('.//h2/a')[0]
            url = link.attrib.get('href')
            title = result.xpath('string(.//h2/a)')
            content = extract_text(result.xpath('.//p'))
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    return results
Ejemplo n.º 9
0
    from io import StringIO


if sys.version_info[0] == 3:
    unicode = str
    PY3 = True
else:
    PY3 = False

# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))

# about static
static_path = get_resources_directory(searx_dir, 'static', settings['ui']['static_path'])
logger.debug('static directory is %s', static_path)
static_files = get_static_files(static_path)

# about templates
default_theme = settings['ui']['default_theme']
templates_path = get_resources_directory(searx_dir, 'templates', settings['ui']['templates_path'])
logger.debug('templates directory is %s', templates_path)
themes = get_themes(templates_path)
result_templates = get_result_templates(templates_path)
global_favicons = []
for indice, theme in enumerate(themes):
    global_favicons.append([])
    theme_img_path = os.path.join(static_path, 'themes', theme, 'img', 'icons')
    for (dirpath, dirnames, filenames) in os.walk(theme_img_path):
        global_favicons[indice].extend(filenames)
Ejemplo n.º 10
0
    def search(self):
        global number_of_searches

        # start time
        start_time = time()

        # answeres ?
        answerers_results = ask(self.search_query)

        if answerers_results:
            for results in answerers_results:
                self.result_container.extend('answer', results)
            return self.result_container

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # max of all selected engine timeout
        timeout_limit = 0

        # start search-reqest for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # skip suspended engines
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s', selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['pageno'] = search_query.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

            # append request to list
            requests.append((selected_engine['name'], search_query.query, request_params))

            # update timeout_limit
            timeout_limit = max(timeout_limit, engine.timeout)

        if requests:
            # send all search-request
            search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
            start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
Ejemplo n.º 11
0
    def search(self):
        global number_of_searches

        # start time
        start_time = time()

        # answeres ?
        answerers_results = ask(self.search_query)

        if answerers_results:
            for results in answerers_results:
                self.result_container.extend('answer', results)
            return self.result_container

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # max of all selected engine timeout
        default_timeout = 0

        # start search-reqest for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            if not search_query.preferences.validate_token(engine):
                continue

            # skip suspended engines
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s', selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = {}
            if not engine.offline:
                request_params = default_request_params()
                request_params['headers']['User-Agent'] = user_agent

                if hasattr(engine, 'language') and engine.language:
                    request_params['language'] = engine.language
                else:
                    request_params['language'] = search_query.lang

                request_params['safesearch'] = search_query.safesearch
                request_params['time_range'] = search_query.time_range

            request_params['category'] = selected_engine['category']
            request_params['pageno'] = search_query.pageno

            # append request to list
            requests.append((selected_engine['name'], search_query.query, request_params))

            # update default_timeout
            default_timeout = max(default_timeout, engine.timeout)

        # adjust timeout
        self.actual_timeout = default_timeout
        query_timeout = self.search_query.timeout_limit

        if max_request_timeout is None and query_timeout is None:
            # No max, no user query: default_timeout
            pass
        elif max_request_timeout is None and query_timeout is not None:
            # No max, but user query: From user query except if above default
            self.actual_timeout = min(default_timeout, query_timeout)
        elif max_request_timeout is not None and query_timeout is None:
            # Max, no user query: Default except if above max
            self.actual_timeout = min(default_timeout, max_request_timeout)
        elif max_request_timeout is not None and query_timeout is not None:
            # Max & user query: From user query except if above max
            self.actual_timeout = min(query_timeout, max_request_timeout)

        logger.debug("actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})"
                     .format(self.actual_timeout, default_timeout, query_timeout, max_request_timeout))

        # send all search-request
        if requests:
            search_multiple_requests(requests, self.result_container, start_time, self.actual_timeout)
            start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
Ejemplo n.º 12
0
def request(query, params):
    params['url'] = search_url + urlencode({
        'query': query, 'page': params['pageno'], 'per_page': page_size
    })
    logger.debug("query_url --> %s", params['url'])
    return params
Ejemplo n.º 13
0
    from io import StringIO


if sys.version_info[0] == 3:
    unicode = str
    PY3 = True
else:
    PY3 = False

# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))

# about static
static_path = get_resources_directory(searx_dir, 'static', settings['ui']['static_path'])
logger.debug('static directory is %s', static_path)
static_files = get_static_files(static_path)

# about templates
default_theme = settings['ui']['default_theme']
templates_path = get_resources_directory(searx_dir, 'templates', settings['ui']['templates_path'])
logger.debug('templates directory is %s', templates_path)
themes = get_themes(templates_path)
result_templates = get_result_templates(templates_path)
global_favicons = []
for indice, theme in enumerate(themes):
    global_favicons.append([])
    theme_img_path = os.path.join(static_path, 'themes', theme, 'img', 'icons')
    for (dirpath, dirnames, filenames) in os.walk(theme_img_path):
        global_favicons[indice].extend(filenames)
Ejemplo n.º 14
0
 def dump(self):
     logger.debug("Histograms:")
     ks = sorted(self.measures.keys(), key='/'.join)
     for k in ks:
         logger.debug("- %-60s %s", '|'.join(k), self.measures[k])
Ejemplo n.º 15
0
    def search(self):
        global number_of_searches

        # start time
        start_time = time()

        # answeres ?
        answerers_results = ask(self.search_query)

        if answerers_results:
            for results in answerers_results:
                self.result_container.extend('answer', results)
            return self.result_container

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # max of all selected engine timeout
        timeout_limit = 0

        # start search-reqest for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # skip suspended engines
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s', selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['pageno'] = search_query.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

            # append request to list
            requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))

            # update timeout_limit
            timeout_limit = max(timeout_limit, engine.timeout)

        if requests:
            # send all search-request
            search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
            start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
Ejemplo n.º 16
0
def response(resp):
    results = []

    search_res = json.loads(resp.text)

    # search_res.get('Entity') possible values (not exhaustive) :
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television  / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * compagny

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer)})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text,
                                            ddg_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''),
                                  'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'),
                                            topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstractURL,
                     'official': True})
        results.append({'url': abstractURL,
                        'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label,
                                 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already an URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label,
                                       'value': area_to_str(data_value),
                                       'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label,
                                           'value': data_value,
                                           'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label,
                                       'value': data_value})

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap',
                             'url': url,
                             'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
           len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'],
                            'title': heading,
                            'content': content})
        else:
            results.append({'infobox': heading,
                            'id': infobox_id,
                            'content': content,
                            'img_src': image,
                            'attributes': attributes,
                            'urls': urls,
                            'relatedTopics': relatedTopics})

    return results
Ejemplo n.º 17
0
def _run_with_delay():
    every = _get_every()
    delay = random.randint(0, every[1] - every[0])
    logger.debug('Start checker in %i seconds', delay)
    time.sleep(delay)
    run()
Ejemplo n.º 18
0
 def engine_init(engine_name, init_fn):
     init_fn(get_engine_from_settings(engine_name))
     logger.debug('%s engine: Initialized', engine_name)
Ejemplo n.º 19
0
def request(query, params):
    params['url'] = SEARCH_URL.format(query=query)
    logger.debug(f"query_url --> {params['url']}")
    return params
Ejemplo n.º 20
0
 def engine_init():
     init_fn()
     logger.debug('%s engine initialized', engine_name)
Ejemplo n.º 21
0
def add_error_context(engine_name: str, error_context: ErrorContext) -> None:
    errors_for_engine = errors_per_engines.setdefault(engine_name, {})
    errors_for_engine[error_context] = errors_for_engine.get(error_context,
                                                             0) + 1
    logger.debug('%s: %s', engine_name, str(error_context))
Ejemplo n.º 22
0
 def dump(self):
     with self.lock:
         ks = sorted(self.counters.keys(), key='/'.join)
     logger.debug("Counters:")
     for k in ks:
         logger.debug("- %-60s %s", '|'.join(k), self.counters[k])
Ejemplo n.º 23
0
 def engine_init():
     init_fn()
     logger.debug('%s engine initialized', engine_data['name'])
Ejemplo n.º 24
0
 def engine_init(engine_name, init_fn):
     init_fn()
     logger.debug('%s engine: Initialized', engine_name)
Ejemplo n.º 25
0
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Ejemplo n.º 26
0
def image_proxy():
    # pylint: disable=too-many-return-statements, too-many-branches

    url = request.args.get('url')

    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())

    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024
    forward_resp = False
    resp = None
    try:
        request_headers = {
            'User-Agent': gen_useragent(),
            'Accept': 'image/webp,*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Sec-GPC': '1',
            'DNT': '1',
        }
        set_context_network_name('image_proxy')
        stream = http_stream(method='GET',
                             url=url,
                             headers=request_headers,
                             timeout=settings['outgoing']['request_timeout'],
                             follow_redirects=True,
                             max_redirects=20)

        resp = next(stream)
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit(
        ) and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(
                resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('Content-Type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: %s',
                         resp.headers.get('Content-Type', ''))
            return '', 400

        forward_resp = True
    except httpx.HTTPError:
        logger.exception('HTTP error')
        return '', 400
    finally:
        if resp and not forward_resp:
            # the code is about to return an HTTP 400 error to the browser
            # we make sure to close the response between searxng and the HTTP server
            try:
                resp.close()
            except httpx.HTTPError:
                logger.exception('HTTP error on closing')

    try:
        headers = dict_subset(
            resp.headers,
            {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'})

        def forward_chunk():
            total_length = 0
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(),
                        mimetype=resp.headers['Content-Type'],
                        headers=headers)
    except httpx.HTTPError:
        return '', 400
Ejemplo n.º 27
0
def image_proxy():
    url = request.args.get('url')

    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())

    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024

    try:
        headers = dict_subset(request.headers,
                              {'If-Modified-Since', 'If-None-Match'})
        headers['User-Agent'] = gen_useragent()
        stream = http_stream(method='GET',
                             url=url,
                             headers=headers,
                             timeout=settings['outgoing']['request_timeout'],
                             allow_redirects=True,
                             max_redirects=20)

        resp = next(stream)
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit(
        ) and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code == 304:
            return '', resp.status_code

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(
                resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('content-type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: {0}'.format(
                resp.headers.get('content-type')))
            return '', 400

        headers = dict_subset(resp.headers, {
            'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires',
            'Etag'
        })

        total_length = 0

        def forward_chunk():
            nonlocal total_length
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(),
                        mimetype=resp.headers['Content-Type'],
                        headers=headers)
    except httpx.HTTPError:
        return '', 400
Ejemplo n.º 28
0
    def search(self):
        global number_of_searches

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # start search-reqest for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # skip suspended engines
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s', selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if search_query.lang != 'all' and not engine.language_support:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = search_query.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(search_query.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                engine.response,
                request_params,
                self.result_container)

            # create dictionary which contain all
            # informations about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return self.result_container
        # send all search-request
        threaded_requests(requests)
        start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
Ejemplo n.º 29
0
from searx.search import SearchWithPlugins, initialize as search_initialize
from searx.network import stream as http_stream
from searx.search.checker import get_result as checker_get_result
from searx.settings_loader import get_default_settings_path

logger = logger.getChild('webapp')

# check secret_key
if not searx_debug and settings['server']['secret_key'] == 'ultrasecretkey':
    logger.error(
        'server.secret_key is not changed. Please use something else instead of ultrasecretkey.'
    )
    sys.exit(1)

# about static
logger.debug('static directory is %s', settings['ui']['static_path'])
static_files = get_static_files(settings['ui']['static_path'])

# about templates
logger.debug('templates directory is %s', settings['ui']['templates_path'])
default_theme = settings['ui']['default_theme']
templates_path = settings['ui']['templates_path']
themes = get_themes(templates_path)
result_templates = get_result_templates(templates_path)
global_favicons = []
for indice, theme in enumerate(themes):
    global_favicons.append([])
    theme_img_path = os.path.join(settings['ui']['static_path'], 'themes',
                                  theme, 'img', 'icons')
    for (dirpath, dirnames, filenames) in os.walk(theme_img_path):
        global_favicons[indice].extend(filenames)
Ejemplo n.º 30
0
 def engine_init():
     init_fn()
     logger.debug('%s engine initialized', engine_data['name'])
Ejemplo n.º 31
0
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not find 'answer'")

        # results --> number_of_results
        try:
            _txt = eval_xpath_getindex(dom,
                                       '//div[@id="result-stats"]//text()', 0)
            _digit = ''.join([n for n in _txt if n.isdigit()])
            number_of_results = int(_digit)
            results.append({'number_of_results': number_of_results})
        except Exception as e:  # pylint: disable=broad-except
            logger.debug("did not 'number_of_results'")
            logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this not one of the common google results *section*
                logger.debug(
                    'ingoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Ejemplo n.º 32
0
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(
            dom.xpath('//div[@id="resultStats"]//text()')[0].split()
            [1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url),
                            google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result,
                                                     content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s',
                         etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Ejemplo n.º 33
0
def request(query, params):
    skip = (params['pageno'] - 1) * 20
    query = urlencode({'q': query, 'skip': skip})
    params['url'] = search_url.format(query=query)
    logger.debug("query_url --> %s", params['url'])
    return params
Ejemplo n.º 34
0
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not found 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0]
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})

    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath(result, title_xpath)
            if not title_tag:
                # this not one of the common google results *section*
                logger.debug('ingoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag[0])
            url = eval_xpath(result, href_xpath)[0]
            content = extract_text_from_dom(result, content_xpath)
            results.append({
                'url': url,
                'title': title,
                'content': content
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Ejemplo n.º 35
0
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
        answer_list = [_.xpath("normalize-space()") for _ in answer_list]
        results.append({'answer': ' '.join(answer_list)})
    else:
        logger.debug("did not find 'answer'")

        # results --> number_of_results
        if not use_mobile_ui:
            try:
                _txt = eval_xpath_getindex(
                    dom, '//div[@id="result-stats"]//text()', 0)
                _digit = ''.join([n for n in _txt if n.isdigit()])
                number_of_results = int(_digit)
                results.append({'number_of_results': number_of_results})
            except Exception as e:  # pylint: disable=broad-except
                logger.debug("did not 'number_of_results'")
                logger.error(e, exc_info=True)

    # parse results

    _results_xpath = results_xpath
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this not one of the common google results *section*
                logger.debug(
                    'ingoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            if content is None:
                logger.debug(
                    'ingoring item from the result_xpath list: missing content of title "%s"',
                    title)
                continue

            logger.debug('add link to results: %s', title)

            results.append({'url': url, 'title': title, 'content': content})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results