def get_time(claims, propertyName, locale, defaultValue=None):
    propValue = claims.get(propertyName, {})
    if len(propValue) == 0:
        return defaultValue

    result = []
    for e in propValue:
        mainsnak = e.get('mainsnak', {})

        datavalue = mainsnak.get('datavalue', {})
        if datavalue is not None:
            value = datavalue.get('value', '')
            result.append(value.get('time', ''))

    if len(result) == 0:
        date_string = defaultValue
    else:
        date_string = ', '.join(result)

    try:
        parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
    except:
        if date_string.startswith('-'):
            return date_string.split('T')[0]
        try:
            parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
        except:
            logger.debug('could not parse date %s', date_string)
            return date_string.split('T')[0]

    return format_date_by_locale(parsed_date, locale)
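# A minimal usage sketch (added for illustration, not part of the source): the
# `claims` argument follows the Wikidata API JSON layout that get_time() walks,
# i.e. property id -> statements -> mainsnak -> datavalue -> value -> time.
# The property 'P569' (date of birth) and the date value are made-up examples.
example_claims = {
    'P569': [
        {'mainsnak': {'datavalue': {'value': {'time': '+1952-03-11T00:00:00Z'}}}}
    ]
}
example_times = [
    stmt.get('mainsnak', {}).get('datavalue', {}).get('value', {}).get('time', '')
    for stmt in example_claims['P569']
]
assert example_times == ['+1952-03-11T00:00:00Z']
# get_time(example_claims, 'P569', 'en') would join these strings, try
# strptime("+%Y-%m-%dT%H:%M:%SZ"), fall back to dateutil, then format for the locale.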
def process_callback(response, **kwargs):
    # check if redirect comparing to the True value,
    # because resp can be a Mock object, and any attribute name returns something.
    if response.is_redirect is True:
        logger.debug('{0} redirect on: {1}'.format(engine_name, response))
        return

    response.search_params = params

    timeout_overhead = 0.2  # seconds
    search_duration = time() - params['started']
    timeout_limit = engines[engine_name].timeout + timeout_overhead
    if search_duration > timeout_limit:
        engines[engine_name].stats['page_load_time'] += timeout_limit
        engines[engine_name].stats['errors'] += 1
        return

    # callback
    search_results = callback(response)

    # add results
    for result in search_results:
        result['engine'] = engine_name

    results_queue.put_nowait((engine_name, search_results))

    # update stats with current page-load-time
    engines[engine_name].stats['page_load_time'] += search_duration
def run():
    logger.debug(
        'starting webserver on %s:%s',
        settings['server']['bind_address'],
        settings['server']['port']
    )
    app.run(
        debug=searx_debug,
        use_debugger=searx_debug,
        port=settings['server']['port'],
        host=settings['server']['bind_address'],
        threaded=True
    )
def initialize_engines(engine_list):
    load_engines(engine_list)

    for engine_name, engine in engines.items():
        if hasattr(engine, 'init'):
            init_fn = getattr(engine, 'init')

            def engine_init():
                init_fn()
                logger.debug('%s engine initialized', engine_name)

            logger.debug('Starting background initialization of %s engine', engine_name)
            threading.Thread(target=engine_init).start()
def image_proxy():
    url = request.args.get('url').encode('utf-8')

    if not url:
        return '', 400

    h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest()

    if h != request.args.get('h'):
        return '', 400

    headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = requests.get(url,
                        stream=True,
                        timeout=settings['outgoing']['request_timeout'],
                        headers=headers,
                        proxies=outgoing_proxies)

    if resp.status_code == 304:
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
        return '', 400

    img = ''
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
def code_highlighter(codelines, language=None):
    if not language:
        language = 'text'

    try:
        # find lexer by programming language
        lexer = get_lexer_by_name(language, stripall=True)
    except:
        # if lexer is not found, use the default one
        logger.debug('highlighter cannot find lexer for {0}'.format(language))
        lexer = get_lexer_by_name('text', stripall=True)

    html_code = ''
    tmp_code = ''
    last_line = None

    # parse lines
    for line, code in codelines:
        if not last_line:
            line_code_start = line

        # new codeblock is detected
        if last_line is not None and\
           last_line + 1 != line:

            # highlight last codepart
            formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start)
            html_code = html_code + highlight(tmp_code, lexer, formatter)

            # reset conditions for next codepart
            tmp_code = ''
            line_code_start = line

        # add codepart
        tmp_code += code + '\n'

        # update line
        last_line = line

    # highlight last codepart
    formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start)
    html_code = html_code + highlight(tmp_code, lexer, formatter)

    return html_code
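# A usage sketch (added for illustration, not from the source): `codelines` is an
# iterable of (line_number, code) pairs; a gap in the numbering makes the function
# start a new highlighted block. The snippet below is made up; the imports are the
# pygments helpers the highlighter above relies on.
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_by_name

example_codelines = [
    (1, 'def add(a, b):'),
    (2, '    return a + b'),
    (10, 'print(add(1, 2))'),  # the jump from line 2 to 10 starts a second block
]
example_html = code_highlighter(example_codelines, 'python')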
def image_proxy():
    url = request.args.get("url").encode("utf-8")

    if not url:
        return "", 400

    h = hashlib.sha256(url + settings["server"]["secret_key"].encode("utf-8")).hexdigest()

    if h != request.args.get("h"):
        return "", 400

    headers = dict_subset(request.headers, {"If-Modified-Since", "If-None-Match"})
    headers["User-Agent"] = gen_useragent()

    resp = requests.get(
        url,
        stream=True,
        timeout=settings["outgoing"]["request_timeout"],
        headers=headers,
        proxies=outgoing_proxies
    )

    if resp.status_code == 304:
        return "", resp.status_code

    if resp.status_code != 200:
        logger.debug("image-proxy: wrong response code: {0}".format(resp.status_code))
        if resp.status_code >= 400:
            return "", resp.status_code
        return "", 400

    if not resp.headers.get("content-type", "").startswith("image/"):
        logger.debug("image-proxy: wrong content-type: {0}".format(resp.headers.get("content-type")))
        return "", 400

    img = ""
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return "", 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {"Content-Length", "Length", "Date", "Last-Modified", "Expires", "Etag"})

    return Response(img, mimetype=resp.headers["content-type"], headers=headers)
def response(resp):
    '''post-response callback

    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        number_of_results_string = re.sub('[^0-9]', '', dom.xpath(
            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
        )
        results.append({'number_of_results': int(number_of_results_string)})
    except:
        logger.debug("Couldn't read number of results.")
        pass

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        try:
            logger.debug("running for %s" % str(result))
            link = result.xpath('.//h2/a')[0]
            url = link.attrib.get('href')
            title = result.xpath('string(.//h2/a)')
            content = extract_text(result.xpath('.//p'))
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    return results
from io import StringIO


if sys.version_info[0] == 3:
    unicode = str
    PY3 = True
else:
    PY3 = False

# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))

# about static
static_path = get_resources_directory(searx_dir, 'static', settings['ui']['static_path'])
logger.debug('static directory is %s', static_path)
static_files = get_static_files(static_path)

# about templates
default_theme = settings['ui']['default_theme']
templates_path = get_resources_directory(searx_dir, 'templates', settings['ui']['templates_path'])
logger.debug('templates directory is %s', templates_path)
themes = get_themes(templates_path)
result_templates = get_result_templates(templates_path)
global_favicons = []
for indice, theme in enumerate(themes):
    global_favicons.append([])
    theme_img_path = os.path.join(static_path, 'themes', theme, 'img', 'icons')
    for (dirpath, dirnames, filenames) in os.walk(theme_img_path):
        global_favicons[indice].extend(filenames)
def search(self):
    global number_of_searches

    # start time
    start_time = time()

    # answers ?
    answerers_results = ask(self.search_query)

    if answerers_results:
        for results in answerers_results:
            self.result_container.extend('answer', results)
        return self.result_container

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    search_query = self.search_query

    # max of all selected engine timeouts
    timeout_limit = 0

    # start search-request for all selected engines
    for selected_engine in search_query.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', selected_engine['name'])
            continue

        # if paging is not supported, skip
        if search_query.pageno > 1 and not engine.paging:
            continue

        # if time_range is not supported, skip
        if search_query.time_range and not engine.time_range_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['pageno'] = search_query.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = search_query.lang

        # 0 = None, 1 = Moderate, 2 = Strict
        request_params['safesearch'] = search_query.safesearch
        request_params['time_range'] = search_query.time_range

        # append request to list
        requests.append((selected_engine['name'], search_query.query, request_params))

        # update timeout_limit
        timeout_limit = max(timeout_limit, engine.timeout)

    if requests:
        # send all search-requests
        search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
        start_new_thread(gc.collect, tuple())

    # return results, suggestions, answers and infoboxes
    return self.result_container
def search(self):
    global number_of_searches

    # start time
    start_time = time()

    # answers ?
    answerers_results = ask(self.search_query)

    if answerers_results:
        for results in answerers_results:
            self.result_container.extend('answer', results)
        return self.result_container

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    search_query = self.search_query

    # max of all selected engine timeouts
    default_timeout = 0

    # start search-request for all selected engines
    for selected_engine in search_query.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        if not search_query.preferences.validate_token(engine):
            continue

        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', selected_engine['name'])
            continue

        # if paging is not supported, skip
        if search_query.pageno > 1 and not engine.paging:
            continue

        # if time_range is not supported, skip
        if search_query.time_range and not engine.time_range_support:
            continue

        # set default request parameters
        request_params = {}
        if not engine.offline:
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

        request_params['category'] = selected_engine['category']
        request_params['pageno'] = search_query.pageno

        # append request to list
        requests.append((selected_engine['name'], search_query.query, request_params))

        # update default_timeout
        default_timeout = max(default_timeout, engine.timeout)

    # adjust timeout
    self.actual_timeout = default_timeout
    query_timeout = self.search_query.timeout_limit

    if max_request_timeout is None and query_timeout is None:
        # No max, no user query: default_timeout
        pass
    elif max_request_timeout is None and query_timeout is not None:
        # No max, but user query: From user query except if above default
        self.actual_timeout = min(default_timeout, query_timeout)
    elif max_request_timeout is not None and query_timeout is None:
        # Max, no user query: Default except if above max
        self.actual_timeout = min(default_timeout, max_request_timeout)
    elif max_request_timeout is not None and query_timeout is not None:
        # Max & user query: From user query except if above max
        self.actual_timeout = min(query_timeout, max_request_timeout)

    logger.debug("actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})"
                 .format(self.actual_timeout, default_timeout, query_timeout, max_request_timeout))

    # send all search-requests
    if requests:
        search_multiple_requests(requests, self.result_container, start_time, self.actual_timeout)
        start_new_thread(gc.collect, tuple())

    # return results, suggestions, answers and infoboxes
    return self.result_container
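# A sketch (added for illustration) of the timeout arbitration above, with made-up
# numbers: default_timeout is the largest timeout of the selected engines; a user
# supplied timeout_limit and the global max_request_timeout each cap it when set.
def pick_timeout(default_timeout, query_timeout, max_request_timeout):
    if max_request_timeout is None and query_timeout is None:
        return default_timeout
    if max_request_timeout is None:
        return min(default_timeout, query_timeout)
    if query_timeout is None:
        return min(default_timeout, max_request_timeout)
    return min(query_timeout, max_request_timeout)

assert pick_timeout(3.0, None, None) == 3.0   # no caps: engine maximum wins
assert pick_timeout(3.0, 1.5, None) == 1.5    # user query caps the default
assert pick_timeout(3.0, None, 2.0) == 2.0    # global max caps the default
assert pick_timeout(3.0, 5.0, 2.0) == 2.0     # global max caps the user query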
def request(query, params):
    params['url'] = search_url + urlencode({
        'query': query,
        'page': params['pageno'],
        'per_page': page_size
    })
    logger.debug("query_url --> %s", params['url'])
    return params
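# A usage sketch (added for illustration): mirrors the URL that request() above
# builds; the base URL and page size below are assumptions, not the engine's values.
from urllib.parse import urlencode

example_search_url = 'https://example.org/api/v1/search/videos?'
example_page_size = 15
example_params = {'pageno': 2}
example_params['url'] = example_search_url + urlencode({
    'query': 'test query',
    'page': example_params['pageno'],
    'per_page': example_page_size
})
assert example_params['url'] == 'https://example.org/api/v1/search/videos?query=test+query&page=2&per_page=15'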
def dump(self): logger.debug("Histograms:") ks = sorted(self.measures.keys(), key='/'.join) for k in ks: logger.debug("- %-60s %s", '|'.join(k), self.measures[k])
def search(self):
    global number_of_searches

    # start time
    start_time = time()

    # answers ?
    answerers_results = ask(self.search_query)

    if answerers_results:
        for results in answerers_results:
            self.result_container.extend('answer', results)
        return self.result_container

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    search_query = self.search_query

    # max of all selected engine timeouts
    timeout_limit = 0

    # start search-request for all selected engines
    for selected_engine in search_query.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', selected_engine['name'])
            continue

        # if paging is not supported, skip
        if search_query.pageno > 1 and not engine.paging:
            continue

        # if time_range is not supported, skip
        if search_query.time_range and not engine.time_range_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['pageno'] = search_query.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = search_query.lang

        # 0 = None, 1 = Moderate, 2 = Strict
        request_params['safesearch'] = search_query.safesearch
        request_params['time_range'] = search_query.time_range

        # append request to list
        requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))

        # update timeout_limit
        timeout_limit = max(timeout_limit, engine.timeout)

    if requests:
        # send all search-requests
        search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
        start_new_thread(gc.collect, tuple())

    # return results, suggestions, answers and infoboxes
    return self.result_container
def response(resp):
    results = []

    search_res = json.loads(resp.text)

    # search_res.get('Entity') possible values (not exhaustive) :
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer)})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text, ddg_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
        results.append({'url': abstractURL, 'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label, 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already an URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label,
                                       'value': area_to_str(data_value),
                                       'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label,
                                           'value': data_value,
                                           'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label, 'value': data_value})

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
           len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'],
                            'title': heading,
                            'content': content})
        else:
            results.append({'infobox': heading,
                            'id': infobox_id,
                            'content': content,
                            'img_src': image,
                            'attributes': attributes,
                            'urls': urls,
                            'relatedTopics': relatedTopics})

    return results
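# A minimal sketch (added for illustration) of the DuckDuckGo instant-answer
# fields the parser above reads; the values are made up.
example_search_res = {
    'Heading': 'Python (programming language)',
    'Abstract': 'Python is a programming language.',
    'AbstractSource': 'Wikipedia',
    'AbstractURL': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
    'Answer': '',
    'Results': [],
    'RelatedTopics': [],
}
# Fed through response(), this would yield an infobox entry whose content is the
# abstract and whose urls list holds the AbstractSource link flagged as official.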
def _run_with_delay():
    every = _get_every()
    delay = random.randint(0, every[1] - every[0])
    logger.debug('Start checker in %i seconds', delay)
    time.sleep(delay)
    run()
def engine_init(engine_name, init_fn):
    init_fn(get_engine_from_settings(engine_name))
    logger.debug('%s engine: Initialized', engine_name)
def request(query, params):
    params['url'] = SEARCH_URL.format(query=query)
    logger.debug(f"query_url --> {params['url']}")
    return params
def engine_init():
    init_fn()
    logger.debug('%s engine initialized', engine_name)
def add_error_context(engine_name: str, error_context: ErrorContext) -> None:
    errors_for_engine = errors_per_engines.setdefault(engine_name, {})
    errors_for_engine[error_context] = errors_for_engine.get(error_context, 0) + 1
    logger.debug('%s: %s', engine_name, str(error_context))
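# A minimal sketch (added for illustration) of the per-engine error bookkeeping
# above, using plain strings as stand-ins for the hashable ErrorContext objects.
example_errors_per_engines = {}
for ctx in ['HTTPError', 'HTTPError', 'Timeout']:
    per_engine = example_errors_per_engines.setdefault('example engine', {})
    per_engine[ctx] = per_engine.get(ctx, 0) + 1
assert example_errors_per_engines == {'example engine': {'HTTPError': 2, 'Timeout': 1}}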
def dump(self):
    with self.lock:
        ks = sorted(self.counters.keys(), key='/'.join)
    logger.debug("Counters:")
    for k in ks:
        logger.debug("- %-60s %s", '|'.join(k), self.counters[k])
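# A sketch (added for illustration): the counter keys are tuples of strings, so
# key='/'.join orders them by their slash-joined path; the keys below are made up.
example_keys = [('engine', 'wikipedia', 'time'), ('engine', 'bing', 'error'), ('app', 'requests')]
assert sorted(example_keys, key='/'.join) == [
    ('app', 'requests'),
    ('engine', 'bing', 'error'),
    ('engine', 'wikipedia', 'time'),
]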
def engine_init():
    init_fn()
    logger.debug('%s engine initialized', engine_data['name'])
def engine_init(engine_name, init_fn):
    init_fn()
    logger.debug('%s engine: Initialized', engine_name)
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass
                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass
            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def image_proxy():
    # pylint: disable=too-many-return-statements, too-many-branches

    url = request.args.get('url')
    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())
    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024
    forward_resp = False
    resp = None
    try:
        request_headers = {
            'User-Agent': gen_useragent(),
            'Accept': 'image/webp,*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Sec-GPC': '1',
            'DNT': '1',
        }
        set_context_network_name('image_proxy')
        stream = http_stream(
            method='GET',
            url=url,
            headers=request_headers,
            timeout=settings['outgoing']['request_timeout'],
            follow_redirects=True,
            max_redirects=20
        )
        resp = next(stream)
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit() and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('Content-Type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: %s', resp.headers.get('Content-Type', ''))
            return '', 400

        forward_resp = True
    except httpx.HTTPError:
        logger.exception('HTTP error')
        return '', 400
    finally:
        if resp and not forward_resp:
            # the code is about to return an HTTP 400 error to the browser
            # we make sure to close the response between searxng and the HTTP server
            try:
                resp.close()
            except httpx.HTTPError:
                logger.exception('HTTP error on closing')

    try:
        headers = dict_subset(
            resp.headers,
            {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'}
        )

        def forward_chunk():
            total_length = 0
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers)
    except httpx.HTTPError:
        return '', 400
def image_proxy():
    url = request.args.get('url')
    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())
    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024

    try:
        headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
        headers['User-Agent'] = gen_useragent()
        stream = http_stream(
            method='GET',
            url=url,
            headers=headers,
            timeout=settings['outgoing']['request_timeout'],
            allow_redirects=True,
            max_redirects=20
        )
        resp = next(stream)
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit() and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code == 304:
            return '', resp.status_code

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('content-type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
            return '', 400

        headers = dict_subset(
            resp.headers,
            {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'}
        )

        total_length = 0

        def forward_chunk():
            nonlocal total_length
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers)
    except httpx.HTTPError:
        return '', 400
def search(self):
    global number_of_searches

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    search_query = self.search_query

    # start search-request for all selected engines
    for selected_engine in search_query.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', selected_engine['name'])
            continue

        # if paging is not supported, skip
        if search_query.pageno > 1 and not engine.paging:
            continue

        # if search-language is set and engine does not
        # provide language-support, skip
        if search_query.lang != 'all' and not engine.language_support:
            continue

        # if time_range is not supported, skip
        if search_query.time_range and not engine.time_range_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = time()
        request_params['pageno'] = search_query.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = search_query.lang

        # 0 = None, 1 = Moderate, 2 = Strict
        request_params['safesearch'] = search_query.safesearch
        request_params['time_range'] = search_query.time_range

        # update request parameters dependent on
        # search-engine (contained in engines folder)
        engine.request(search_query.query.encode('utf-8'), request_params)

        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        # create a callback wrapper for the search engine results
        callback = make_callback(
            selected_engine['name'],
            engine.response,
            request_params,
            self.result_container)

        # create a dictionary which contains all
        # information about the request
        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=engine.timeout,
            verify=request_params['verify']
        )

        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        # append request to list
        requests.append((req, request_params['url'], request_args, selected_engine['name']))

    if not requests:
        return self.result_container

    # send all search-requests
    threaded_requests(requests)
    start_new_thread(gc.collect, tuple())

    # return results, suggestions, answers and infoboxes
    return self.result_container
from searx.search import SearchWithPlugins, initialize as search_initialize
from searx.network import stream as http_stream
from searx.search.checker import get_result as checker_get_result
from searx.settings_loader import get_default_settings_path

logger = logger.getChild('webapp')

# check secret_key
if not searx_debug and settings['server']['secret_key'] == 'ultrasecretkey':
    logger.error('server.secret_key is not changed. Please use something else instead of ultrasecretkey.')
    sys.exit(1)

# about static
logger.debug('static directory is %s', settings['ui']['static_path'])
static_files = get_static_files(settings['ui']['static_path'])

# about templates
logger.debug('templates directory is %s', settings['ui']['templates_path'])
default_theme = settings['ui']['default_theme']
templates_path = settings['ui']['templates_path']
themes = get_themes(templates_path)
result_templates = get_result_templates(templates_path)
global_favicons = []
for indice, theme in enumerate(themes):
    global_favicons.append([])
    theme_img_path = os.path.join(settings['ui']['static_path'], 'themes', theme, 'img', 'icons')
    for (dirpath, dirnames, filenames) in os.walk(theme_img_path):
        global_favicons[indice].extend(filenames)
def response(resp): """Get response from google's search request""" detect_google_sorry(resp) results = [] # convert the text to dom dom = html.fromstring(resp.text) # results --> answer answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') if answer: results.append({'answer': ' '.join(answer)}) else: logger.debug("did not find 'answer'") # results --> number_of_results try: _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) _digit = ''.join([n for n in _txt if n.isdigit()]) number_of_results = int(_digit) results.append({'number_of_results': number_of_results}) except Exception as e: # pylint: disable=broad-except logger.debug("did not 'number_of_results'") logger.error(e, exc_info=True) # parse results for result in eval_xpath_list(dom, results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue try: title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) if title_tag is None: # this not one of the common google results *section* logger.debug( 'ingoring <div class="g" ../> section: missing title') continue title = extract_text(title_tag) url = eval_xpath_getindex(result, href_xpath, 0, None) if url is None: continue content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) results.append({'url': url, 'title': title, 'content': content}) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) # from lxml import etree # logger.debug(etree.tostring(result, pretty_print=True)) # import pdb # pdb.set_trace() continue # parse suggestion for suggestion in eval_xpath_list(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) # return results return results
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(
            dom.xpath('//div[@id="resultStats"]//text()')[0].split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass
                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass
            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def request(query, params):
    skip = (params['pageno'] - 1) * 20
    query = urlencode({'q': query, 'skip': skip})
    params['url'] = search_url.format(query=query)
    logger.debug("query_url --> %s", params['url'])
    return params
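# A sketch (added for illustration): the engine pages in steps of 20 results, so
# pageno 1 maps to skip=0, pageno 2 to skip=20, and so on.
assert [(pageno - 1) * 20 for pageno in (1, 2, 3)] == [0, 20, 40]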
def response(resp): """Get response from google's search request""" results = [] # detect google sorry resp_url = urlparse(resp.url) if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect': raise RuntimeWarning('sorry.google.com') if resp_url.path.startswith('/sorry'): raise RuntimeWarning(gettext('CAPTCHA required')) # which subdomain ? # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) # results --> answer answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') if answer: results.append({'answer': ' '.join(answer)}) else: logger.debug("did not found 'answer'") # results --> number_of_results try: _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0] _digit = ''.join([n for n in _txt if n.isdigit()]) number_of_results = int(_digit) results.append({'number_of_results': number_of_results}) except Exception as e: # pylint: disable=broad-except logger.debug("did not 'number_of_results'") logger.error(e, exc_info=True) # parse results for result in eval_xpath(dom, results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue try: title_tag = eval_xpath(result, title_xpath) if not title_tag: # this not one of the common google results *section* logger.debug('ingoring <div class="g" ../> section: missing title') continue title = extract_text(title_tag[0]) url = eval_xpath(result, href_xpath)[0] content = extract_text_from_dom(result, content_xpath) results.append({ 'url': url, 'title': title, 'content': content }) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) # from lxml import etree # logger.debug(etree.tostring(result, pretty_print=True)) # import pdb # pdb.set_trace() continue # parse suggestion for suggestion in eval_xpath(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) # return results return results
def response(resp): """Get response from google's search request""" detect_google_sorry(resp) results = [] # convert the text to dom dom = html.fromstring(resp.text) # results --> answer answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') if answer_list: answer_list = [_.xpath("normalize-space()") for _ in answer_list] results.append({'answer': ' '.join(answer_list)}) else: logger.debug("did not find 'answer'") # results --> number_of_results if not use_mobile_ui: try: _txt = eval_xpath_getindex( dom, '//div[@id="result-stats"]//text()', 0) _digit = ''.join([n for n in _txt if n.isdigit()]) number_of_results = int(_digit) results.append({'number_of_results': number_of_results}) except Exception as e: # pylint: disable=broad-except logger.debug("did not 'number_of_results'") logger.error(e, exc_info=True) # parse results _results_xpath = results_xpath if use_mobile_ui: _results_xpath = results_xpath_mobile_ui for result in eval_xpath_list(dom, _results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue try: title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) if title_tag is None: # this not one of the common google results *section* logger.debug( 'ingoring item from the result_xpath list: missing title') continue title = extract_text(title_tag) url = eval_xpath_getindex(result, href_xpath, 0, None) if url is None: continue content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) if content is None: logger.debug( 'ingoring item from the result_xpath list: missing content of title "%s"', title) continue logger.debug('add link to results: %s', title) results.append({'url': url, 'title': title, 'content': content}) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) continue # parse suggestion for suggestion in eval_xpath_list(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) # return results return results