def search_multiple_requests(self, requests):
    search_id = uuid4().__str__()

    for engine_name, query, request_params in requests:
        th = threading.Thread(
            target=processors[engine_name].search,
            args=(query, request_params, self.result_container, self.start_time, self.actual_timeout),
            name=search_id,
        )
        th._timeout = False
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == search_id:
            remaining_time = max(0.0, self.actual_timeout - (time() - self.start_time))
            th.join(remaining_time)
            if th.is_alive():
                th._timeout = True
                self.result_container.add_unresponsive_engine(th._engine_name, 'timeout')
                logger.warning('engine timeout: {0}'.format(th._engine_name))
def search_multiple_requests(requests, result_container, start_time, timeout_limit):
    search_id = uuid4().__str__()

    for engine_name, query, request_params in requests:
        th = threading.Thread(
            target=search_one_request_safe,
            args=(
                engine_name,
                query,
                request_params,
                result_container,
                start_time,
                timeout_limit,
            ),
            name=search_id,
        )
        th._timeout = False
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == search_id:
            remaining_time = max(0.0, timeout_limit - (time() - start_time))
            th.join(remaining_time)
            if th.is_alive():
                th._timeout = True
                result_container.add_unresponsive_engine(th._engine_name, "timeout")
                logger.warning("engine timeout: {0}".format(th._engine_name))
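# A minimal, self-contained sketch (not from the original code) of the
# join-with-a-shared-deadline pattern the versions above use: every worker
# thread is tagged with the same name, then each join() waits only for
# whatever wall-clock budget remains. demo_shared_deadline and the sleeping
# workers are illustrative names, not searx API.
import threading
from time import sleep, time
from uuid import uuid4

def demo_shared_deadline(sleep_times, timeout_limit):
    start_time = time()
    search_id = str(uuid4())
    for seconds in sleep_times:
        th = threading.Thread(target=sleep, args=(seconds,), name=search_id, daemon=True)
        th.start()
    for th in threading.enumerate():
        if th.name == search_id:
            # the budget shrinks as earlier joins consume time
            remaining_time = max(0.0, timeout_limit - (time() - start_time))
            th.join(remaining_time)
            if th.is_alive():
                print('worker exceeded the shared deadline')

# the 0.1s and 0.5s workers finish in time; the 2s worker is reported
demo_shared_deadline([0.1, 0.5, 2.0], timeout_limit=1.0)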
def search_multiple_requests(requests, result_container, start_time, timeout_limit):
    from searx.webapp import sentry
    search_id = uuid4().__str__()

    for engine_name, query, request_params in requests:
        th = threading.Thread(
            target=search_one_request_safe,
            args=(engine_name, query, request_params, result_container, start_time, timeout_limit),
            name=search_id,
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == search_id:
            remaining_time = max(0.0, timeout_limit - (time() - start_time))
            th.join(remaining_time)
            if th.is_alive():
                result_container.add_unresponsive_engine((th._engine_name, gettext('timeout')))
                logger.warning('engine timeout: {0}'.format(th._engine_name))
                sentry.captureMessage('engine timeout: {0}'.format(th._engine_name))
async def arequest(self, method, url, headers=None, stream=None, ext=None):
    retry = 2
    while retry > 0:
        retry -= 1
        try:
            return await super().arequest(method, url, headers, stream, ext)
        except (python_socks._errors.ProxyConnectionError,
                python_socks._errors.ProxyTimeoutError,
                python_socks._errors.ProxyError) as e:
            raise httpcore.ProxyError(e)
        except OSError as e:
            # socket.gaierror when DNS resolution fails
            raise httpcore.NetworkError(e)
        except httpcore.RemoteProtocolError as e:
            # in case of httpcore.RemoteProtocolError: Server disconnected
            await close_connections_for_url(self, url)
            logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e)
            # retry
        except (httpcore.NetworkError, httpcore.ProtocolError) as e:
            # an httpcore.WriteError on an HTTP/2 connection leaves a newly opened
            # stream behind, so each subsequent request would open yet another
            # stream and raise the same WriteError
            await close_connections_for_url(self, url)
            raise e
def get_client_id():
    response = http_get("https://soundcloud.com")
    rx_namespace = {"re": "http://exslt.org/regular-expressions"}

    if response.ok:
        # response.text (not .content) so the parser and regex operate on str
        tree = etree.parse(StringIO(response.text), etree.HTMLParser())
        script_tags = tree.xpath("//script[re:match(@src, '(.*app.*js)')]", namespaces=rx_namespace)
        # extract valid app_js urls from soundcloud.com content
        app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]

        # get each app_js and search it for the client_id
        for app_js_url in app_js_urls:
            response = http_get(app_js_url)
            if response.ok:
                cids = re.search(r'client_id:"([^"]*)"', response.text, re.M | re.I)
                if cids is not None and len(cids.groups()):
                    return cids.groups()[0]

    logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
    return ""
async def arequest(self, method, url, headers=None, stream=None, ext=None):
    retry = 2
    while retry > 0:
        retry -= 1
        try:
            return await super().arequest(method, url, headers, stream, ext)
        except OSError as e:
            # socket.gaierror when DNS resolution fails
            raise httpcore.ConnectError(e)
        except httpcore.CloseError as e:
            # httpcore.CloseError: [Errno 104] Connection reset by peer
            # raised by _keepalive_sweep()
            # from https://github.com/encode/httpcore/blob/4b662b5c42378a61e54d673b4c949420102379f5/httpcore/_backends/asyncio.py#L198  # noqa
            await close_connections_for_url(self._pool, url)
            logger.warning('httpcore.CloseError: retry', exc_info=e)
            # retry
        except httpcore.RemoteProtocolError as e:
            # in case of httpcore.RemoteProtocolError: Server disconnected
            await close_connections_for_url(self._pool, url)
            logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e)
            # retry
        except (httpcore.ProtocolError, httpcore.NetworkError) as e:
            await close_connections_for_url(self._pool, url)
            raise e
async def close_connections_for_url(connection_pool: httpcore.AsyncConnectionPool,
                                    url: httpcore._utils.URL):
    origin = httpcore._utils.url_to_origin(url)
    logger.debug('Drop connections for %r', origin)

    connections_to_close = connection_pool._connections_for_origin(origin)
    for connection in connections_to_close:
        await connection_pool._remove_from_pool(connection)
        try:
            await connection.aclose()
        except httpcore.NetworkError as e:
            logger.warning('Error closing an existing connection', exc_info=e)
async def close_connections_for_url(connection_pool: httpcore.AsyncConnectionPool,
                                    url: httpx._models.URL):
    logger.debug('Drop connections for %r', url.host)

    connections_to_close = [conn for conn in connection_pool._pool if conn._origin == url.host]
    for connection in connections_to_close:
        connection_pool._pool.remove(connection)
        try:
            await connection.aclose()
        except httpx.NetworkError as e:
            logger.warning('Error closing an existing connection', exc_info=e)
def format_date_by_locale(date_string, locale_string):
    # strftime works only on dates after 1900
    parsed_date = dateutil.parser.parse(date_string)
    if parsed_date.year <= 1900:
        return parsed_date.isoformat().split('T')[0]

    orig_locale = locale.getlocale()[0]
    try:
        locale.setlocale(locale.LC_ALL, locale_string)
    except locale.Error:
        logger.warning('cannot set locale: {0}'.format(locale_string))

    formatted_date = parsed_date.strftime(locale.nl_langinfo(locale.D_FMT))

    try:
        locale.setlocale(locale.LC_ALL, orig_locale)
    except locale.Error:
        logger.warning('cannot set original locale: {0}'.format(orig_locale))

    return formatted_date
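# Hypothetical usage of format_date_by_locale() above; the locale name is an
# assumption and must actually be installed on the host (spellings vary by
# platform, 'de_DE.UTF-8' is the common Linux form):
print(format_date_by_locale('2014-03-08', 'de_DE.UTF-8'))  # e.g. '08.03.2014'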
def get_client_id():
    response = http_get("https://soundcloud.com")

    if response.ok:
        tree = html.fromstring(response.content)
        script_tags = tree.xpath("//script[contains(@src, '/assets/app')]")
        # extract valid app_js urls from soundcloud.com content
        app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]

        # get each app_js and search it for the client_id
        for app_js_url in app_js_urls:
            response = http_get(app_js_url)
            if response.ok:
                cids = cid_re.search(response.text)
                if cids is not None and len(cids.groups()):
                    return cids.groups()[0]

    logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
    return ""
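# The module-level cid_re used by the html.fromstring() variants is not part of
# these excerpts; a definition consistent with the inline regex of the
# etree-based variant above would be:
import re
cid_re = re.compile(r'client_id:"([^"]*)"', re.M | re.I)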
def search_multiple_requests(requests, result_container, start_time, timeout_limit):
    search_id = uuid4().__str__()

    for engine_name, query, request_params in requests:
        th = threading.Thread(
            target=search_one_request_safe,
            args=(engine_name, query, request_params, result_container, start_time, timeout_limit),
            name=search_id,
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == search_id:
            remaining_time = max(0.0, timeout_limit - (time() - start_time))
            th.join(remaining_time)
            if th.is_alive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))
def search_multiple_requests(requests, result_container, timeout_limit):
    start_time = time()
    search_id = uuid4().__str__()

    for engine_name, query, request_params in requests:
        th = threading.Thread(
            target=search_one_request,
            args=(engine_name, query, request_params, result_container, timeout_limit),
            name=search_id,
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == search_id:
            remaining_time = max(0.0, timeout_limit - (time() - start_time))
            th.join(remaining_time)
            if th.is_alive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))
def threaded_requests(requests):
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()

    for fn, url, request_args, engine_name in requests:
        request_args['timeout'] = timeout_limit
        th = threading.Thread(
            target=search_request_wrapper,
            args=(fn, url, engine_name),
            kwargs=request_args,
            name='search_request',
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.is_alive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))
async def handle_async_request(self, request: httpx.Request):
    retry = 2
    while retry > 0:
        retry -= 1
        try:
            return await super().handle_async_request(request)
        except (ProxyConnectionError, ProxyTimeoutError, ProxyError) as e:
            raise httpx.ProxyError(e)
        except OSError as e:
            # socket.gaierror when DNS resolution fails
            raise httpx.NetworkError(e)
        except httpx.RemoteProtocolError as e:
            # in case of httpx.RemoteProtocolError: Server disconnected
            await close_connections_for_url(self, request.url)
            logger.warning('httpx.RemoteProtocolError: retry', exc_info=e)
            # retry
        except (httpx.NetworkError, httpx.ProtocolError) as e:
            # an httpx.WriteError on an HTTP/2 connection leaves a newly opened
            # stream behind, so each subsequent request would open yet another
            # stream and raise the same WriteError
            await close_connections_for_url(self, request.url)
            raise e
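# A hedged wiring sketch (not from the original code) showing how a retrying
# transport subclass like the one above can be mounted on an httpx client.
# RetryingTransport is an illustrative name; it assumes httpx >= 0.18, where
# transports expose handle_async_request().
import httpx

class RetryingTransport(httpx.AsyncHTTPTransport):
    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        try:
            return await super().handle_async_request(request)
        except httpx.RemoteProtocolError:
            # one retry after the server dropped the connection mid-exchange
            return await super().handle_async_request(request)

client = httpx.AsyncClient(transport=RetryingTransport())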
def pre_request():
    # merge GET, POST vars
    preferences = Preferences(themes, categories.keys(), engines, plugins)
    try:
        preferences.parse_cookies(request.cookies)
    except Exception:
        # TODO throw error message to the user
        logger.warning('Invalid config')
    request.preferences = preferences

    request.form = dict(request.form.items())
    for k, v in request.args.items():
        if k not in request.form:
            request.form[k] = v

    request.user_plugins = []
    allowed_plugins = preferences.plugins.get_enabled()
    disabled_plugins = preferences.plugins.get_disabled()
    for plugin in plugins:
        if ((plugin.default_on and plugin.id not in disabled_plugins)
                or plugin.id in allowed_plugins):
            request.user_plugins.append(plugin)
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, results_xpath):
        single_result = {'template': template}

        for single_field in field_definition:
            single_field = {**default_field_settings, **single_field}
            try:
                if single_field['single_element']:
                    node = eval_xpath(result, single_field['xpath'])
                else:
                    node = eval_xpath_list(result, single_field['xpath'])

                extract = single_field.get('extract')
                if extract == 'url':
                    value = extract_url(node, search_url)
                elif extract == 'boolean':
                    value = (isinstance(node, list) and len(node) > 0)
                elif extract == 'boolean_negate':
                    value = (isinstance(node, list) and len(node) < 1)
                else:
                    value = extract_text(node)

                single_result[single_field['field_name']] = value
            except Exception as e:
                logger.warning('error in resolving field %s:\n%s', single_field['field_name'], e)
                single_result[single_field['field_name']] = unresolvable_value

        results.append(single_result)

    return results
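# Hypothetical shape of the configuration consumed by response() above; every
# field name and xpath here is illustrative, inferred only from how the loop
# reads single_element, xpath, extract and field_name.
template = 'default.html'
default_field_settings = {'single_element': False}
field_definition = [
    {'field_name': 'url', 'xpath': './/a/@href', 'extract': 'url', 'single_element': True},
    {'field_name': 'title', 'xpath': './/h3'},
    {'field_name': 'content', 'xpath': './/p[@class="abstract"]'},
    {'field_name': 'is_ad', 'xpath': './/span[@class="ad"]', 'extract': 'boolean'},
]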
def get_client_id():
    response = http_get("https://soundcloud.com")

    if response.ok:
        tree = html.fromstring(response.content)
        # script tags have been moved from the /assets/app/ to the /assets/ path;
        # the client_id was found in https://a-v2.sndcdn.com/assets/49-a0c01933-3.js
        script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
        # extract valid app_js urls from soundcloud.com content
        app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]

        # get each app_js and search it for the client_id, last script first
        for app_js_url in app_js_urls[::-1]:
            response = http_get(app_js_url)
            if response.ok:
                cids = cid_re.search(response.content.decode())
                if cids is not None and len(cids.groups()):
                    return cids.groups()[0]

    logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
    return ""
def get_proxy():
    try:
        args = settings['outgoing']['haipproxy_redis']
        fetcher = ProxyFetcher('zhihu', strategy='greedy', redis_args=args)
        proxy = fetcher.get_proxy()
        if proxy:
            return {'http': proxy}
        else:
            logger.warning('No available proxy fetched from the proxy pool.')
    except Exception:
        logger.warning('Exception in fetching proxy.')
        # format_exc() returns the traceback as a string; print_exc() would
        # print to stderr and log a bare "None"
        logger.warning(traceback.format_exc())
try:
    import OpenSSL.SSL  # NOQA
except ImportError:
    logger.critical("The pyopenssl package has to be installed.\n"
                    "Some HTTPS connections will fail")

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO

if sys.version_info[0] == 3:
    unicode = str
    PY3 = True
else:
    PY3 = False
    logger.warning('\033[1;31m *** Deprecation Warning ***\033[0m')
    logger.warning('\033[1;31m Python2 is deprecated\033[0m')

# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))

# about static
static_path = get_resources_directory(searx_dir, 'static', settings['ui']['static_path'])
logger.debug('static directory is %s', static_path)
static_files = get_static_files(static_path)

# about templates
default_theme = settings['ui']['default_theme']
templates_path = get_resources_directory(searx_dir, 'templates', settings['ui']['templates_path'])
logger.debug('templates directory is %s', templates_path)
try:
    import OpenSSL.SSL  # NOQA
except ImportError:
    logger.critical("The pyopenssl package has to be installed.\n"
                    "Some HTTPS connections will fail")

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO

if sys.version_info[0] == 3:
    unicode = str
    PY3 = True
else:
    logger.warning('\033[1;31m Python2 is no longer supported\033[0m')
    exit(1)

# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))

# about static
static_path = get_resources_directory(searx_dir, 'static', settings['ui']['static_path'])
logger.debug('static directory is %s', static_path)
static_files = get_static_files(static_path)

# about templates
default_theme = settings['ui']['default_theme']
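# Illustrative settings fragment matching the lookups above; the keys are
# assumptions inferred only from the .get() and [] accesses in these excerpts:
settings = {
    'server': {'http_protocol_version': '1.1'},
    'ui': {'static_path': '', 'templates_path': '', 'default_theme': 'oscar'},
}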