def image_proxy():
    """Fetch a remote image and relay it to the client.

    The ``url`` query parameter must be signed with an HMAC (``h`` parameter)
    derived from the server secret key, so only URLs issued by this instance
    can be proxied. Returns the image body on success, or an empty response
    with an appropriate HTTP status on any failure.
    """
    url = request.args.get("url")
    # Check presence BEFORE encoding: the original called .encode() directly
    # on args.get("url"), which raises AttributeError (an HTTP 500) when the
    # parameter is missing instead of returning the intended 400.
    if not url:
        return "", 400

    url = url.encode()
    h = new_hmac(settings["server"]["secret_key"], url)
    if h != request.args.get("h"):
        return "", 400

    # Forward only the caching-related request headers, plus our own UA.
    headers = dict_subset(request.headers, {"If-Modified-Since", "If-None-Match"})
    headers["User-Agent"] = gen_useragent()

    resp = requests.get(
        url,
        stream=True,
        timeout=settings["outgoing"]["request_timeout"],
        headers=headers,
        proxies=get_global_proxies(),
    )

    if resp.status_code == 304:
        # Not modified: let the browser use its cached copy.
        return "", resp.status_code

    if resp.status_code != 200:
        logger.debug("image-proxy: wrong response code: {0}".format(
            resp.status_code))
        if resp.status_code >= 400:
            return "", resp.status_code
        return "", 400

    if not resp.headers.get("content-type", "").startswith("image/"):
        logger.debug("image-proxy: wrong content-type: {0}".format(
            resp.headers.get("content-type")))
        return "", 400

    # Collect the 1 MiB chunks in a list and join once at the end — repeated
    # `bytes +=` in a loop is quadratic in the worst case.
    chunks = []
    for chunk_counter, chunk in enumerate(resp.iter_content(1024 * 1024), start=1):
        if chunk_counter > 5:
            return "", 502  # Bad gateway - file is too big (>5M)
        chunks.append(chunk)
    img = b"".join(chunks)

    headers = dict_subset(
        resp.headers,
        {
            "Content-Length", "Length", "Date", "Last-Modified", "Expires",
            "Etag"
        },
    )
    return Response(img, mimetype=resp.headers["content-type"], headers=headers)
def image_proxy():
    """Stream a remote image to the client through the instance.

    The ``url`` query parameter must carry a valid HMAC signature in ``h``.
    The upstream response is streamed (never fully buffered) and truncated
    at 5 MiB.
    """
    url = request.args.get('url')
    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())
    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024

    resp = None
    forward_resp = False
    try:
        # Forward caching headers from the browser, plus our own UA.
        headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
        headers['User-Agent'] = gen_useragent()
        stream = http_stream(
            method='GET',
            url=url,
            headers=headers,
            timeout=settings['outgoing']['request_timeout'],
            allow_redirects=True,
            max_redirects=20)
        resp = next(stream)
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit() and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code == 304:
            # Not modified: the browser can keep its cached copy.
            return '', resp.status_code

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('content-type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
            return '', 400

        # Past all checks: the stream is handed to Flask, which consumes and
        # closes it — do NOT close it in the finally block below.
        forward_resp = True

        headers = dict_subset(resp.headers,
                              {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

        total_length = 0

        def forward_chunk():
            nonlocal total_length
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    # stop forwarding: upstream body exceeds the size cap
                    break
                yield chunk

        return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers)
    except httpx.HTTPError:
        return '', 400
    finally:
        # BUGFIX: on every early-return/error path the upstream response was
        # left open, leaking the connection between this server and the
        # image host. Close it whenever we are not forwarding the stream.
        if resp is not None and not forward_resp:
            try:
                resp.close()
            except httpx.HTTPError:
                pass
def image_proxy():
    """Fetch a remote image and relay it to the client.

    The ``url`` parameter is authenticated by the sha256 digest in ``h``
    (url + server secret key). On success the raw image bytes are returned;
    any failure yields an empty body with an appropriate status code.
    """
    url = request.args.get('url')
    # Guard BEFORE .encode(): encoding args.get('url') directly raises
    # AttributeError (HTTP 500) when the parameter is missing, instead of
    # the intended 400.
    if not url:
        return '', 400
    url = url.encode('utf-8')

    h = hashlib.sha256(
        url + settings['server']['secret_key'].encode('utf-8')).hexdigest()

    if h != request.args.get('h'):
        return '', 400

    # Forward only caching-related request headers, plus our own UA.
    headers = dict_subset(request.headers,
                          {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = requests.get(url,
                        stream=True,
                        timeout=settings['outgoing']['request_timeout'],
                        headers=headers,
                        proxies=outgoing_proxies)

    if resp.status_code == 304:
        # Not modified: the browser can keep its cached copy.
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(
            resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(
            resp.headers.get('content-type')))
        return '', 400

    # BUGFIX: iter_content yields bytes; the original accumulated into a str
    # (img = ''), raising TypeError on the first chunk under Python 3.
    # Collect chunks and join once (linear instead of quadratic).
    chunks = []
    chunk_counter = 0
    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        chunks.append(chunk)
    img = b''.join(chunks)

    headers = dict_subset(resp.headers, {
        'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'
    })

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
def image_proxy():
    """Fetch a remote image and relay it to the client.

    ``url`` must be accompanied by ``h``, the sha256 digest of
    url + server secret key. Returns the image bytes, or an empty body with
    an error status.
    """
    url = request.args.get('url')
    # Guard BEFORE .encode(): encoding args.get('url') directly raises
    # AttributeError (HTTP 500) when the parameter is missing, instead of
    # the intended 400.
    if not url:
        return '', 400
    url = url.encode('utf-8')

    h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest()
    if h != request.args.get('h'):
        return '', 400

    # Forward only caching-related request headers, plus our own UA.
    headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = requests.get(url,
                        stream=True,
                        timeout=settings['outgoing']['request_timeout'],
                        headers=headers,
                        proxies=outgoing_proxies)

    if resp.status_code == 304:
        # Not modified: the browser can keep its cached copy.
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
        return '', 400

    # BUGFIX: iter_content yields bytes; the original accumulated into a str
    # (img = ''), raising TypeError on the first chunk under Python 3.
    # Collect chunks and join once (linear instead of quadratic).
    chunks = []
    chunk_counter = 0
    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        chunks.append(chunk)
    img = b''.join(chunks)

    headers = dict_subset(resp.headers,
                          {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
def image_proxy():
    """Fetch a remote image and relay it to the client.

    ``url`` must be accompanied by ``h``, the sha256 digest of
    url + server secret key. Returns the image bytes, or an empty body with
    an error status.
    """
    url = request.args.get("url")
    # Guard BEFORE .encode(): encoding args.get("url") directly raises
    # AttributeError (HTTP 500) when the parameter is missing, instead of
    # the intended 400.
    if not url:
        return "", 400
    url = url.encode("utf-8")

    h = hashlib.sha256(url + settings["server"]["secret_key"].encode("utf-8")).hexdigest()
    if h != request.args.get("h"):
        return "", 400

    # Forward only caching-related request headers, plus our own UA.
    headers = dict_subset(request.headers, {"If-Modified-Since", "If-None-Match"})
    headers["User-Agent"] = gen_useragent()

    resp = requests.get(
        url,
        stream=True,
        timeout=settings["outgoing"]["request_timeout"],
        headers=headers,
        proxies=outgoing_proxies
    )

    if resp.status_code == 304:
        # Not modified: the browser can keep its cached copy.
        return "", resp.status_code

    if resp.status_code != 200:
        logger.debug("image-proxy: wrong response code: {0}".format(resp.status_code))
        if resp.status_code >= 400:
            return "", resp.status_code
        return "", 400

    if not resp.headers.get("content-type", "").startswith("image/"):
        logger.debug("image-proxy: wrong content-type: {0}".format(resp.headers.get("content-type")))
        return "", 400

    # BUGFIX: iter_content yields bytes; the original accumulated into a str
    # (img = ""), raising TypeError on the first chunk under Python 3.
    # Collect chunks and join once (linear instead of quadratic).
    chunks = []
    chunk_counter = 0
    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return "", 502  # Bad gateway - file is too big (>5M)
        chunks.append(chunk)
    img = b"".join(chunks)

    headers = dict_subset(resp.headers, {"Content-Length", "Length", "Date", "Last-Modified", "Expires", "Etag"})

    return Response(img, mimetype=resp.headers["content-type"], headers=headers)
def response(resp):
    """Extract search results from the HTML result-table response.

    Result rows come in groups of four <tr> elements (title, snippet, and
    two auxiliary rows); sponsored rows are skipped. Returns a list of
    dicts with 'title', 'content' and 'url' keys.
    """
    # Issue the ping request, reusing a subset of our own request headers.
    ping_headers = dict_subset(resp.request.headers,
                               ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
    get(url_ping, headers=ping_headers)

    if resp.status_code == 303:
        return []

    results = []
    doc = fromstring(resp.text)

    tables = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
    if len(tables) < 3:
        # fewer than three tables means there are no more results
        return []

    # The results live in the third table; the last <tr> only holds the
    # form with the 'previous/next page' links, so drop it.
    rows = eval_xpath(tables[2], './/tr')[:-1]

    # Each result occupies four consecutive rows: title, content, and two
    # rows we do not need.
    for start in range(0, len(rows) - 3, 4):
        title_row = rows[start]
        content_row = rows[start + 1]

        # ignore sponsored Adds <tr class="result-sponsored">
        if content_row.get('class') == 'result-sponsored':
            continue

        link = eval_xpath_getindex(title_row, './/td//a[@class="result-link"]', 0, None)
        if link is None:
            continue

        snippet_cell = eval_xpath_getindex(content_row, './/td[@class="result-snippet"]', 0, None)
        if snippet_cell is None:
            continue

        results.append({
            'title': link.text_content(),
            'content': extract_text(snippet_cell),
            'url': link.get('href'),
        })

    return results
def response(resp):
    """Parse up to 30 search results (plus spelling corrections) from the
    engine's HTML response.

    Returns a list of result dicts ('title'/'content'/'url') followed by
    correction dicts ('correction').
    """
    if resp.status_code == 303:
        return []

    # ping
    headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
    get(url_ping, headers=headers_ping)

    # parse the response
    results = []
    doc = fromstring(resp.text)
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        # BUGFIX: this was a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit and hides real bugs. The only
        # expected failure of `[-1]` is an IndexError on an empty node list.
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except IndexError:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title, 'content': content, 'url': res_url})

    # parse correction
    for correction in eval_xpath(doc, correction_xpath):
        # append correction
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def image_proxy():
    # pylint: disable=too-many-return-statements, too-many-branches
    # Stream a remote image through the instance. The "url" parameter must
    # be signed with an HMAC ("h" parameter) derived from the server secret
    # key. Validation happens in a first phase; only when every check passes
    # is the open upstream stream handed to Flask in a second phase.
    url = request.args.get('url')
    if not url:
        return '', 400

    # Reject URLs not signed by this instance.
    h = new_hmac(settings['server']['secret_key'], url.encode())
    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024  # 5 MiB cap on the proxied image

    # forward_resp flips to True only once all checks pass; the finally
    # block below uses it to decide whether the upstream response must be
    # closed here or will be consumed (and closed) by Flask.
    forward_resp = False
    resp = None
    try:
        request_headers = {
            'User-Agent': gen_useragent(),
            'Accept': 'image/webp,*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Sec-GPC': '1',
            'DNT': '1',
        }
        set_context_network_name('image_proxy')
        # Open a streaming GET; next(stream) yields the response object
        # before any body bytes are read.
        stream = http_stream(method='GET',
                             url=url,
                             headers=request_headers,
                             timeout=settings['outgoing']['request_timeout'],
                             follow_redirects=True,
                             max_redirects=20)
        resp = next(stream)

        # Fast reject via the declared length, before reading any body.
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit(
        ) and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(
                resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        # Only forward actual images.
        if not resp.headers.get('Content-Type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: %s',
                         resp.headers.get('Content-Type', ''))
            return '', 400

        forward_resp = True
    except httpx.HTTPError:
        logger.exception('HTTP error')
        return '', 400
    finally:
        if resp and not forward_resp:
            # the code is about to return an HTTP 400 error to the browser
            # we make sure to close the response between searxng and the HTTP server
            try:
                resp.close()
            except httpx.HTTPError:
                logger.exception('HTTP error on closing')

    # Second phase: all checks passed — forward headers and stream the body.
    try:
        headers = dict_subset(
            resp.headers,
            {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'})

        def forward_chunk():
            # Yield body chunks, stopping once the size cap is exceeded.
            total_length = 0
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(),
                        mimetype=resp.headers['Content-Type'],
                        headers=headers)
    except httpx.HTTPError:
        return '', 400