def test_bytes(self):
    for secret_key in ['secret', b'secret', 1]:
        if secret_key == 1:
            with self.assertRaises(TypeError):
                utils.new_hmac(secret_key, b'http://example.com')
            continue
        res = utils.new_hmac(secret_key, b'http://example.com')
        self.assertEqual(
            res,
            '23e2baa2404012a5cc8e4a18b4aabf0dde4cb9b56f679ddc0fd6d7c24339d819')
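# A minimal sketch of the new_hmac helper the test above exercises, assuming
# HMAC-SHA256 with a hex digest (the 64-hex-character expected value is
# consistent with SHA-256), utf-8 encoding for str keys, and a TypeError for
# any other key type. Illustrative only, not necessarily the exact searx code.
import hashlib
import hmac


def new_hmac_sketch(secret_key, url):
    if isinstance(secret_key, bytes):
        key = secret_key
    elif isinstance(secret_key, str):
        key = secret_key.encode('utf-8')
    else:
        raise TypeError('secret_key must be str or bytes')
    return hmac.new(key, url, hashlib.sha256).hexdigest()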
def image_proxify(url):
    if url.startswith('//'):
        url = 'https:' + url

    if not request.preferences.get_value('image_proxy'):
        return url

    if url.startswith('data:image/'):
        # 50 is an arbitrary number to get only the beginning of the image.
        partial_base64 = url[len('data:image/'):50].split(';')
        if len(partial_base64) == 2 \
                and partial_base64[0] in ['gif', 'png', 'jpeg', 'pjpeg', 'webp', 'tiff', 'bmp'] \
                and partial_base64[1].startswith('base64,'):
            return url
        else:
            return None

    if settings.get('result_proxy'):
        return proxify(url)

    h = new_hmac(settings['server']['secret_key'], url.encode('utf-8'))

    return '{0}?{1}'.format(url_for('image_proxy'),
                            urlencode(dict(url=url.encode('utf-8'), h=h)))
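# Self-contained sketch of the data-URI whitelist check above, so the logic
# can be unit-tested without a Flask request context; the helper name is
# hypothetical.
def is_allowed_data_image(url):
    if not url.startswith('data:image/'):
        return False
    partial_base64 = url[len('data:image/'):50].split(';')
    return (len(partial_base64) == 2
            and partial_base64[0] in ['gif', 'png', 'jpeg', 'pjpeg', 'webp', 'tiff', 'bmp']
            and partial_base64[1].startswith('base64,'))


# e.g. is_allowed_data_image('data:image/png;base64,iVBORw0KGgo=') -> True
#      is_allowed_data_image('data:image/svg+xml;base64,PHN2Zz4=') -> False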
def response(resp):
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    try:
        results.append({'number_of_results': int(
            dom.xpath('//span[@class="nums"]/text()')[0]
            .split(u'\u7ea6')[1].split(u'\u4e2a')[0].replace(',', ''))})
    except Exception:
        sentry.captureException()

    # parse results
    for result in dom.xpath('//li[@class="res-list"]'):
        try:
            title = extract_text(result.xpath('.//h3')[0])
            url = result.xpath('.//h3/a')[0].attrib.get('href')

            # initialize content so a result without any matching snippet
            # container cannot raise a NameError below
            content = ''
            try:
                if result.xpath('.//p[@class="res-desc"]'):
                    content = extract_text(result.xpath('.//p[@class="res-desc"]'))
                if result.xpath('.//div[starts-with(@class,"res-rich")]'):
                    content = extract_text(result.xpath('.//div[starts-with(@class,"res-rich")]'))
                if result.xpath('.//div[@class="cont mh-pc-hover"]'):
                    content = extract_text(result.xpath('.//div[@class="cont mh-pc-hover"]'))
                if result.xpath('.//div[@class="g-card g-shadow"]'):
                    content = extract_text(result.xpath('.//div[@class="g-card g-shadow"]'))
                if result.xpath('.//p[@class="mh-more"]'):
                    content = extract_text(result.xpath('.//p[@class="mh-more"]'))
            except Exception:
                content = ''
                sentry.captureException()

            # rewrite redirect links through the result proxy
            if 'www.so.com/link?' in url:
                url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                    parse.quote(url) + "&token=" + \
                    new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                try:
                    showurl = extract_text(result.xpath(".//p[@class='res-linkinfo']/cite"))
                    if len(showurl) == 0:
                        showurl = url
                except Exception:
                    showurl = url
                    sentry.captureException()
            else:
                showurl = url

            # append result
            results.append({'url': url,
                            'showurl': showurl,
                            'title': title,
                            'content': content})
        except Exception:
            sentry.captureException()

    # return results
    return results
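# The proxy-URL assembly above recurs in the baidu and sogou engines below.
# A self-contained sketch of that step (the helper name and parameters are
# illustrative; new_hmac is assumed to be HMAC-SHA256 with a hex digest,
# consistent with the test at the top of this section):
import hashlib
import hmac
from urllib import parse


def proxify_redirect_url(redirect_url, server_name, key):
    # the token is computed over the original redirect url, which is what
    # the url_proxy endpoint recomputes and compares on the other side
    token = hmac.new(key.encode('utf-8'), redirect_url.encode('utf-8'),
                     hashlib.sha256).hexdigest()
    return (server_name + '/url_proxy?proxyurl=' + parse.quote(redirect_url)
            + '&token=' + token)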
def image_proxy():
    # request.args.get returns None for a missing parameter; check before
    # encoding to avoid an AttributeError
    url = request.args.get('url')
    if not url:
        return '', 400
    url = url.encode('utf-8')

    h = new_hmac(settings['server']['secret_key'], url)

    if h != request.args.get('h'):
        return '', 400

    headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = requests.get(url,
                        stream=True,
                        timeout=settings['outgoing']['request_timeout'],
                        headers=headers,
                        proxies=outgoing_proxies)

    if resp.status_code == 304:
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(
            resp.headers.get('content-type')))
        return '', 400

    img = b''
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date',
                                         'Last-Modified', 'Expires', 'Etag'})

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
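# Sketch of how a test could build a valid query string for the image_proxy
# endpoint above; signed_image_proxy_query is hypothetical, and new_hmac is
# the helper exercised by test_bytes at the top of this section.
from urllib.parse import urlencode


def signed_image_proxy_query(image_url, secret_key):
    h = new_hmac(secret_key, image_url.encode('utf-8'))
    return '/image_proxy?' + urlencode({'url': image_url, 'h': h})


# e.g. signed_image_proxy_query('https://example.com/a.png', 'secret')
# -> '/image_proxy?url=https%3A%2F%2Fexample.com%2Fa.png&h=<hex digest>'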
def url_proxy():
    """Resolve the real target URL behind baidu, sogou and 360so redirect links."""
    url = request.args.get('proxyurl')
    token = request.args.get('token')
    if token != new_hmac(settings['result_proxy']['key'], url.encode('utf-8')):
        return render('404.html'), 404

    if "www.baidu.com/link?url" in url:
        try:
            resp = requests.head(url, timeout=1)
        except requests.exceptions.Timeout:
            return redirect(url)
        if resp.status_code == 200:
            realurl = resp.url
        else:
            realurl = url
        return redirect(realurl)
    else:
        try:
            resp = requests.get(url, timeout=1)
        except requests.exceptions.Timeout:
            return redirect(url)
        if resp.status_code == 200:
            if "http:" not in resp.text and "https:" not in resp.text:
                # try to fix the response by inserting the host into the
                # window.location.replace() call
                resp_content = resp.text.strip()
                count = resp_content.index("window.location.replace(")
                str_content = list(resp_content)
                # 25 is len("window.location.replace(") + 1
                str_content.insert(count + 25, "https://" + urlparse(url)[1])
                resp_content = "".join(str_content)
                return resp_content
            else:
                # return the html response so the embedded redirect is followed
                return resp.content
        else:
            return redirect(url)
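# Isolated sketch of the window.location.replace() fixup performed above,
# assuming the redirect page calls it with a scheme-less quoted path such as
# window.location.replace("/link?url=..."); the helper name is hypothetical.
from urllib.parse import urlparse


def fix_location_replace(page_text, source_url):
    marker = 'window.location.replace('
    pos = page_text.index(marker)
    chars = list(page_text)
    # insert the scheme and host right after the opening quote that follows
    # the marker (len(marker) + 1 == 25, as in the code above)
    chars.insert(pos + len(marker) + 1, 'https://' + urlparse(source_url).netloc)
    return ''.join(chars)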
def response(resp):
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    try:
        results.append({'number_of_results': int(
            dom.xpath('//span[@class="nums_text"]/text()')[0]
            .split(u'\u7ea6')[1].split(u'\u4e2a')[0].replace(',', ''))})
    except Exception:
        sentry.captureException()

    # parse results
    for result in dom.xpath('//div[@class="result c-container "]'):
        title = extract_text(result.xpath('.//h3/a')[0])

        # when the search query is Chinese words
        try:
            url = result.xpath('.//div[@class="f13"]/a')[0].attrib.get('href')
            # generate the miji url from the baidu redirect url
            url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
            content = extract_text(
                (result.xpath('.//div[@class="c-abstract"]') or
                 result.xpath('.//div[@class="c-abstract c-abstract-en"]'))[0])
            showurl = extract_text(result.xpath('.//div[@class="f13"]/a')).replace('百度快照', '')
            if len(showurl.strip()) == 0:
                showurl = re.findall(WEB_URL_REGEX, content)[0]
                showurl = showurl.lstrip('.')
            if len(showurl.strip()) == 0:
                showurl = url

            # append result
            results.append({'url': url,
                            'showurl': showurl,
                            'title': title,
                            'content': content})
        # when the search query is English words
        except Exception:
            try:
                url = result.xpath('.//h3[@class="t"]/a')[0].attrib.get('href')
                showurl = extract_text(result.xpath('.//div[@class="f13"]/a')) \
                    .replace('百度快照', '').replace('翻译此页', '')
                content = extract_text(result.xpath('.//div[@class="c-span18 c-span-last"]')[0])
                # generate the miji url from the baidu redirect url
                url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                    url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                if len(showurl.strip()) == 0:
                    showurl = re.findall(WEB_URL_REGEX, content)[0]
                    showurl = showurl.lstrip('.')
                if len(showurl.strip()) == 0:
                    showurl = url

                # append result
                results.append({'url': url,
                                'showurl': showurl,
                                'title': title,
                                'content': content})
            except Exception:
                sentry.captureException()

    # return results
    return results
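# The showurl fallback above pulls a displayable URL out of the abstract text
# with WEB_URL_REGEX; a rough, self-contained sketch of that behaviour (the
# engine's real WEB_URL_REGEX is considerably more elaborate):
import re

WEB_URL_REGEX_SKETCH = r'(?:https?://)?(?:[\w-]+\.)+[a-z]{2,}(?:/\S*)?'


def first_displayable_url(text):
    found = re.findall(WEB_URL_REGEX_SKETCH, text)
    return found[0].lstrip('.') if found else ''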
def response(resp):
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    try:
        results.append({'number_of_results': int(
            dom.xpath('//p[@class="num-tips"]/text()')[0]
            .split(u'\u7ea6')[1].split(u'\u6761')[0].replace(',', ''))})
    except Exception:
        sentry.captureException()

    # parse results
    try:
        for result in dom.xpath('//div[@class="vrwrap"]'):
            try:
                href = result.xpath('.//a')[0].attrib.get('href')
                url = href if href.startswith("http") else "https://sogou.com" + href

                # parse weixin.sogou html
                if "http://weixin.sogou.com/" == url.strip():
                    url = result.xpath(
                        './/div[@class="str-pd-box str-pd-none"]//a')[0].attrib.get('href')
                    title = extract_text(result.xpath(
                        './/div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a')[0])
                    content = extract_text(result.xpath(
                        './/div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]')[0])
                else:
                    title = extract_text(result.xpath('.//h3/a')[0])
                    content = extract_text(result.xpath('.//div')[0])

                if 'sogou.com/link?url' in url:
                    url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                        url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                    showurl = re.findall(WEB_URL_REGEX,
                                         extract_text(result.xpath('.//div[@class="fb"]')))[0]
                    showurl = showurl.lstrip('.')
                else:
                    showurl = url

                # append result
                results.append({'url': url,
                                'showurl': showurl,
                                'title': title,
                                'content': content})
            except Exception:
                sentry.captureException()
                continue
    except Exception:
        sentry.captureException()

    # parse sogou weixin html
    try:
        for result in dom.xpath('//div[@class="rb"]'):
            try:
                href = result.xpath('.//a')[0].attrib.get('href')
                url = href if href.startswith("http") else "https://sogou.com" + href

                # parse weixin.sogou html
                if "http://weixin.sogou.com/" == url.strip():
                    url = result.xpath(
                        './/div[@class="str-pd-box str-pd-none"]//a')[0].attrib.get('href')
                    title = extract_text(result.xpath(
                        './/div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a')[0])
                    content = extract_text(result.xpath(
                        './/div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]')[0])
                else:
                    title = extract_text(result.xpath('.//h3/a')[0])
                    content = extract_text(result.xpath('.//div')[0])

                if 'sogou.com/link?url' in url:
                    url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                        url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                    showurl = re.findall(WEB_URL_REGEX,
                                         extract_text(result.xpath('.//div[@class="fb"]')))[0]
                    showurl = showurl.lstrip('.')
                else:
                    showurl = url

                # append result
                results.append({'url': url,
                                'showurl': showurl,
                                'title': title,
                                'content': content})
            except Exception:
                sentry.captureException()
                continue
    except Exception:
        sentry.captureException()

    # return results
    return results
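# The href normalisation repeated in both loops above, factored into a
# self-contained sketch (the helper name is hypothetical):
def absolutize(href, base='https://sogou.com'):
    # sogou result pages mix absolute links and site-relative paths
    return href if href.startswith('http') else base + href


# e.g. absolutize('/link?url=abc') -> 'https://sogou.com/link?url=abc'
#      absolutize('https://example.com/') -> 'https://example.com/'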