Example #1
def test_bytes(self):
    # valid secrets may be str or bytes; an int must raise TypeError
    for secret_key in ['secret', b'secret', 1]:
        if secret_key == 1:
            with self.assertRaises(TypeError):
                utils.new_hmac(secret_key, b'http://example.com')
            continue
        res = utils.new_hmac(secret_key, b'http://example.com')
        self.assertEqual(
            res,
            '23e2baa2404012a5cc8e4a18b4aabf0dde4cb9b56f679ddc0fd6d7c24339d819')
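For context, here is a minimal sketch of a new_hmac implementation that this test would pass against, assuming an HMAC-SHA256 hex digest and str-or-bytes secret keys (an illustration, not necessarily searx's exact code):

import hashlib
import hmac


def new_hmac(secret_key, url):
    # accept str or bytes secrets; anything else (e.g. an int) raises TypeError
    if isinstance(secret_key, str):
        secret_key = secret_key.encode('utf-8')
    elif not isinstance(secret_key, bytes):
        raise TypeError('secret_key must be str or bytes')
    return hmac.new(secret_key, url, hashlib.sha256).hexdigest()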
Example #2
def image_proxify(url):

    if url.startswith('//'):
        url = 'https:' + url

    if not request.preferences.get_value('image_proxy'):
        return url

    if url.startswith('data:image/'):
        # 50 is an arbitrary number to get only the beginning of the image.
        partial_base64 = url[len('data:image/'):50].split(';')
        if len(partial_base64) == 2 \
           and partial_base64[0] in ['gif', 'png', 'jpeg', 'pjpeg', 'webp', 'tiff', 'bmp']\
           and partial_base64[1].startswith('base64,'):
            return url
        else:
            return None

    if settings.get('result_proxy'):
        return proxify(url)

    h = new_hmac(settings['server']['secret_key'], url.encode('utf-8'))

    return '{0}?{1}'.format(url_for('image_proxy'),
                            urlencode(dict(url=url.encode('utf-8'), h=h)))
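The data-URI allowlist above can also be read as a standalone predicate; the following hypothetical helper mirrors that guard one-to-one, so it can be tested in isolation:

def is_proxiable_data_image(url):
    # hypothetical helper replicating the check in image_proxify above
    if not url.startswith('data:image/'):
        return False
    # 50 is an arbitrary cut-off: only the beginning of the URI is inspected
    partial_base64 = url[len('data:image/'):50].split(';')
    return (len(partial_base64) == 2
            and partial_base64[0] in ['gif', 'png', 'jpeg', 'pjpeg', 'webp', 'tiff', 'bmp']
            and partial_base64[1].startswith('base64,'))

# e.g. is_proxiable_data_image('data:image/png;base64,iVBORw0KGgo=') returns True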
Example #3
def response(resp):
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)

    try:
        results.append({'number_of_results': int(dom.xpath('//span[@class="nums"]/text()')[0]
                                                 .split(u'\u7ea6')[1].split(u'\u4e2a')[0].replace(',', ''))})
    except Exception:
        sentry.captureException()

    # parse results
    for result in dom.xpath('//li[@class="res-list"]'):
        try:
            title = extract_text(result.xpath('.//h3')[0])
            url = result.xpath('.//h3/a')[0].attrib.get('href')
            content = ''  # default in case none of the description blocks below match
            try:
                if result.xpath('.//p[@class="res-desc"]'):
                    content = extract_text(result.xpath('.//p[@class="res-desc"]'))
                if result.xpath('.//div[starts-with(@class,"res-rich")]'):
                    content = extract_text(result.xpath('.//div[starts-with(@class,"res-rich")]'))
                if result.xpath('.//div[@class="cont mh-pc-hover"]'):
                    content = extract_text(result.xpath('.//div[@class="cont mh-pc-hover"]'))
                if result.xpath('.//div[@class="g-card g-shadow"]'):
                    content = extract_text(result.xpath('.//div[@class="g-card g-shadow"]'))
                if result.xpath('.//p[@class="mh-more"]'):
                    content = extract_text(result.xpath('.//p[@class="mh-more"]'))
            except Exception:
                content = ''
                sentry.captureException()

            # append result
            if 'www.so.com/link?' in url:
                url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + parse.quote(
                    url) + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                try:
                    showurl = extract_text(result.xpath(".//p[@class='res-linkinfo']/cite"))
                    if len(showurl) == 0:
                        showurl = url
                except Exception:
                    showurl = url
                    sentry.captureException()
            else:
                showurl = url
            results.append({'url': url,
                            'showurl': showurl,
                            'title': title,
                            'content': content})
        except Exception:
            sentry.captureException()

    # return results
    return results
Example #4
def image_proxy():
    # default to '' so a missing ?url= parameter hits the 400 below
    # instead of raising AttributeError on None
    url = request.args.get('url', '').encode('utf-8')

    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url)

    if h != request.args.get('h'):
        return '', 400

    headers = dict_subset(request.headers,
                          {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = requests.get(url,
                        stream=True,
                        timeout=settings['outgoing']['request_timeout'],
                        headers=headers,
                        proxies=outgoing_proxies)

    if resp.status_code == 304:
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(
            resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(
            resp.headers.get('content-type')))
        return '', 400

    img = b''
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers, {
        'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'
    })

    return Response(img,
                    mimetype=resp.headers['content-type'],
                    headers=headers)
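One caveat: the check h != request.args.get('h') compares the digests with ordinary string equality, which can leak timing information. A hardened variant of just that check might use hmac.compare_digest (a sketch, with the rest of the handler unchanged):

import hmac

h = new_hmac(settings['server']['secret_key'], url)

# constant-time comparison; the '' default also covers a missing h parameter
if not hmac.compare_digest(h, request.args.get('h', '')):
    return '', 400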
Example #5
def image_proxify(url):

    if url.startswith('//'):
        url = 'https:' + url

    if not request.preferences.get_value('image_proxy'):
        return url

    if settings.get('result_proxy'):
        return proxify(url)

    h = new_hmac(settings['server']['secret_key'], url.encode('utf-8'))

    return '{0}?{1}'.format(url_for('image_proxy'),
                            urlencode(dict(url=url.encode('utf-8'), h=h)))
Example #6
def url_proxy():
    """Resolve the real URL behind baidu, sogou and 360sousuo redirect links."""

    url = request.args.get('proxyurl')
    token = request.args.get('token')
    if token != new_hmac(settings['result_proxy']['key'], url.encode('utf-8')):
        return render('404.html'), 404

    if "www.baidu.com/link?url" in url:
        try:
            resp = requests.head(url, timeout=1)
        except requests.exceptions.Timeout:
            return redirect(url)

        if resp.status_code == 200:
            realurl = resp.url
        else:
            realurl = url
        return redirect(realurl)
    else:
        try:
            resp = requests.get(url, timeout=1)
        except requests.exceptions.Timeout:
            return redirect(url)

        if resp.status_code == 200:
            if "http:" not in resp.text and "https:" not in resp.text:
                # try to fix response with host in window.location.replace function
                resp_content = resp.text.strip()
                count = resp_content.index("window.location.replace(")
                str_content = list(resp_content)
                # 25 is len("window.location.replace(")+1
                str_content.insert(count + 25, "https://" + urlparse(url)[1])
                resp_content = "".join(str_content)
                return resp_content
            else:
                # to get url from html response
                return resp.content
        else:
            return redirect(url)
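To tie the pieces together, here is a minimal sketch of the token round trip that url_proxy expects, assuming a new_hmac like the sketch under Example #1 and a hypothetical result_proxy key:

from urllib import parse

key = 'result-proxy-key'                      # hypothetical settings['result_proxy']['key']
target = 'https://www.so.com/link?m=example'  # redirect URL taken from a result page
token = new_hmac(key, target.encode('utf-8'))
proxied = '/url_proxy?proxyurl=' + parse.quote(target) + '&token=' + token
# url_proxy recomputes new_hmac over proxyurl and answers 404
# when the recomputed digest does not match token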
Example #7
def response(resp):
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    try:
        results.append({
            'number_of_results':
            int(
                dom.xpath('//span[@class="nums_text"]/text()')[0].split(
                    u'\u7ea6')[1].split(u'\u4e2a')[0].replace(',', ''))
        })
    except Exception:
        sentry.captureException()

    # parse results
    for result in dom.xpath('//div[@class="result c-container "]'):
        title = extract_text(result.xpath('.//h3/a')[0])

        # result layout used when the search query is in Chinese
        try:
            url = result.xpath('.//div[@class="f13"]/a')[0].attrib.get('href')
            # To generate miji url with baidu url
            url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
            content = extract_text(
                (result.xpath('.//div[@class="c-abstract"]') or
                 result.xpath('.//div[@class="c-abstract c-abstract-en"]'))[0])
            showurl = extract_text(
                result.xpath('.//div[@class="f13"]/a')).replace('百度快照', '')
            if len(showurl.strip()) == 0:
                showurl = re.findall(WEB_URL_REGEX, content)[0]
                showurl = showurl.lstrip('.')
                if len(showurl.strip()) == 0:
                    showurl = url

            # append result
            results.append({
                'url': url,
                'showurl': showurl,
                'title': title,
                'content': content
            })

        # result layout used when the search query is in English
        except Exception:
            try:
                url = result.xpath('.//h3[@class="t"]/a')[0].attrib.get('href')
                showurl = extract_text(
                    result.xpath('.//div[@class="f13"]/a')).replace(
                        '百度快照', '').replace('翻译此页', '')
                content = extract_text(
                    result.xpath('.//div[@class="c-span18 c-span-last"]')[0])
                # To generate miji url with baidu url
                url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                    url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                if len(showurl.strip()) == 0:
                    showurl = re.findall(WEB_URL_REGEX, content)[0]
                    showurl = showurl.lstrip('.')
                    if len(showurl.strip()) == 0:
                        showurl = url

                # append result
                results.append({
                    'url': url,
                    'showurl': showurl,
                    'title': title,
                    'content': content
                })
            except Exception:
                sentry.captureException()

    # return results
    return results
Example #8
def response(resp):
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    try:
        results.append({
            'number_of_results':
            int(
                dom.xpath('//p[@class="num-tips"]/text()')[0].split(u'\u7ea6')
                [1].split(u'\u6761')[0].replace(',', ''))
        })
    except Exception:
        sentry.captureException()

    # parse results
    try:
        for result in dom.xpath('//div[@class="vrwrap"]'):
            try:
                href = result.xpath('.//a')[0].attrib.get('href')
                url = href if href.startswith("http") else "https://sogou.com" + href
                # parse weixin.sogou html
                if "http://weixin.sogou.com/" == url.strip():
                    url = result.xpath(
                        './/div[@class="str-pd-box str-pd-none"]//a'
                    )[0].attrib.get('href')
                    title = extract_text(
                        result.xpath(
                            './/div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a'
                        )[0])
                    content = extract_text(
                        result.xpath(
                            './/div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]'
                        )[0])
                else:
                    title = extract_text(result.xpath('.//h3/a')[0])
                    content = extract_text(result.xpath('.//div')[0])

                if 'sogou.com/link?url' in url:
                    url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                        url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                    showurl = re.findall(
                        WEB_URL_REGEX,
                        extract_text(result.xpath('.//div[@class="fb"]')))[0]
                    showurl = showurl.lstrip('.')
                else:
                    showurl = url

                # append result
                results.append({
                    'url': url,
                    'showurl': showurl,
                    'title': title,
                    'content': content
                })
            except Exception:
                sentry.captureException()
                continue

    except Exception:
        sentry.captureException()

    try:
        for result in dom.xpath('//div[@class="rb"]'):
            try:
                href = result.xpath('.//a')[0].attrib.get('href')
                url = href if href.startswith("http") else "https://sogou.com" + href
                # to parse sogou weixin html
                if "http://weixin.sogou.com/" == url.strip():
                    url = result.xpath(
                        './/div[@class="str-pd-box str-pd-none"]//a'
                    )[0].attrib.get('href')
                    title = extract_text(
                        result.xpath(
                            './/div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a'
                        )[0])
                    content = extract_text(
                        result.xpath(
                            './/div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]'
                        )[0])
                else:
                    title = extract_text(result.xpath('.//h3/a')[0])
                    content = extract_text(result.xpath('.//div')[0])

                if 'sogou.com/link?url' in url:
                    url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                        url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                    showurl = re.findall(
                        WEB_URL_REGEX,
                        extract_text(result.xpath('.//div[@class="fb"]')))[0]
                    showurl = showurl.lstrip('.')
                else:
                    showurl = url

                results.append({
                    'url': url,
                    'showurl': showurl,
                    'title': title,
                    'content': content
                })
            except Exception:
                sentry.captureException()
                continue

    except Exception:
        sentry.captureException()

    # return results
    return results