Example #1
0
def response(resp):
    """Parse a Sogou video search HTML page into searx video results.

    :param resp: HTTP response whose ``text`` is the Sogou results page.
    :returns: list of result dicts using the ``videos.html`` template.
    """
    from searx.webapp import sentry
    results = []

    dom = fromstring(resp.text)

    for result in dom.xpath(
            '//div[@class="srch-all-result"]//li[@class="sort_lst_li"]'):
        try:
            # Hoist the link element; it is used for both href and title.
            link = result.xpath('./a')[0]
            # Result links are site-relative; prefix the video host.
            url = 'https://v.sogou.com' + link.attrib.get('href')
            title = link.attrib.get('title')
            thumbnail = result.xpath('./a/img')[0].attrib.get('src')

            results.append({
                'url': url,
                'title': title,
                'content': title,  # no separate description is available
                'thumbnail': thumbnail,
                'template': 'videos.html'
            })
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; report malformed entries and keep parsing.
            sentry.captureException()

    return results
Example #2
0
def response(resp):
    """Parse a Bing video search HTML page into searx video results.

    Collection stops once ``number_of_results`` entries are gathered.

    :param resp: HTTP response whose ``text`` is the Bing results page.
    :returns: list of result dicts using the ``videos.html`` template.
    """
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath('//div[@class="dg_u"]'):
        try:
            # Two container class variants appear in the markup.
            url = (result.xpath('./div[@class="mc_vtvc"]/a/@href') or
                   result.xpath('./div[@class="mc_vtvc mc_vtvc_fh"]/a/@href'))[0]
            title = extract_text(result.xpath('./div/a/div/div[@class="mc_vtvc_title"]/@title'))
            content = extract_text(result.xpath('./div/a/div/div/div/div/text()'))
            thumbnail = result.xpath('./div/a/div/div/img/@src')[0]

            results.append({'url': url,
                            'title': title,
                            'content': content,
                            'thumbnail': thumbnail,
                            'template': 'videos.html'})

            if len(results) >= number_of_results:
                break
        except Exception:
            # Narrowed from a bare `except:`; malformed entries are reported
            # and skipped instead of aborting the parse.
            sentry.captureException()

    return results
Example #3
0
def response(resp):
    """Extract video results from an HTML results page.

    Parsing stops once ``imageLength`` results have been collected.

    :param resp: HTTP response whose ``text`` is the results page.
    :returns: list of result dicts using the ``videos.html`` template.
    """
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    for item in dom.xpath('//li[@class="result"]'):
        try:
            anchor = item.xpath('./a')[0]
            link = anchor.attrib.get('href')
            caption = anchor.attrib.get('title')
            thumb = item.xpath(
                './/div[@class="view"]/img[@class="img-blur-layer"]'
            )[0].attrib.get('src')

            results.append({
                'url': link,
                'title': caption,
                'content': caption,  # title doubles as the description
                'thumbnail': thumb,
                'template': 'videos.html'
            })
        except Exception:
            sentry.captureException()

        # Checked outside the try: stop even if the last item was malformed.
        if len(results) >= imageLength:
            break

    return results
Example #4
0
def response(resp):
    """Build image results from a JSON response whose ``list`` key holds items.

    :param resp: HTTP response whose ``text`` is a JSON document.
    :returns: list of result dicts using the ``images.html`` template.
    """
    from searx.webapp import sentry
    results = []

    for entry in loads(resp.text)["list"]:
        try:
            results.append({
                'template': 'images.html',
                'url': entry["link"],
                # Strip the query-highlighting markup from the title.
                'title': entry["title"].replace("<em>", "").replace("</em>", ""),
                'content': '',
                'thumbnail_src': entry["thumb"],
                'img_src': entry["img"],
                'width': entry["width"],
                'height': entry["height"]
            })
        except Exception:
            # Skip entries missing any expected key.
            sentry.captureException()

    return results
Example #5
0
def response(resp):
    """Build image results from a JSON response whose ``items`` key holds them.

    :param resp: HTTP response whose ``text`` is a JSON document.
    :returns: list of result dicts using the ``images.html`` template.
    """
    from searx.webapp import sentry
    # The payload is re-encoded to UTF-8 bytes before JSON decoding.
    payload = loads(resp.text.encode('utf8'))
    results = []

    for item in payload["items"]:
        try:
            results.append({
                'template': 'images.html',
                'url': item["page_url"],
                'title': item["title"],
                'content': '',
                'thumbnail_src': item["thumbUrl"],
                'width': item["width"],
                'height': item["height"],
                'img_src': item["pic_url"]
            })
        except Exception:
            # Skip entries missing any expected key.
            sentry.captureException()

    return results
Example #6
0
def response(resp):
    """Parse a Bing image search HTML page into searx image results.

    Each result's metadata lives in JSON-like attributes (``m`` and ``mad``)
    on the link element; keys are quoted first so ``loads`` accepts them.

    :param resp: HTTP response whose ``text`` is the Bing images page.
    :returns: list of result dicts using the ``images.html`` template.
    """
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(
            '//div[@id="mmComponent_images_1"]/ul/li/div/div[@class="imgpt"]'):
        try:
            link = result.xpath('./a')[0]

            # TODO find actual title
            title = link.xpath('.//img/@alt')[0]

            # The `m` attribute is almost-JSON; quote its keys so it parses.
            json_data = loads(
                _quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('m')))

            url = json_data.get('purl')
            img_src = json_data.get('murl')

            # Thumbnail metadata (size, url) lives in the `mad` attribute.
            thumb_json_data = loads(
                _quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('mad')))
            width = int(thumb_json_data.get('max'))
            height = int(thumb_json_data.get('mah'))
            thumbnail = thumb_json_data.get('turl')

            # append result
            results.append({
                'template': 'images.html',
                'url': url,
                'width': width,
                'height': height,
                'title': title,
                'content': '',
                'thumbnail_src': thumbnail,
                'img_src': img_src
            })

            # TODO stop parsing once enough images are found
        except Exception:
            # Narrowed from a bare `except:`; report and continue with the
            # next result instead of aborting the whole parse.
            sentry.captureException()

    # return results
    return results
Example #7
0
def response(resp):
    """Parse a so.com (360 search) results page into searx results.

    Emits an approximate hit count plus one dict per result
    ({url, showurl, title, content}); redirect links are rewritten to go
    through the configured result proxy with an HMAC token.
    """
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)

    # The hit-count text holds digits between u'\u7ea6' and u'\u4e2a';
    # thousands separators are stripped before int().
    try:
        results.append({'number_of_results': int(dom.xpath('//span[@class="nums"]/text()')[0]
                                                 .split(u'\u7ea6')[1].split(u'\u4e2a')[0].replace(',', ''))})
    except Exception:
        sentry.captureException()

    # parse results
    for result in dom.xpath('//li[@class="res-list"]'):
        try:
            title = extract_text(result.xpath('.//h3')[0])
            url = result.xpath('.//h3/a')[0].attrib.get('href')
            # The snippet can live in several layout containers; the last
            # matching one wins.  NOTE(review): if none match on the very
            # first iteration, `content` is unbound and the outer except
            # silently drops the result — confirm that is acceptable.
            try:
                if result.xpath('.//p[@class="res-desc"]'):
                    content = extract_text(result.xpath('.//p[@class="res-desc"]'))
                if result.xpath('.//div[starts-with(@class,"res-rich")]'):
                    content = extract_text(result.xpath('.//div[starts-with(@class,"res-rich")]'))
                if result.xpath('.//div[@class="cont mh-pc-hover"]'):
                    content = extract_text(result.xpath('.//div[@class="cont mh-pc-hover"]'))
                if result.xpath('.//div[@class="g-card g-shadow"]'):
                    content = extract_text(result.xpath('.//div[@class="g-card g-shadow"]'))
                if result.xpath('.//p[@class="mh-more"]'):
                    content = extract_text(result.xpath('.//p[@class="mh-more"]'))
            except Exception:
                content = ''
                sentry.captureException()

            # append result
            # Redirect links are proxied through the configured result proxy,
            # signed with an HMAC token so the proxy can verify the URL.
            if 'www.so.com/link?' in url:
                url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + parse.quote(
                    url) + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                try:
                    # Prefer the human-readable cite text as the display URL.
                    showurl = extract_text(result.xpath(".//p[@class='res-linkinfo']/cite"))
                    if len(showurl) == 0:
                        showurl = url
                except Exception:
                    showurl = url
                    sentry.captureException()
            else:
                showurl = url
            results.append({'url': url,
                            'showurl': showurl,
                            'title': title,
                            'content': content})
            # Reset so a later snippet-less result cannot inherit this one.
            content = ''
        except Exception:
            sentry.captureException()

    # return results
    return results
Example #8
0
def response(resp):
    """Build a Wikipedia infobox result from a MediaWiki API JSON response.

    The first page entry with a positive id is taken as the requested
    article; negative ids are "missing page" placeholders.

    :param resp: HTTP response whose ``text`` is the API JSON payload.
    :returns: list with a plain link result and an infobox result, or [].
    """
    from searx.webapp import sentry
    results = []

    search_result = loads(resp.text)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    pages = search_result['query']['pages']
    article_id = None
    page = None
    for article_id in pages:
        page = pages[article_id]
        if int(article_id) > 0:
            break

    # Nothing usable: no pages at all, or only missing-page entries.
    # (Guard on None fixes a NameError when `pages` is empty.)
    if article_id is None or int(article_id) < 0:
        return []

    try:
        title = page.get('title')

        image = page.get('thumbnail')
        if image:
            # Only the thumbnail's source URL is needed.
            image = image.get('source')

        extract = page.get('extract')

        summary = extract_first_paragraph(extract, title, image)

        # link to wikipedia article
        wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
            + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

        results.append({'url': wikipedia_link, 'title': title})

        results.append({'infobox': title,
                        'id': wikipedia_link,
                        'content': summary,
                        'img_src': image,
                        'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
    except Exception:
        # Narrowed from a bare `except:`; report and fall through with
        # whatever partial results were already appended.
        sentry.captureException()

    return results
Example #9
0
def response(resp):
    """Build image results from a Baidu-style JSON response.

    Some payloads contain backslashes that are not valid JSON escapes; on
    a parse failure the payload is re-parsed with those escapes stripped.

    :param resp: HTTP response carrying the JSON payload.
    :returns: list of result dicts using the ``images.html`` template.
    """
    from searx.webapp import sentry
    try:
        resultdic = loads(resp.content)
    except Exception:
        # Drop backslashes that do not begin a valid JSON escape
        # sequence, then retry the parse.
        resultdic = loads(
            re.sub(r'(?<!\\)\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})', r'',
                   resp.text).encode(encoding="utf-8"))
    results = []

    for image in resultdic["data"]:
        try:
            # append result
            results.append({
                'template': 'images.html',
                'url': image["replaceUrl"][0]["FromURL"],
                # Strip the query-highlighting markup from the title.
                'title': image["fromPageTitle"].replace("<strong>", "").replace(
                    "</strong>", ""),
                'content': '',
                'thumbnail_src': image["thumbURL"],
                'width': image["width"],
                'height': image["height"],
                # Only the thumbnail URL is available; reuse it as img_src.
                'img_src': image["thumbURL"]
            })
        except Exception:
            # The `as e` binding was unused; just report and skip the entry.
            sentry.captureException()

    # return results
    return results
Example #10
0
def _parse_bing_serp(dom, sentry, results, container_xpath, link_xpath):
    """Append {url, title, content} dicts for every container_xpath match."""
    for result in dom.xpath(container_xpath):
        try:
            link = result.xpath(link_xpath)[0]
            results.append({
                'url': link.attrib.get('href'),
                'title': extract_text(link),
                'content': extract_text(result.xpath('.//p'))
            })
        except Exception:
            # Narrowed from a bare `except:`; skip malformed entries.
            sentry.captureException()


def response(resp):
    """Parse a Bing web-search HTML page into searx results.

    Emits an approximate hit count plus {url, title, content} dicts.
    Two page layouts are seen in the wild, so both result containers
    are parsed.

    :param resp: HTTP response whose ``text`` is the Bing results page.
    :returns: list of result dicts.
    """
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    # The count text's first whitespace-separated token holds the digits.
    try:
        results.append({
            'number_of_results':
            int(
                dom.xpath('//span[@class="sb_count"]/text()')[0].split()
                [0].replace(',', ''))
        })
    except Exception:
        sentry.captureException()

    # parse results (older layout)
    _parse_bing_serp(dom, sentry, results, '//div[@class="sa_cc"]', './/h3/a')

    # parse results again if nothing is found yet (newer layout)
    _parse_bing_serp(dom, sentry, results, '//li[@class="b_algo"]', './/h2/a')

    # return results
    return results
Example #11
0
def response(resp):
    """Parse a Baidu web-search results page into searx results.

    Emits an approximate hit count plus {url, showurl, title, content}
    dicts; result links are rewritten through the configured result proxy
    with an HMAC token.
    """
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    # The hit-count text holds digits between u'\u7ea6' and u'\u4e2a';
    # thousands separators are stripped before int().
    try:
        results.append({
            'number_of_results':
            int(
                dom.xpath('//span[@class="nums_text"]/text()')[0].split(
                    u'\u7ea6')[1].split(u'\u4e2a')[0].replace(',', ''))
        })
    except Exception:
        sentry.captureException()

    # parse results
    # NOTE(review): the trailing space in the class value is significant
    # for this exact-match XPath — confirm the markup really emits it.
    for result in dom.xpath('//div[@class="result c-container "]'):
        # NOTE(review): computed outside any try — a result without an
        # .//h3/a element raises IndexError and aborts the whole parse.
        title = extract_text(result.xpath('.//h3/a')[0])

        # when search query is Chinese words
        try:
            url = result.xpath('.//div[@class="f13"]/a')[0].attrib.get('href')
            # To generate miji url with baidu url
            url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
            content = extract_text(
                (result.xpath('.//div[@class="c-abstract"]') or
                 result.xpath('.//div[@class="c-abstract c-abstract-en"]'))[0])
            # Display URL: the link text with the cached-page label removed.
            showurl = extract_text(
                result.xpath('.//div[@class="f13"]/a')).replace('百度快照', '')
            if len(showurl.strip()) == 0:
                # Fall back to the first URL-looking string in the snippet,
                # then to the proxied URL itself.
                showurl = re.findall(WEB_URL_REGEX, content)[0]
                showurl = showurl.lstrip('.')
                if len(showurl.strip()) == 0:
                    showurl = url

            # append result
            results.append({
                'url': url,
                'showurl': showurl,
                'title': title,
                'content': content
            })

        # when search query is English words
        # (any failure above falls through to the alternate layout below)
        except Exception:
            try:
                url = result.xpath('.//h3[@class="t"]/a')[0].attrib.get('href')
                showurl = extract_text(
                    result.xpath('.//div[@class="f13"]/a')).replace(
                        '百度快照', '').replace('翻译此页', '')
                content = extract_text(
                    result.xpath('.//div[@class="c-span18 c-span-last"]')[0])
                # To generate miji url with baidu url
                url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                    url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                if len(showurl.strip()) == 0:
                    showurl = re.findall(WEB_URL_REGEX, content)[0]
                    showurl = showurl.lstrip('.')
                    if len(showurl.strip()) == 0:
                        showurl = url

                # append result
                results.append({
                    'url': url,
                    'showurl': showurl,
                    'title': title,
                    'content': content
                })
            except Exception:
                sentry.captureException()

    # return results
    return results
Example #12
0
def _parse_sogou_result(result):
    """Extract one {url, showurl, title, content} dict from a result node.

    Raises on any missing element; the caller treats that as a skippable
    entry.
    """
    href = result.xpath('.//a')[0].attrib.get('href')
    # Relative links need the Sogou host prefixed.
    url = href if href.startswith("http") else "https://sogou.com" + href

    # parse weixin.sogou html
    if "http://weixin.sogou.com/" == url.strip():
        # WeChat entries embed the real link/title/content one level deeper.
        url = result.xpath(
            './/div[@class="str-pd-box str-pd-none"]//a'
        )[0].attrib.get('href')
        title = extract_text(
            result.xpath(
                './/div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a'
            )[0])
        content = extract_text(
            result.xpath(
                './/div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]'
            )[0])
    else:
        title = extract_text(result.xpath('.//h3/a')[0])
        content = extract_text(result.xpath('.//div')[0])

    if 'sogou.com/link?url' in url:
        # Redirect links are proxied through the configured result proxy,
        # signed with an HMAC token so the proxy can verify the URL.
        url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
            url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
        showurl = re.findall(
            WEB_URL_REGEX,
            extract_text(result.xpath('.//div[@class="fb"]')))[0]
        showurl = showurl.lstrip('.')
    else:
        showurl = url

    return {
        'url': url,
        'showurl': showurl,
        'title': title,
        'content': content
    }


def response(resp):
    """Parse a Sogou web-search results page into searx results.

    Emits an approximate hit count plus {url, showurl, title, content}
    dicts.  The page uses two result container layouts ("vrwrap" and
    "rb") with identical inner structure, so both are parsed with the
    same helper — previously two duplicated ~45-line loops.

    :param resp: HTTP response whose ``text`` is the Sogou results page.
    :returns: list of result dicts.
    """
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    # The hit-count text holds digits between u'\u7ea6' and u'\u6761';
    # thousands separators are stripped before int().
    try:
        results.append({
            'number_of_results':
            int(
                dom.xpath('//p[@class="num-tips"]/text()')[0].split(u'\u7ea6')
                [1].split(u'\u6761')[0].replace(',', ''))
        })
    except Exception:
        sentry.captureException()

    # parse results: all "vrwrap" containers first, then all "rb" ones,
    # preserving the original append order.
    for container_class in ('vrwrap', 'rb'):
        try:
            for result in dom.xpath('//div[@class="%s"]' % container_class):
                try:
                    results.append(_parse_sogou_result(result))
                except Exception:
                    # Skip malformed entries; `as e` bindings were unused.
                    sentry.captureException()
        except Exception:
            sentry.captureException()

    # return results
    return results