Example #1
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    try:
        results_num = int(
            eval_xpath(dom,
                       '//div[@class="compPagination"]/span[last()]/text()')
            [0].split()[0].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            url = parse_url(
                extract_url(eval_xpath(result, url_xpath), search_url))
            title = extract_text(eval_xpath(result, title_xpath)[0])
        except:
            continue

        content = extract_text(eval_xpath(result, content_xpath)[0])

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # if no suggestion found, return results
    suggestions = eval_xpath(dom, suggestion_xpath)
    if not suggestions:
        return results

    # parse suggestion
    for suggestion in suggestions:
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
Example #2
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath_getindex(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a JSON data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     This structure contains the link to the original PNG, JPG or whatever.
    # the first link per image-div contains an <img> with the data-iid for base64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # the second link per image-div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):

        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

        img_base64_id = eval_xpath(img_node, '@data-iid')
        if img_base64_id:
            img_base64_id = img_base64_id[0]
            thumbnail_src = img_bas64_map[img_base64_id]
        else:
            thumbnail_src = eval_xpath(img_node, '@src')
            if not thumbnail_src:
                thumbnail_src = eval_xpath(img_node, '@data-src')
            if thumbnail_src:
                thumbnail_src = thumbnail_src[0]
            else:
                thumbnail_src = ''

        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
        url = eval_xpath_getindex(link_node, '@href', 0)

        pub_nodes = eval_xpath(link_node, './div/div')
        pub_descr = img_alt
        pub_source = ''
        if pub_nodes:
            pub_descr = extract_text(pub_nodes[0])
            pub_source = extract_text(pub_nodes[1])

        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
        src_url = scrap_img_by_id(img_src_script, img_src_id)
        if not src_url:
            src_url = thumbnail_src

        results.append({
            'url': url,
            'title': img_alt,
            'content': pub_descr,
            'source': pub_source,
            'img_src': src_url,
            # 'img_format': img_format,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    return results
Example #3
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the
        # article.  The href attribute of the <a> is a google-internal link
        # that we can't use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
        #      ... />

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the json attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have: b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains origin publisher and the publishing date

        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'yesterday', not a real
            # date or time, so we can't use it as publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double-checked via Tor).

        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url':      url,
            'title':    title,
            'content':  content,
            'img_src':  img_src,
        })

    # return results
    return results
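
The base64 fallback in the jslog handling above can be tried in isolation. The snippet below is a standalone sketch, not part of the engine; the payload is fabricated, and it only demonstrates the padding and URL-extraction steps.

import re
from base64 import b64decode, b64encode

# fabricated payload standing in for the part of the jslog attribute that
# follows "95014; 5:" (real payloads come from Google's markup)
payload = b64encode(b'[null,null,"https://www.example.org/news/index.html"]').decode().rstrip('=')

# pad to a multiple of four characters, then decode
padding = (4 - len(payload) % 4) % 4 * '='
decoded = b64decode(payload + padding)

# pull the first http(s) URL out of the decoded byte string
url = re.findall('http[^;"]*', str(decoded))[0]
print(url)  # -> https://www.example.org/news/index.html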
Example #4
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == "sorry.google.com" or resp_url.path == "/sorry/IndexRedirect":
        raise SearxEngineCaptchaException()

    if resp_url.path.startswith("/sorry"):
        raise SearxEngineCaptchaException()

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a JSON data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     This structure contains the link to the original PNG, JPG or whatever.
    # the first link per image-div contains an <img> with the data-iid for base64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # the second link per image-div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, "@alt")[0]

            img_base64_id = eval_xpath(img_node, "@data-iid")
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, "@src")
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, "@data-src")
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ""

            link_node = eval_xpath(img_node, "../../../a[2]")[0]
            url = eval_xpath(link_node, "@href")[0]

            pub_nodes = eval_xpath(link_node, "./div/div")
            pub_descr = img_alt
            pub_source = ""
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            img_src_id = eval_xpath(img_node, "../../../@data-id")[0]
            src_url = scrap_img_by_id(img_src_script, img_src_id)
            if not src_url:
                src_url = thumbnail_src

            results.append({
                "url": url,
                "title": img_alt,
                "content": pub_descr,
                "source": pub_source,
                "img_src": src_url,
                "img_format": {
                    "width": int(eval_xpath(img_node, "@width")[0]),
                    "height": int(eval_xpath(img_node, "@height")[0]),
                },
                "thumbnail_src": thumbnail_src,
                "template": "images.html",
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
Example #5
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath(dom, results_xpath):
        links = eval_xpath(result, link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block startpage search url's
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        title = extract_text(link)

        if eval_xpath(result, content_xpath):
            content = extract_text(eval_xpath(result, content_xpath))
        else:
            content = ''

        published_date = None

        # check if search result starts with something like: "2 Sep 2014 ... "
        if re.match(
                r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ",
                content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]
            # fix content string
            content = content[date_pos:]

            try:
                published_date = parser.parse(date_string, dayfirst=True)
            except ValueError:
                pass

        # check if search result starts with something like: "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]

            # calculate datetime
            published_date = datetime.now() - timedelta(
                days=int(re.match(r'\d+', date_string).group()))

            # fix content string
            content = content[date_pos:]

        if published_date:
            # append result
            results.append({
                'url': url,
                'title': title,
                'content': content,
                'publishedDate': published_date
            })
        else:
            # append result
            results.append({'url': url, 'title': title, 'content': content})

    # return results
    return results
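
The two date prefixes handled above can be checked with made-up snippets; this is only a sketch of the slicing arithmetic, assuming dateutil is available for the absolute-date branch.

import re
from datetime import datetime, timedelta
from dateutil import parser

absolute = "2 Sep 2014 ... Some article snippet."
relative = "5 days ago ... Another article snippet."

# absolute branch: "2 Sep 2014 ... " -> dateutil parses the leading date
date_pos = absolute.find('...') + 4
published = parser.parse(absolute[0:date_pos - 5], dayfirst=True)
content = absolute[date_pos:]            # "Some article snippet."

# relative branch: "5 days ago ... " -> subtract the day count from now
date_pos = relative.find('...') + 4
days = int(re.match(r'\d+', relative[0:date_pos - 5]).group())
published = datetime.now() - timedelta(days=days)
content = relative[date_pos:]            # "Another article snippet."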
Example #6
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []
    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not found 'answer'")

        # results --> number_of_results
        try:
            _txt = eval_xpath_getindex(dom,
                                       '//div[@id="result-stats"]//text()', 0)
            _digit = ''.join([n for n in _txt if n.isdigit()])
            number_of_results = int(_digit)
            results.append({'number_of_results': number_of_results})
        except Exception as e:  # pylint: disable=broad-except
            logger.debug("did not 'number_of_results'")
            logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug(
                    'ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0)
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example #7
def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser):
    results = []
    urls = []
    attributes = []

    title = jsonresponse.get('parse', {}).get('displaytitle', {})
    result = jsonresponse.get('parse', {}).get('text', {})

    if not title or not result:
        return results

    title = fromstring(title, parser=htmlparser)
    for elem in eval_xpath(title, language_fallback_xpath):
        elem.getparent().remove(elem)
    title = extract_text(eval_xpath(title, title_xpath))

    result = fromstring(result, parser=htmlparser)
    for elem in eval_xpath(result, language_fallback_xpath):
        elem.getparent().remove(elem)

    description = extract_text(eval_xpath(result, description_xpath))

    id_cache = get_id_cache(result)

    # URLS

    # official website
    add_url(urls, result, id_cache, 'P856', results=results)

    # wikipedia
    wikipedia_link_count = 0
    wikipedia_link = get_wikilink(result, language + 'wiki')
    if wikipedia_link:
        wikipedia_link_count += 1
        urls.append({
            'title': 'Wikipedia (' + language + ')',
            'url': wikipedia_link
        })

    if language != 'en':
        wikipedia_en_link = get_wikilink(result, 'enwiki')
        if wikipedia_en_link:
            wikipedia_link_count += 1
            urls.append({'title': 'Wikipedia (en)', 'url': wikipedia_en_link})

    # TODO: get_wiki_firstlanguage
    # if wikipedia_link_count == 0:

    # more wikis
    add_url(urls,
            result,
            id_cache,
            default_label='Wikivoyage (' + language + ')',
            link_type=language + 'wikivoyage')
    add_url(urls,
            result,
            id_cache,
            default_label='Wikiquote (' + language + ')',
            link_type=language + 'wikiquote')
    add_url(urls,
            result,
            id_cache,
            default_label='Wikimedia Commons',
            link_type='commonswiki')

    add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo')

    # musicbrainz
    add_url(urls, result, id_cache, 'P434', 'MusicBrainz',
            'http://musicbrainz.org/artist/')
    add_url(urls, result, id_cache, 'P435', 'MusicBrainz',
            'http://musicbrainz.org/work/')
    add_url(urls, result, id_cache, 'P436', 'MusicBrainz',
            'http://musicbrainz.org/release-group/')
    add_url(urls, result, id_cache, 'P966', 'MusicBrainz',
            'http://musicbrainz.org/label/')

    # IMDb
    add_url(urls,
            result,
            id_cache,
            'P345',
            'IMDb',
            'https://www.imdb.com/',
            link_type='imdb')
    # source code repository
    add_url(urls, result, id_cache, 'P1324')
    # blog
    add_url(urls, result, id_cache, 'P1581')
    # social media links
    add_url(urls, result, id_cache, 'P2397', 'YouTube',
            'https://www.youtube.com/channel/')
    add_url(urls, result, id_cache, 'P1651', 'YouTube',
            'https://www.youtube.com/watch?v=')
    add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/')
    add_url(urls, result, id_cache, 'P2013', 'Facebook',
            'https://facebook.com/')
    add_url(urls, result, id_cache, 'P2003', 'Instagram',
            'https://instagram.com/')

    urls.append({
        'title': 'Wikidata',
        'url': 'https://www.wikidata.org/wiki/' + wikidata_id + '?uselang=' + language
    })

    # INFOBOX ATTRIBUTES (ROWS)

    # DATES
    # inception date
    add_attribute(attributes, id_cache, 'P571', date=True)
    # dissolution date
    add_attribute(attributes, id_cache, 'P576', date=True)
    # start date
    add_attribute(attributes, id_cache, 'P580', date=True)
    # end date
    add_attribute(attributes, id_cache, 'P582', date=True)
    # date of birth
    add_attribute(attributes, id_cache, 'P569', date=True)
    # date of death
    add_attribute(attributes, id_cache, 'P570', date=True)
    # date of spacecraft launch
    add_attribute(attributes, id_cache, 'P619', date=True)
    # date of spacecraft landing
    add_attribute(attributes, id_cache, 'P620', date=True)

    # nationality
    add_attribute(attributes, id_cache, 'P27')
    # country of origin
    add_attribute(attributes, id_cache, 'P495')
    # country
    add_attribute(attributes, id_cache, 'P17')
    # headquarters
    add_attribute(attributes, id_cache, 'Q180')

    # PLACES
    # capital
    add_attribute(attributes, id_cache, 'P36', trim=True)
    # head of state
    add_attribute(attributes, id_cache, 'P35', trim=True)
    # head of government
    add_attribute(attributes, id_cache, 'P6', trim=True)
    # type of government
    add_attribute(attributes, id_cache, 'P122')
    # official language
    add_attribute(attributes, id_cache, 'P37')
    # population
    add_attribute(attributes, id_cache, 'P1082', trim=True)
    # area
    add_attribute(attributes, id_cache, 'P2046')
    # currency
    add_attribute(attributes, id_cache, 'P38', trim=True)
    # height (building)
    add_attribute(attributes, id_cache, 'P2048')

    # MEDIA
    # platform (videogames)
    add_attribute(attributes, id_cache, 'P400')
    # author
    add_attribute(attributes, id_cache, 'P50')
    # creator
    add_attribute(attributes, id_cache, 'P170')
    # director
    add_attribute(attributes, id_cache, 'P57')
    # performer
    add_attribute(attributes, id_cache, 'P175')
    # developer
    add_attribute(attributes, id_cache, 'P178')
    # producer
    add_attribute(attributes, id_cache, 'P162')
    # manufacturer
    add_attribute(attributes, id_cache, 'P176')
    # screenwriter
    add_attribute(attributes, id_cache, 'P58')
    # production company
    add_attribute(attributes, id_cache, 'P272')
    # record label
    add_attribute(attributes, id_cache, 'P264')
    # publisher
    add_attribute(attributes, id_cache, 'P123')
    # original network
    add_attribute(attributes, id_cache, 'P449')
    # distributor
    add_attribute(attributes, id_cache, 'P750')
    # composer
    add_attribute(attributes, id_cache, 'P86')
    # publication date
    add_attribute(attributes, id_cache, 'P577', date=True)
    # genre
    add_attribute(attributes, id_cache, 'P136')
    # original language
    add_attribute(attributes, id_cache, 'P364')
    # isbn
    add_attribute(attributes, id_cache, 'Q33057')
    # software license
    add_attribute(attributes, id_cache, 'P275')
    # programming language
    add_attribute(attributes, id_cache, 'P277')
    # version
    add_attribute(attributes, id_cache, 'P348', trim=True)
    # narrative location
    add_attribute(attributes, id_cache, 'P840')

    # LANGUAGES
    # number of speakers
    add_attribute(attributes, id_cache, 'P1098')
    # writing system
    add_attribute(attributes, id_cache, 'P282')
    # regulatory body
    add_attribute(attributes, id_cache, 'P1018')
    # language code
    add_attribute(attributes, id_cache, 'P218')

    # OTHER
    # ceo
    add_attribute(attributes, id_cache, 'P169', trim=True)
    # founder
    add_attribute(attributes, id_cache, 'P112')
    # legal form (company/organization)
    add_attribute(attributes, id_cache, 'P1454')
    # operator
    add_attribute(attributes, id_cache, 'P137')
    # crew members
    add_attribute(attributes, id_cache, 'P1029')
    # taxon
    add_attribute(attributes, id_cache, 'P225')
    # chemical formula
    add_attribute(attributes, id_cache, 'P274')
    # winner (sports/contests)
    add_attribute(attributes, id_cache, 'P1346')
    # number of deaths
    add_attribute(attributes, id_cache, 'P1120')
    # currency code
    add_attribute(attributes, id_cache, 'P498')

    image = add_image(id_cache)

    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
        results.append({
            'url': urls[0]['url'],
            'title': title,
            'content': description
        })
    else:
        results.append({
            'infobox': title,
            'id': wikipedia_link,
            'content': description,
            'img_src': image,
            'attributes': attributes,
            'urls': urls
        })

    return results
Example #8
def response(resp):
    '''Scrape *results* from the response (see :ref:`engine results`).'''
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):

            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1),
                              search_url)
            title = extract_text(
                eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(
                eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(
                    result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(
                        thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = (cached_url + extract_text(
                    eval_xpath_list(result, cached_xpath, min_len=1)))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)

    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                (extract_url(x, search_url)
                 for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                (extract_url(x, search_url)
                 for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if suggestion_xpath:
        for suggestion in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(suggestion)})

    logger.debug("found %s results", len(results))
    return results
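
The selectors this generic XPath engine reads (results_xpath, url_xpath, and so on) are module-level settings. A hypothetical set of values could look roughly like the following; the site and selectors are invented for illustration, and in a real deployment they come from the engine configuration.

# hypothetical values for the module-level settings the function reads;
# a real deployment supplies these through the engine configuration
search_url = 'https://searx.example.org/search?q={query}&pageno={pageno}'
categories = ['general']
results_xpath = '//div[contains(@class, "result")]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/p[@class="content"]'
suggestion_xpath = '//div[@class="suggestion"]//a'
thumbnail_xpath = ''          # no thumbnails on this hypothetical site
cached_xpath = ''             # no cached copies either
cached_url = ''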
Example #9
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a JSON data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     This structure contains the link to the original PNG, JPG or whatever
    #     (we do not extract that link here, but you could still implement it).
    # the first link per image-div contains an <img> with the data-iid for base64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # the second link per image-div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
Example #10
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(
            eval_xpath(
                dom, '//div[@id="resultStats"]//text()')[0].split()[1].replace(
                    ',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            title = extract_text(eval_xpath(result, title_xpath)[0])
            url = parse_url(
                extract_url(eval_xpath(result, url_xpath), google_url),
                google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = eval_xpath(result, map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue

                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s',
                         etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example #11
def extract_text_from_dom(result, xpath):
    r = eval_xpath(result, xpath)
    if len(r) > 0:
        return extract_text(r[0])
    return None
Example #12
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not found 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0]
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})

    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath(result, title_xpath)
            if not title_tag:
                # this is not one of the common google results *sections*
                logger.debug('ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag[0])
            url = eval_xpath(result, href_xpath)[0]
            content = extract_text_from_dom(result, content_xpath)
            results.append({
                'url': url,
                'title': title,
                'content': content
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example #13
def extract_text_from_dom(result, xpath):
    """returns extract_text on the first result selected by the xpath or None"""
    r = eval_xpath(result, xpath)
    if len(r) > 0:
        return extract_text(r[0])
    return None
Example #14
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):
            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1),
                              search_url)
            title = extract_text(
                eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(
                eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(
                    result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(
                        thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url\
                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                (extract_url(x, search_url)
                 for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                (extract_url(x, search_url)
                 for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if not suggestion_xpath:
        return results
    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})
    return results
Example #15
    def test_eval_xpath(self):
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        self.assertEqual(utils.eval_xpath(doc, '//p'), [])
        self.assertEqual(utils.eval_xpath(doc, '//i/text()'), ['italic'])
        self.assertEqual(utils.eval_xpath(doc, 'count(//i)'), 1.0)
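
The test above exercises the small helper every example in this collection relies on. Below is a minimal sketch of what such a helper and its list/getindex companions might look like; the real searx.utils versions add richer error handling and dedicated exception types, so treat the details here as assumptions.

from lxml import etree

_XPATH_CACHE = {}

def eval_xpath(element, xpath_spec):
    """Evaluate an XPath expression against an lxml element, caching the compiled XPath."""
    if xpath_spec not in _XPATH_CACHE:
        _XPATH_CACHE[xpath_spec] = etree.XPath(xpath_spec)
    return _XPATH_CACHE[xpath_spec](element)

def eval_xpath_list(element, xpath_spec, min_len=None):
    """Evaluate the XPath and require a list result of a minimum length."""
    result = eval_xpath(element, xpath_spec)
    if not isinstance(result, list):
        raise ValueError('the XPath expression does not select a node set')
    if min_len is not None and len(result) < min_len:
        raise ValueError('the XPath expression matched fewer nodes than expected')
    return result

_NOTSET = object()

def eval_xpath_getindex(element, xpath_spec, index, default=_NOTSET):
    """Return one item of the selected node set, or a default if one is given."""
    result = eval_xpath_list(element, xpath_spec)
    if -len(result) <= index < len(result):
        return result[index]
    if default is not _NOTSET:
        return default
    raise IndexError('XPath index out of range')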
Example #16
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
        answer_list = [_.xpath("normalize-space()") for _ in answer_list]
        results.append({'answer': ' '.join(answer_list)})
    else:
        logger.debug("did not find 'answer'")

        # results --> number_of_results
        if not use_mobile_ui:
            try:
                _txt = eval_xpath_getindex(
                    dom, '//div[@id="result-stats"]//text()', 0)
                _digit = ''.join([n for n in _txt if n.isdigit()])
                number_of_results = int(_digit)
                results.append({'number_of_results': number_of_results})
            except Exception as e:  # pylint: disable=broad-except
                logger.debug("did not 'number_of_results'")
                logger.error(e, exc_info=True)

    # parse results

    _results_xpath = results_xpath
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result,
                                            title_xpath,
                                            0,
                                            default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug(
                    'ignoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue
            content = extract_text(eval_xpath_getindex(result,
                                                       content_xpath,
                                                       0,
                                                       default=None),
                                   allow_none=True)
            if content is None:
                logger.debug(
                    'ignoring item from the result_xpath list: missing content for title "%s"',
                    title)
                continue

            logger.debug('add link to results: %s', title)

            results.append({'url': url, 'title': title, 'content': content})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results