def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    try:
        results_num = int(
            eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0]
            .split()[0].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            url = parse_url(
                extract_url(eval_xpath(result, url_xpath), search_url))
            title = extract_text(eval_xpath(result, title_xpath)[0])
        except:
            continue

        content = extract_text(eval_xpath(result, content_xpath)[0])

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # if no suggestion found, return results
    suggestions = eval_xpath(dom, suggestion_xpath)
    if not suggestions:
        return results

    # parse suggestion
    for suggestion in suggestions:
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath_getindex(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #       The data-id matches an item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #       In this structure the link to the origin PNG, JPG or whatever is given
    # first link per image-div contains a <img> with the data-iid for base64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):

        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

        img_base64_id = eval_xpath(img_node, '@data-iid')
        if img_base64_id:
            img_base64_id = img_base64_id[0]
            thumbnail_src = img_bas64_map[img_base64_id]
        else:
            thumbnail_src = eval_xpath(img_node, '@src')
            if not thumbnail_src:
                thumbnail_src = eval_xpath(img_node, '@data-src')
            if thumbnail_src:
                thumbnail_src = thumbnail_src[0]
            else:
                thumbnail_src = ''

        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
        url = eval_xpath_getindex(link_node, '@href', 0)

        pub_nodes = eval_xpath(link_node, './div/div')
        pub_descr = img_alt
        pub_source = ''
        if pub_nodes:
            pub_descr = extract_text(pub_nodes[0])
            pub_source = extract_text(pub_nodes[1])

        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
        src_url = scrap_img_by_id(img_src_script, img_src_id)
        if not src_url:
            src_url = thumbnail_src

        results.append({
            'url': url,
            'title': img_alt,
            'content': pub_descr,
            'source': pub_source,
            'img_src': src_url,
            # 'img_format': img_format,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    return results
def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the
        # article.  The href attribute of the <a> is a google internal link,
        # which we can't use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&gl=US&ceid=US%3Aen"
        #      ... />

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the json attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have: b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains origin publisher and the publishing date
        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'yesterday', not a real
            # timezone date or time.  Therefore we can't use publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double checked via tor).
        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        })

    # return results
    return results
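The fallback branch above recovers the target URL from a base64 payload inside the jslog attribute, padding it to a multiple of four characters before decoding. Below is a self-contained sketch of that decoding step; the decode_jslog_url helper name and the round-trip sample are illustrative assumptions, not part of the engine.

import re
import binascii
from base64 import b64decode, b64encode


def decode_jslog_url(jslog_value):
    """Illustrative helper: recover the article URL from a jslog attribute
    of the assumed form "95014; 5:<base64>; track:click"."""
    payload = jslog_value.split(";")[1].split(':')[1].strip()
    # pad to a multiple of four characters (no-op if already aligned)
    padding = "=" * (-len(payload) % 4)
    try:
        decoded = b64decode(payload + padding)
    except binascii.Error:
        return None
    # decoded bytes look like b'[null, ..., "https://.../index.html"]'
    urls = re.findall('http[^;"]*', str(decoded))
    return urls[0] if urls else None


# round-trip demo with a made-up payload
sample = b64encode(b'[null,null,"https://example.com/news/story"]').decode().rstrip("=")
print(decode_jslog_url('95014; 5:' + sample + '; track:click'))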
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == "sorry.google.com" or resp_url.path == "/sorry/IndexRedirect":
        raise SearxEngineCaptchaException()

    if resp_url.path.startswith("/sorry"):
        raise SearxEngineCaptchaException()

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #       The data-id matches an item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #       In this structure the link to the origin PNG, JPG or whatever is given
    # first link per image-div contains a <img> with the data-iid for base64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, "@alt")[0]

            img_base64_id = eval_xpath(img_node, "@data-iid")
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, "@src")
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, "@data-src")
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ""

            link_node = eval_xpath(img_node, "../../../a[2]")[0]
            url = eval_xpath(link_node, "@href")[0]

            pub_nodes = eval_xpath(link_node, "./div/div")
            pub_descr = img_alt
            pub_source = ""
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            img_src_id = eval_xpath(img_node, "../../../@data-id")[0]
            src_url = scrap_img_by_id(img_src_script, img_src_id)
            if not src_url:
                src_url = thumbnail_src

            results.append({
                "url": url,
                "title": img_alt,
                "content": pub_descr,
                "source": pub_source,
                "img_src": src_url,
                "img_format": {
                    "width": int(eval_xpath(img_node, "@width")[0]),
                    "height": int(eval_xpath(img_node, "@height")[0]),
                },
                "thumbnail_src": thumbnail_src,
                "template": "images.html",
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath(dom, results_xpath):
        links = eval_xpath(result, link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block startpage search url's
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        title = extract_text(link)

        if eval_xpath(result, content_xpath):
            content = extract_text(eval_xpath(result, content_xpath))
        else:
            content = ''

        published_date = None

        # check if search result starts with something like: "2 Sep 2014 ... "
        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]
            # fix content string
            content = content[date_pos:]

            try:
                published_date = parser.parse(date_string, dayfirst=True)
            except ValueError:
                pass

        # check if search result starts with something like: "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]

            # calculate datetime
            published_date = datetime.now() - timedelta(
                days=int(re.match(r'\d+', date_string).group()))

            # fix content string
            content = content[date_pos:]

        if published_date:
            # append result
            results.append({
                'url': url,
                'title': title,
                'content': content,
                'publishedDate': published_date
            })
        else:
            # append result
            results.append({'url': url, 'title': title, 'content': content})

    # return results
    return results
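The two date prefixes handled above ("2 Sep 2014 ... " and "5 days ago ... ") can be exercised on their own. Below is a minimal, self-contained sketch of that stripping logic; the split_date_prefix helper name and the sample strings are illustrative and not part of the engine.

import re
from datetime import datetime, timedelta
from dateutil import parser


def split_date_prefix(content):
    """Illustrative helper: return (published_date, content) for the two
    prefix forms handled by the engine above, or (None, content)."""
    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
        date_pos = content.find('...') + 4
        date_string = content[0:date_pos - 5]
        try:
            return parser.parse(date_string, dayfirst=True), content[date_pos:]
        except ValueError:
            return None, content[date_pos:]
    if re.match(r"^[0-9]+ days? ago \.\.\. ", content):
        date_pos = content.find('...') + 4
        date_string = content[0:date_pos - 5]
        days = int(re.match(r'\d+', date_string).group())
        return datetime.now() - timedelta(days=days), content[date_pos:]
    return None, content


# sample inputs (made up for illustration)
print(split_date_prefix("2 Sep 2014 ... article about something"))
print(split_date_prefix("5 days ago ... another article"))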
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not find 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})
    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not find 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug('ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0)
            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None),
                                   allow_none=True)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser):
    results = []
    urls = []
    attributes = []

    title = jsonresponse.get('parse', {}).get('displaytitle', {})
    result = jsonresponse.get('parse', {}).get('text', {})

    if not title or not result:
        return results

    title = fromstring(title, parser=htmlparser)
    for elem in eval_xpath(title, language_fallback_xpath):
        elem.getparent().remove(elem)
    title = extract_text(eval_xpath(title, title_xpath))

    result = fromstring(result, parser=htmlparser)
    for elem in eval_xpath(result, language_fallback_xpath):
        elem.getparent().remove(elem)
    description = extract_text(eval_xpath(result, description_xpath))

    id_cache = get_id_cache(result)

    # URLS

    # official website
    add_url(urls, result, id_cache, 'P856', results=results)

    # wikipedia
    wikipedia_link_count = 0
    wikipedia_link = get_wikilink(result, language + 'wiki')
    if wikipedia_link:
        wikipedia_link_count += 1
        urls.append({'title': 'Wikipedia (' + language + ')',
                     'url': wikipedia_link})

    if language != 'en':
        wikipedia_en_link = get_wikilink(result, 'enwiki')
        if wikipedia_en_link:
            wikipedia_link_count += 1
            urls.append({'title': 'Wikipedia (en)',
                         'url': wikipedia_en_link})

    # TODO: get_wiki_firstlanguage
    # if wikipedia_link_count == 0:

    # more wikis
    add_url(urls, result, id_cache,
            default_label='Wikivoyage (' + language + ')',
            link_type=language + 'wikivoyage')
    add_url(urls, result, id_cache,
            default_label='Wikiquote (' + language + ')',
            link_type=language + 'wikiquote')
    add_url(urls, result, id_cache,
            default_label='Wikimedia Commons',
            link_type='commonswiki')

    add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo')

    # musicbrainz
    add_url(urls, result, id_cache, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
    add_url(urls, result, id_cache, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
    add_url(urls, result, id_cache, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
    add_url(urls, result, id_cache, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')

    # IMDb
    add_url(urls, result, id_cache, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
    # source code repository
    add_url(urls, result, id_cache, 'P1324')
    # blog
    add_url(urls, result, id_cache, 'P1581')

    # social media links
    add_url(urls, result, id_cache, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
    add_url(urls, result, id_cache, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
    add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/')
    add_url(urls, result, id_cache, 'P2013', 'Facebook', 'https://facebook.com/')
    add_url(urls, result, id_cache, 'P2003', 'Instagram', 'https://instagram.com/')

    urls.append({'title': 'Wikidata',
                 'url': 'https://www.wikidata.org/wiki/'
                 + wikidata_id + '?uselang=' + language})

    # INFOBOX ATTRIBUTES (ROWS)

    # DATES
    # inception date
    add_attribute(attributes, id_cache, 'P571', date=True)
    # dissolution date
    add_attribute(attributes, id_cache, 'P576', date=True)
    # start date
    add_attribute(attributes, id_cache, 'P580', date=True)
    # end date
    add_attribute(attributes, id_cache, 'P582', date=True)
    # date of birth
    add_attribute(attributes, id_cache, 'P569', date=True)
    # date of death
    add_attribute(attributes, id_cache, 'P570', date=True)
    # date of spacecraft launch
    add_attribute(attributes, id_cache, 'P619', date=True)
    # date of spacecraft landing
    add_attribute(attributes, id_cache, 'P620', date=True)

    # nationality
    add_attribute(attributes, id_cache, 'P27')
    # country of origin
    add_attribute(attributes, id_cache, 'P495')
    # country
    add_attribute(attributes, id_cache, 'P17')
    # headquarters
    add_attribute(attributes, id_cache, 'Q180')

    # PLACES
    # capital
    add_attribute(attributes, id_cache, 'P36', trim=True)
    # head of state
    add_attribute(attributes, id_cache, 'P35', trim=True)
    # head of government
    add_attribute(attributes, id_cache, 'P6', trim=True)
    # type of government
    add_attribute(attributes, id_cache, 'P122')
    # official language
    add_attribute(attributes, id_cache, 'P37')
    # population
    add_attribute(attributes, id_cache, 'P1082', trim=True)
    # area
    add_attribute(attributes, id_cache, 'P2046')
    # currency
    add_attribute(attributes, id_cache, 'P38', trim=True)
    # height (building)
    add_attribute(attributes, id_cache, 'P2048')

    # MEDIA
    # platform (videogames)
    add_attribute(attributes, id_cache, 'P400')
    # author
    add_attribute(attributes, id_cache, 'P50')
    # creator
    add_attribute(attributes, id_cache, 'P170')
    # director
    add_attribute(attributes, id_cache, 'P57')
    # performer
    add_attribute(attributes, id_cache, 'P175')
    # developer
    add_attribute(attributes, id_cache, 'P178')
    # producer
    add_attribute(attributes, id_cache, 'P162')
    # manufacturer
    add_attribute(attributes, id_cache, 'P176')
    # screenwriter
    add_attribute(attributes, id_cache, 'P58')
    # production company
    add_attribute(attributes, id_cache, 'P272')
    # record label
    add_attribute(attributes, id_cache, 'P264')
    # publisher
    add_attribute(attributes, id_cache, 'P123')
    # original network
    add_attribute(attributes, id_cache, 'P449')
    # distributor
    add_attribute(attributes, id_cache, 'P750')
    # composer
    add_attribute(attributes, id_cache, 'P86')
    # publication date
    add_attribute(attributes, id_cache, 'P577', date=True)
    # genre
    add_attribute(attributes, id_cache, 'P136')
    # original language
    add_attribute(attributes, id_cache, 'P364')
    # isbn
    add_attribute(attributes, id_cache, 'Q33057')
    # software license
    add_attribute(attributes, id_cache, 'P275')
    # programming language
    add_attribute(attributes, id_cache, 'P277')
    # version
    add_attribute(attributes, id_cache, 'P348', trim=True)
    # narrative location
    add_attribute(attributes, id_cache, 'P840')

    # LANGUAGES
    # number of speakers
    add_attribute(attributes, id_cache, 'P1098')
    # writing system
    add_attribute(attributes, id_cache, 'P282')
    # regulatory body
    add_attribute(attributes, id_cache, 'P1018')
    # language code
    add_attribute(attributes, id_cache, 'P218')

    # OTHER
    # ceo
    add_attribute(attributes, id_cache, 'P169', trim=True)
    # founder
    add_attribute(attributes, id_cache, 'P112')
    # legal form (company/organization)
    add_attribute(attributes, id_cache, 'P1454')
    # operator
    add_attribute(attributes, id_cache, 'P137')
    # crew members
    add_attribute(attributes, id_cache, 'P1029')
    # taxon
    add_attribute(attributes, id_cache, 'P225')
    # chemical formula
    add_attribute(attributes, id_cache, 'P274')
    # winner (sports/contests)
    add_attribute(attributes, id_cache, 'P1346')
    # number of deaths
    add_attribute(attributes, id_cache, 'P1120')
    # currency code
    add_attribute(attributes, id_cache, 'P498')

    image = add_image(id_cache)

    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
        results.append({'url': urls[0]['url'],
                        'title': title,
                        'content': description})
    else:
        results.append({'infobox': title,
                        'id': wikipedia_link,
                        'content': description,
                        'img_src': image,
                        'attributes': attributes,
                        'urls': urls})

    return results
def response(resp):
    '''Scrape *results* from the response (see :ref:`engine results`).
    '''
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):

            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = (
                    cached_url
                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
                )

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)

    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                    (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))
            ):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                    (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))
            ):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if suggestion_xpath:
        for suggestion in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(suggestion)})

    logger.debug("found %s results", len(results))
    return results
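The generic response() above reads its configuration from module-level names (search_url, categories, results_xpath, url_xpath, title_xpath, content_xpath, and the optional thumbnail_xpath, cached_xpath, cached_url, suggestion_xpath). A minimal sketch of such a configuration follows; every concrete value, including the example.org URL and the XPath expressions, is an illustrative assumption rather than the settings of any real engine.

# illustrative module-level configuration for the generic XPath engine above
search_url = 'https://example.org/search?q={query}'
categories = ['general']

results_xpath = '//div[@class="result"]'
url_xpath = './/a/@href'
title_xpath = './/a'
content_xpath = './/p[@class="snippet"]'

# optional settings; leave empty to disable the corresponding feature
thumbnail_xpath = ''
cached_xpath = ''
cached_url = ''
suggestion_xpath = ''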
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #       The data-id matches an item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #       In this structure the link to the origin PNG, JPG or whatever is given
    #       (we do not blow out the link there, you could still implement that)
    # first link per image-div contains a <img> with the data-iid for base64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})

    try:
        results_num = int(
            eval_xpath(dom, '//div[@id="resultStats"]//text()')[0]
            .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            title = extract_text(eval_xpath(result, title_xpath)[0])
            url = parse_url(
                extract_url(eval_xpath(result, url_xpath), google_url),
                google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = eval_xpath(result, map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass
                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s',
                         etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def extract_text_from_dom(result, xpath):
    r = eval_xpath(result, xpath)
    if len(r) > 0:
        return extract_text(r[0])
    return None
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not find 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0]
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})
    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not find 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath(result, title_xpath)
            if not title_tag:
                # this is not one of the common google results *sections*
                logger.debug('ignoring <div class="g" ../> section: missing title')
                continue
            title = extract_text(title_tag[0])
            url = eval_xpath(result, href_xpath)[0]
            content = extract_text_from_dom(result, content_xpath)
            results.append({
                'url': url,
                'title': title,
                'content': content
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def extract_text_from_dom(result, xpath):
    """returns extract_text on the first result selected by the xpath or None"""
    r = eval_xpath(result, xpath)
    if len(r) > 0:
        return extract_text(r[0])
    return None
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    is_onion = True if 'onions' in categories else False

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):
            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url\
                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                    (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))
            ):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                    (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))
            ):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if not suggestion_xpath:
        return results

    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
def test_eval_xpath(self):
    doc = html.fromstring(TestXPathUtils.TEST_DOC)

    self.assertEqual(utils.eval_xpath(doc, '//p'), [])
    self.assertEqual(utils.eval_xpath(doc, '//i/text()'), ['italic'])
    self.assertEqual(utils.eval_xpath(doc, 'count(//i)'), 1.0)
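The assertions above only pass against a document containing no <p> element and exactly one <i> element with the text 'italic'; they also show that eval_xpath returns a list for node selections and a float for count(). The real markup of TestXPathUtils.TEST_DOC is not reproduced here; a minimal document consistent with these assertions (an assumption, for illustration only) would be:

# assumed minimal markup consistent with the assertions above
TEST_DOC = '<html><body><div><i>italic</i></div></body></html>'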
def response(resp):
    """Get response from google's search request"""

    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
        answer_list = [_.xpath("normalize-space()") for _ in answer_list]
        results.append({'answer': ' '.join(answer_list)})
    else:
        logger.debug("did not find 'answer'")

    # results --> number_of_results
    if not use_mobile_ui:
        try:
            _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
            _digit = ''.join([n for n in _txt if n.isdigit()])
            number_of_results = int(_digit)
            results.append({'number_of_results': number_of_results})
        except Exception as e:  # pylint: disable=broad-except
            logger.debug("did not find 'number_of_results'")
            logger.error(e, exc_info=True)

    # parse results
    _results_xpath = results_xpath
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
                # this is not one of the common google results *sections*
                logger.debug('ignoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)

            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue

            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None),
                                   allow_none=True)
            if content is None:
                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                continue

            logger.debug('add link to results: %s', title)
            results.append({'url': url, 'title': title, 'content': content})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results