def scrap_out_thumbs(dom):
    """Scrap out thumbnail data from <script> tags."""
    ret_val = {}
    thumb_name = 'vidthumb'

    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
        _script = script.text

        # var s='data:image/jpeg;base64, ...'
        _imgdata = _re("s='([^']*)").findall(_script)
        if not _imgdata:
            continue

        # var ii=['vidthumb4','vidthumb7']
        for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
            # At least the equal sign in the URL needs to be decoded
            ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")

    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
        _script = script.text
        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall(_script):
            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
            if match:
                # At least the equal sign in the URL needs to be decoded
                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")

    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
    return ret_val
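A minimal sketch of what the two regular expressions above match, run against a synthetic script body. Plain re.compile stands in for the engine's _re helper, and the script content is a hypothetical value, not captured Google output:

import re

# synthetic inline-script content of the kind scrap_out_thumbs() parses (hypothetical values)
script = "var s='data:image/jpeg;base64,AAAA\\x3d\\x3d';var ii=['vidthumb1','vidthumb2'];_setImagesSrc(ii,s);"

imgdata = re.compile("s='([^']*)").findall(script)    # base64 data URL, equal signs still escaped
thumbs = re.compile(r"(vidthumb\d+)").findall(script)  # ['vidthumb1', 'vidthumb2']
print(imgdata[0].replace(r"\x3d", "="))                # data:image/jpeg;base64,AAAA==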
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []
    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom, '//div[@id="searchpage-root"]//div[@data-dot="results"]/div'):
        dot_data = eval_xpath_getindex(result_element, './div/div[@data-dot-data]/@data-dot-data', 0, default=None)
        if dot_data is None:
            title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
            results.append({
                'url': title_element.get('href'),
                'title': extract_text(title_element),
                'content': extract_text(eval_xpath_getindex(title_element, '../../div[2]', 0)),
            })
        elif dot_data == '{"reporter_name":"hint/related/relates"}':
            suggestions_element = eval_xpath_getindex(result_element,
                                                      './div/div[@data-dot="main-box"]', 0, default=None)
            if suggestions_element is not None:
                for suggestion in eval_xpath_list(suggestions_element, './/ul/li'):
                    results.append({'suggestion': extract_text(suggestion)})

    return results
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
        url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None)
        if url is None:
            continue
        url = parse_url(url)

        title = eval_xpath_getindex(result, './/h3/a', 0, default=None)
        if title is None:
            continue
        offset = len(extract_text(title.xpath('span')))
        title = extract_text(title)[offset:]

        content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
        if content:
            content = extract_text(content)

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    return results
def response(resp): """Get response from google's search request""" results = [] detect_google_sorry(resp) # convert the text to dom dom = html.fromstring(resp.text) vidthumb_imgdata = scrap_out_thumbs(dom) # parse results for result in eval_xpath_list(dom, results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) url = eval_xpath_getindex(result, href_xpath, 0) c_node = eval_xpath_getindex(result, content_xpath, 0) # <img id="vidthumb1" ...> img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None) if img_id is None: continue img_src = vidthumb_imgdata.get(img_id, None) if not img_src: logger.error("no vidthumb imgdata for: %s" % img_id) img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0) length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]')) content = extract_text(eval_xpath(c_node, './/div[2]/span')) pub_info = extract_text(eval_xpath(c_node, './/div[2]/div')) results.append({ 'url': url, 'title': title, 'content': content, 'length': length, 'author': pub_info, 'thumbnail': img_src, 'template': 'videos.html', }) # parse suggestion for suggestion in eval_xpath_list(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) return results
def test_eval_xpath_list(self):
    doc = html.fromstring(TestXPathUtils.TEST_DOC)

    # check a non-empty list
    self.assertEqual(utils.eval_xpath_list(doc, '//i/text()'), ['italic'])

    # check min_len parameter
    with self.assertRaises(SearxEngineXPathException) as context:
        utils.eval_xpath_list(doc, '//p', min_len=1)
    self.assertEqual(context.exception.message, 'len(xpath_str) < 1')
    self.assertEqual(context.exception.xpath_str, '//p')
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'):
        href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0))
        title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
        seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]'))
        leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]'))
        filesize_info = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()'))
        filesize, filesize_multiplier = filesize_info.split()
        filesize = get_torrent_size(filesize, filesize_multiplier)

        results.append({
            'url': href,
            'title': title,
            'seed': seed,
            'leech': leech,
            'filesize': filesize,
            'template': 'torrent.html'
        })

    return results
def response(resp):
    '''post-response callback

    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    number_of_results_element = \
        eval_xpath_getindex(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()',
                            0, default=None)
    if number_of_results_element is not None:
        number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
        results.append({'number_of_results': int(number_of_results_string)})

    for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
        url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
        url = urljoin(base_url, url)
        title = eval_xpath(result, 'string(.//h2/a)').strip()
        content = extract_text(eval_xpath(result, './/p'))
        # append result
        results.append({'url': url, 'title': title, 'content': content})

    return results
def response(resp):
    if resp.url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []
    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom, '//div[@data-dot="results"]/div'):
        result_data = eval_xpath_getindex(result_element, './/div[contains(@class, "bec586")]', 0, default=None)
        if result_data is None:
            continue
        title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
        results.append({
            'url': title_element.get('href'),
            'title': extract_text(title_element),
            'content': extract_text(eval_xpath(result_data, './/div[@class="_3eded7"]')),
        })

    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//div[@class="g"]'):

        title = extract_text(eval_xpath(result, './/h3'))
        url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
        content = extract_text(eval_xpath(result, './/span[@class="st"]'))

        # get thumbnails
        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
        ids = result.xpath('.//div[@class="s"]//img/@id')
        thumbnail = ''
        if len(ids) > 0:
            thumbnails_data = \
                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
                           script)
            tmp = []
            if len(thumbnails_data) != 0:
                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
            if len(tmp) != 0:
                thumbnail = tmp[-1]

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

    return results
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):

        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
        if url is None:
            continue
        url = parse_url(url)
        title = extract_text(result.xpath('.//h4/a'))
        content = extract_text(result.xpath('.//p'))
        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)

        item = {'url': url, 'title': title, 'content': content, 'img_src': img_src}

        pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
        ago = AGO_RE.search(pub_date)
        if ago:
            number = int(ago.group(1))
            delta = AGO_TIMEDELTA[ago.group(2)]
            pub_date = datetime.now() - delta * number
        else:
            try:
                pub_date = parser.parse(pub_date)
            except parser.ParserError:
                pub_date = None

        if pub_date is not None:
            item['publishedDate'] = pub_date
        results.append(item)

    for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
        results.append({'suggestion': extract_text(suggestion)})

    return results
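The relative-date branch above relies on the module-level AGO_RE and AGO_TIMEDELTA globals, which are not shown in this section. A self-contained sketch with hypothetical stand-ins for both (the real pattern and table may differ):

import re
from datetime import datetime, timedelta

# hypothetical stand-ins for the engine's AGO_RE / AGO_TIMEDELTA globals
AGO_RE = re.compile(r'([0-9]+)\s*(minute|hour|day|week|month|year)')
AGO_TIMEDELTA = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),
    'year': timedelta(days=365),
}

ago = AGO_RE.search('3 hours ago')
if ago:
    # same arithmetic as in response(): unit timedelta multiplied by the count, subtracted from now
    pub_date = datetime.now() - AGO_TIMEDELTA[ago.group(2)] * int(ago.group(1))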
def _fetch_supported_languages(resp):
    supported_languages = []
    dom = html.fromstring(resp.text)
    offset = len('lang_')
    for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
        supported_languages.append(val[offset:])
    return supported_languages
def _fetch_supported_languages(resp):
    ret_val = {}
    dom = html.fromstring(resp.text)
    radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]')
    for x in radio_buttons:
        name = x.get("data-name")
        code = x.get("value").split('_')[-1]
        ret_val[code] = {"name": name}
    return ret_val
def request(query, params):
    response_index = get(base_url, headers=params['headers'], raise_for_httperror=True)
    dom = html.fromstring(response_index.text)

    url_params = {'q': query}
    for e in eval_xpath_list(dom, '//input[@type="hidden"]'):
        name = e.get('name')
        value = e.get('value')
        url_params[name] = value

    params['url'] = base_url + '?' + urlencode(url_params)
    params['cookies'] = response_index.cookies
    return params
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    for result in eval_xpath_list(dom, results_xpath):
        single_result = {'template': template}

        for single_field in field_definition:
            single_field = {**default_field_settings, **single_field}
            try:
                if single_field['single_element']:
                    node = eval_xpath(result, single_field['xpath'])
                else:
                    node = eval_xpath_list(result, single_field['xpath'])

                if 'extract' in single_field and single_field['extract'] == 'url':
                    value = extract_url(node, search_url)
                elif 'extract' in single_field and single_field['extract'] == 'boolean':
                    value = (isinstance(node, list) and len(node) > 0)
                elif 'extract' in single_field and single_field['extract'] == 'boolean_negate':
                    value = (isinstance(node, list) and len(node) < 1)
                else:
                    value = extract_text(node)

                single_result[single_field['field_name']] = value
            except Exception as e:
                logger.warning('error in resolving field %s:\n%s', single_field['field_name'], e)
                single_result[single_field['field_name']] = unresolvable_value

        results.append(single_result)
    return results
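For reference, a hypothetical field_definition using only the keys the loop above reads ('field_name', 'xpath', optional 'single_element' and 'extract'). The exact shape of default_field_settings is not shown in this section; it is assumed to supply at least a 'single_element' default:

# hypothetical configuration, not taken from any shipped engine
field_definition = [
    {'field_name': 'url', 'xpath': './/a/@href', 'extract': 'url'},
    {'field_name': 'title', 'xpath': './/h3', 'single_element': True},
    {'field_name': 'seed', 'xpath': './/td[contains(@class, "seeds")]'},
]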
def response(resp): """Get response from google's search request""" results = [] detect_google_sorry(resp) # which subdomain ? # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) # parse results for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'): title = extract_text(eval_xpath(result, './h3[1]//a')) if not title: # this is a [ZITATION] block continue url = eval_xpath(result, './h3[1]//a/@href')[0] content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or '' pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]')) if pub_info: content += "[%s]" % pub_info pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) if pub_type: title = title + " " + pub_type results.append({ 'url': url, 'title': title, 'content': content, }) # parse suggestion for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'): results.append({'correction': extract_text(correction)}) return results
def response(resp):
    # get the base URL for the language in which request was made
    base_url = lang_urls["all"]["base"]

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, xpath_results):
        link = eval_xpath_getindex(result, xpath_link, 0)
        href = urljoin(base_url, link.attrib.get("href"))
        title = extract_text(link)

        results.append({"url": href, "title": title})

    return results
def response(resp):
    # get the base URL for the language in which request was made
    language = locale_to_lang_code(resp.search_params['language'])
    base_url = get_lang_urls(language)['base']

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, xpath_results):
        link = eval_xpath_getindex(result, xpath_link, 0)
        href = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)

        results.append({'url': href, 'title': title})

    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    for entry in eval_xpath_list(dom, '//entry'):
        title = eval_xpath_getindex(entry, './/title', 0).text

        url = eval_xpath_getindex(entry, './/id', 0).text

        content_string = '{doi_content}{abstract_content}'

        abstract = eval_xpath_getindex(entry, './/summary', 0).text

        # If a doi is available, add it to the snippet
        doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None)
        doi_content = doi_element.text if doi_element is not None else ''
        content = content_string.format(doi_content=doi_content, abstract_content=abstract)

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text,
                                          '%Y-%m-%dT%H:%M:%SZ')

        res_dict = {
            'url': url,
            'title': title,
            'publishedDate': publishedDate,
            'content': content
        }

        results.append(res_dict)

    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom, "//div[@id='content']//div[@class='listWidget']/div/div[@class='appRow']"):

        link = eval_xpath_getindex(result, './/h5/a', 0)

        url = base_url + link.attrib.get('href') + '#downloads'
        title = extract_text(link)
        img_src = base_url + eval_xpath_getindex(result, './/img/@src', 0)
        res = {'url': url, 'title': title, 'img_src': img_src}

        results.append(res)

    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, xpath_results):
        # defaults
        filesize = 0
        magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce"

        category = extract_text(eval_xpath_getindex(result, xpath_category, 0, default=[]))
        page_a = eval_xpath_getindex(result, xpath_title, 0)
        title = extract_text(page_a)
        href = base_url + page_a.attrib.get('href')

        magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5])

        filesize_info = eval_xpath_getindex(result, xpath_filesize, 0, default=None)
        if filesize_info:
            try:
                filesize = filesize_info[:-2]
                filesize_multiplier = filesize_info[-2:]
                filesize = get_torrent_size(filesize, filesize_multiplier)
            except Exception:
                # keep the default filesize if the value can't be parsed
                pass

        # I didn't add download/seed/leech count since, as I figured out, they are generated randomly every time
        content = 'Category: "{category}".'
        content = content.format(category=category)

        results.append({
            'url': href,
            'title': title,
            'content': content,
            'filesize': filesize,
            'magnetlink': magnet_link,
            'template': 'torrent.html'
        })

    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(
            dom, './/div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'):

        link = eval_xpath_getindex(result, './/h5/a', 0)

        url = base_url + link.attrib.get('href') + '#downloads'
        title = extract_text(link)
        thumbnail_src = base_url \
            + eval_xpath_getindex(result, './/img', 0).attrib.get('src').replace('&w=32&h=32', '&w=64&h=64')

        res = {'url': url, 'title': title, 'thumbnail_src': thumbnail_src}

        # append result
        results.append(res)

    # return results
    return results
def response(resp):
    results = []

    xmldom = etree.fromstring(resp.content)
    xmlsearchresult = eval_xpath_getindex(xmldom, '//searchresult', 0)
    dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div')

    for link in eval_xpath_list(dom, '/div/table/tr/td/div[2]//a'):
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)

        thumbnail_src = urljoin(gallery_url, eval_xpath_getindex(link, './/img', 0).attrib['src'])

        # append result
        results.append({
            'url': url,
            'title': title,
            'img_src': thumbnail_src,
            'content': '',
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    # return results
    return results
def response(resp): """Get response from google's search request""" results = [] detect_google_sorry(resp) # convert the text to dom dom = html.fromstring(resp.text) img_bas64_map = scrap_out_thumbs(dom) img_src_script = eval_xpath_getindex( dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text # parse results # # root element:: # <div id="islmp" ..> # result div per image:: # <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..." # The data-id matches to a item in a json-data structure in:: # <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ... # In this structure the link to the origin PNG, JPG or whatever is given # first link per image-div contains a <img> with the data-iid for bas64 encoded image data:: # <img class="rg_i Q4LuWd" data-iid="0" # second link per image-div is the target link:: # <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper"> # the second link also contains two div tags with the *description* and *publisher*:: # <div class="WGvvNb">The Sacrament of the Last Supper ...</div> # <div class="fxgdke">en.wikipedia.org</div> root = eval_xpath(dom, '//div[@id="islmp"]') if not root: logger.error("did not find root element id='islmp'") return results root = root[0] for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'): img_alt = eval_xpath_getindex(img_node, '@alt', 0) img_base64_id = eval_xpath(img_node, '@data-iid') if img_base64_id: img_base64_id = img_base64_id[0] thumbnail_src = img_bas64_map[img_base64_id] else: thumbnail_src = eval_xpath(img_node, '@src') if not thumbnail_src: thumbnail_src = eval_xpath(img_node, '@data-src') if thumbnail_src: thumbnail_src = thumbnail_src[0] else: thumbnail_src = '' link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0) url = eval_xpath_getindex(link_node, '@href', 0) pub_nodes = eval_xpath(link_node, './div/div') pub_descr = img_alt pub_source = '' if pub_nodes: pub_descr = extract_text(pub_nodes[0]) pub_source = extract_text(pub_nodes[1]) img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0) src_url = scrap_img_by_id(img_src_script, img_src_id) if not src_url: src_url = thumbnail_src results.append({ 'url': url, 'title': img_alt, 'content': pub_descr, 'source': pub_source, 'img_src': src_url, # 'img_format': img_format, 'thumbnail_src': thumbnail_src, 'template': 'images.html' }) return results
def response(resp): """Get response from google's search request""" results = [] detect_google_sorry(resp) # which subdomain ? # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): # The first <a> tag in the <article> contains the link to the # article The href attribute of the <a> is a google internal link, # we can't use. The real link is hidden in the jslog attribute: # # <a ... # jslog="95014; 4:https://www.cnn.com/.../index.html; track:click" # href="./articles/CAIiENu3nGS...?hl=en-US&gl=US&ceid=US%3Aen" # ... /> jslog = eval_xpath_getindex(result, './article/a/@jslog', 0) url = re.findall('http[^;]*', jslog) if url: url = url[0] else: # The real URL is base64 encoded in the json attribute: # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click" jslog = jslog.split(";")[1].split(':')[1].strip() try: padding = (4 -(len(jslog) % 4)) * "=" jslog = b64decode(jslog + padding) except binascii.Error: # URL cant be read, skip this result continue # now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]' url = re.findall('http[^;"]*', str(jslog))[0] # the first <h3> tag in the <article> contains the title of the link title = extract_text(eval_xpath(result, './article/h3[1]')) # the first <div> tag in the <article> contains the content of the link content = extract_text(eval_xpath(result, './article/div[1]')) # the second <div> tag contains origin publisher and the publishing date pub_date = extract_text(eval_xpath(result, './article/div[2]//time')) pub_origin = extract_text(eval_xpath(result, './article/div[2]//a')) pub_info = [] if pub_origin: pub_info.append(pub_origin) if pub_date: # The pub_date is mostly a string like 'yesertday', not a real # timezone date or time. Therefore we can't use publishedDate. pub_info.append(pub_date) pub_info = ', '.join(pub_info) if pub_info: content = pub_info + ': ' + content # The image URL is located in a preceding sibling <img> tag, e.g.: # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100" # These URL are long but not personalized (double checked via tor). img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src')) results.append({ 'url': url, 'title': title, 'content': content, 'img_src': img_src, }) # return results return results
def response(resp): """Get response from google's search request""" detect_google_sorry(resp) results = [] # which subdomain ? # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) # results --> answer answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') if answer: results.append({'answer': ' '.join(answer)}) else: logger.debug("did not found 'answer'") # results --> number_of_results try: _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) _digit = ''.join([n for n in _txt if n.isdigit()]) number_of_results = int(_digit) results.append({'number_of_results': number_of_results}) except Exception as e: # pylint: disable=broad-except logger.debug("did not 'number_of_results'") logger.error(e, exc_info=True) # parse results for result in eval_xpath_list(dom, results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue try: title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) if title_tag is None: # this not one of the common google results *section* logger.debug( 'ingoring <div class="g" ../> section: missing title') continue title = extract_text(title_tag) url = eval_xpath_getindex(result, href_xpath, 0) content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) results.append({'url': url, 'title': title, 'content': content}) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) # from lxml import etree # logger.debug(etree.tostring(result, pretty_print=True)) # import pdb # pdb.set_trace() continue # parse suggestion for suggestion in eval_xpath_list(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) # return results return results
def response(resp):
    '''Scrap *results* from the response (see :ref:`engine results`).
    '''
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):

            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = (
                    cached_url + extract_text(eval_xpath_list(result, cached_xpath, min_len=1)))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)

    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                    (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                    (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if suggestion_xpath:
        for suggestion in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(suggestion)})

    logger.debug("found %s results", len(results))
    return results
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    is_onion = True if 'onions' in categories else False

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):

            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url \
                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                    (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                    (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if not suggestion_xpath:
        return results

    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
def response(resp): """Get response from google's search request""" detect_google_sorry(resp) results = [] # convert the text to dom dom = html.fromstring(resp.text) # results --> answer answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') if answer_list: answer_list = [_.xpath("normalize-space()") for _ in answer_list] results.append({'answer': ' '.join(answer_list)}) else: logger.debug("did not find 'answer'") # results --> number_of_results if not use_mobile_ui: try: _txt = eval_xpath_getindex( dom, '//div[@id="result-stats"]//text()', 0) _digit = ''.join([n for n in _txt if n.isdigit()]) number_of_results = int(_digit) results.append({'number_of_results': number_of_results}) except Exception as e: # pylint: disable=broad-except logger.debug("did not 'number_of_results'") logger.error(e, exc_info=True) # parse results _results_xpath = results_xpath if use_mobile_ui: _results_xpath = results_xpath_mobile_ui for result in eval_xpath_list(dom, _results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue try: title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) if title_tag is None: # this not one of the common google results *section* logger.debug( 'ingoring item from the result_xpath list: missing title') continue title = extract_text(title_tag) url = eval_xpath_getindex(result, href_xpath, 0, None) if url is None: continue content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) if content is None: logger.debug( 'ingoring item from the result_xpath list: missing content of title "%s"', title) continue logger.debug('add link to results: %s', title) results.append({'url': url, 'title': title, 'content': content}) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) continue # parse suggestion for suggestion in eval_xpath_list(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) # return results return results