def response(resp):
    """Parse the HTML of a Google video search response.

    ``resp`` is the HTTP response object of the search request; it is passed
    to ``detect_google_sorry`` and its ``text`` attribute is parsed as HTML.

    Returns a list of dicts.  Each video hit contributes one dict with the
    keys ``url``, ``title``, ``content``, ``length``, ``author``,
    ``thumbnail`` and ``template``.  Search suggestions are appended as
    ``{'suggestion': ...}`` and spelling corrections as
    ``{'correction': ...}``.
    """
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    vidthumb_imgdata = scrap_out_thumbs(dom)

    # parse results
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections* are not video hits
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
        url = eval_xpath_getindex(result, href_xpath, 0)
        c_node = eval_xpath_getindex(result, content_xpath, 0)

        # <img id="vidthumb1" ...>
        img_id = eval_xpath_getindex(
            c_node, './div[1]//a/g-img/img/@id', 0, default=None)
        if img_id is None:
            # entries without a thumbnail image id are skipped
            continue

        img_src = vidthumb_imgdata.get(img_id)
        if not img_src:
            # no scraped image data for this id: fall back to the <img src>
            logger.error("no vidthumb imgdata for: %s", img_id)
            img_src = eval_xpath_getindex(
                c_node, './div[1]//a/g-img/img/@src', 0)

        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'length': length,
            'author': pub_info,
            'thumbnail': img_src,
            'template': 'videos.html',
        })

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    return results
def response(resp):
    """Turn a Google Scholar result page into a list of result dicts.

    ``resp`` is the HTTP response of the search request; it is handed to
    ``detect_google_sorry`` and its ``text`` attribute is parsed as HTML.

    Every ``<div class="gs_ri">`` whose first heading carries link text
    yields one ``{'url', 'title', 'content'}`` dict.  Suggestions and
    spelling corrections follow as ``{'suggestion': ...}`` and
    ``{'correction': ...}`` dicts.
    """
    detect_google_sorry(resp)

    # build the DOM from the raw HTML
    dom = html.fromstring(resp.text)
    results = []

    for entry in eval_xpath_list(dom, '//div[@class="gs_ri"]'):
        title = extract_text(eval_xpath(entry, './h3[1]//a'))
        if not title:
            # heading without link text: a citation-only block, skip it
            continue

        url = eval_xpath(entry, './h3[1]//a/@href')[0]

        snippet = extract_text(eval_xpath(entry, './div[@class="gs_rs"]'))
        content = snippet if snippet else ''

        # publication info is appended to the snippet in square brackets
        pub_info = extract_text(eval_xpath(entry, './div[@class="gs_a"]'))
        if pub_info:
            content = content + "[%s]" % pub_info

        # the publication type, when present, is appended to the title
        pub_type = extract_text(eval_xpath(entry, './/span[@class="gs_ct1"]'))
        if pub_type:
            title = " ".join((title, pub_type))

        results.append({
            'url': url,
            'title': title,
            'content': content,
        })

    # query suggestions
    suggestion_links = eval_xpath(
        dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a')
    results.extend(
        {'suggestion': extract_text(link)} for link in suggestion_links)

    # spelling corrections
    correction_links = eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a')
    results.extend(
        {'correction': extract_text(link)} for link in correction_links)

    return results
def response(resp):
    """Parse the HTML of a Google image search response.

    ``resp`` is the HTTP response of the search request; it is handed to
    ``detect_google_sorry`` and its ``text`` attribute is parsed as HTML.

    Returns a list of dicts with the keys ``url``, ``title``, ``content``,
    ``source``, ``img_src``, ``thumbnail_src`` and ``template``.  The list is
    empty when the root element ``<div id="islmp">`` is not found.
    """
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_base64_map = scrap_out_thumbs(dom)
    # The second <script> containing an ``AF_initDataCallback({key: `` call
    # is handed to scrap_img_by_id() below to look up the origin image URL.
    img_src_script = eval_xpath_getindex(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a json-data structure in::
    #         <script nonce="...">AF_initDataCallback({key: 'ds:1', ...
    #             data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is
    #     given.
    # first link per image-div contains an <img> with the data-iid for the
    # base64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/...">
    # the second link also contains two div tags with the *description* and
    # *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):

        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

        # Prefer the scraped base64 thumbnail.  A data-iid without an entry
        # in the map must not abort the whole page, so fall back to the
        # <img> attributes in that case as well.
        thumbnail_src = ''
        img_base64_id = eval_xpath(img_node, '@data-iid')
        if img_base64_id:
            thumbnail_src = img_base64_map.get(img_base64_id[0], '')
        if not thumbnail_src:
            candidates = (eval_xpath(img_node, '@src')
                          or eval_xpath(img_node, '@data-src'))
            thumbnail_src = candidates[0] if candidates else ''

        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
        url = eval_xpath_getindex(link_node, '@href', 0)

        # description and publisher default to the alt text / empty string
        pub_nodes = eval_xpath(link_node, './div/div')
        pub_descr = img_alt
        pub_source = ''
        if pub_nodes:
            pub_descr = extract_text(pub_nodes[0])
            # the publisher <div> may be missing
            if len(pub_nodes) > 1:
                pub_source = extract_text(pub_nodes[1])

        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
        src_url = scrap_img_by_id(img_src_script, img_src_id)
        if not src_url:
            # no origin URL found: show the thumbnail instead
            src_url = thumbnail_src

        results.append({
            'url': url,
            'title': img_alt,
            'content': pub_descr,
            'source': pub_source,
            'img_src': src_url,
            # 'img_format': img_format,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    return results
def _url_from_jslog(jslog):
    """Return the article URL embedded in a ``jslog`` attribute, or ``None``.

    Two layouts are handled::

        jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"

    In the second layout the URL sits inside a base64 encoded list such as
    ``[null, ... null,"https://www.cnn.com/.../index.html"]``.
    """
    plain = re.findall('http[^;]*', jslog)
    if plain:
        return plain[0]

    try:
        payload = jslog.split(';')[1].split(':')[1].strip()
        # pad to a multiple of four characters (no padding if already aligned)
        decoded = b64decode(payload + '=' * (-len(payload) % 4))
    except (IndexError, ValueError):
        # IndexError: the attribute has no ``;`` / ``:`` separated payload.
        # ValueError: not valid base64 (binascii.Error is a subclass).
        return None

    # Decode the bytes instead of searching their repr, which would double
    # backslashes and escape non-ASCII characters.
    found = re.findall('http[^;"]*', decoded.decode('utf-8', errors='replace'))
    return found[0] if found else None


def response(resp):
    """Parse the HTML of a Google news search response.

    ``resp`` is the HTTP response of the search request; it is handed to
    ``detect_google_sorry`` and its ``text`` attribute is parsed as HTML.

    Returns a list of dicts with the keys ``url``, ``title``, ``content`` and
    ``img_src``, one per ``<div class="xrnccd">`` whose article URL can be
    recovered.  Results whose URL cannot be read are skipped.
    """
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the
        # article.  The href attribute of the <a> is a google internal link
        # we can't use.  The real link is hidden in the jslog attribute.
        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = _url_from_jslog(jslog)
        if url is None:
            # URL can't be read, skip this result
            continue

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains origin publisher and the publishing date
        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        # The pub_date is mostly a string like 'yesterday', not a real
        # timezone date or time.  Therefore we can't use publishedDate and
        # prepend it, together with the publisher, to the content instead.
        pub_info = ', '.join(part for part in (pub_origin, pub_date) if part)
        if pub_info:
            content = pub_info + ': ' + content

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URL are long but not personalized (double checked via tor).
        img_src = extract_text(
            result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        })

    return results