def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []
    dom = html.fromstring(resp.content.decode())
    for result_element in eval_xpath_list(dom, '//div[@id="searchpage-root"]//div[@data-dot="results"]/div'):
        dot_data = eval_xpath_getindex(result_element, './div/div[@data-dot-data]/@data-dot-data', 0, default=None)
        if dot_data is None:
            title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
            results.append({
                'url': title_element.get('href'),
                'title': extract_text(title_element),
                'content': extract_text(eval_xpath_getindex(title_element, '../../div[2]', 0)),
            })
        elif dot_data == '{"reporter_name":"hint/related/relates"}':
            suggestions_element = eval_xpath_getindex(result_element, './div/div[@data-dot="main-box"]', 0,
                                                      default=None)
            if suggestions_element is not None:
                for suggestion in eval_xpath_list(suggestions_element, './/ul/li'):
                    results.append({'suggestion': extract_text(suggestion)})

    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'):
        href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0))
        title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
        seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]'))
        leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]'))
        filesize_info = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()'))
        filesize, filesize_multiplier = filesize_info.split()
        filesize = get_torrent_size(filesize, filesize_multiplier)

        results.append({
            'url': href,
            'title': title,
            'seed': seed,
            'leech': leech,
            'filesize': filesize,
            'template': 'torrent.html',
        })

    return results
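# The torrent parsers above and below call get_torrent_size(filesize, filesize_multiplier) to turn
# a size string such as "700 MB" into a byte count. The helper below is a minimal illustrative
# sketch of that idea, assuming a simple multiplier table; it is not the helper actually shipped
# with the project, whose unit handling may differ.
def _torrent_size_sketch(filesize, filesize_multiplier):
    # hypothetical multiplier table: decimal prefixes map to powers of 1000,
    # binary prefixes (KiB, MiB, ...) to powers of 1024
    multipliers = {
        'KB': 1000, 'MB': 1000 ** 2, 'GB': 1000 ** 3, 'TB': 1000 ** 4,
        'KIB': 1024, 'MIB': 1024 ** 2, 'GIB': 1024 ** 3, 'TIB': 1024 ** 4,
    }
    try:
        return int(float(filesize) * multipliers.get(filesize_multiplier.upper(), 1))
    except ValueError:
        # mirror the defensive style of the parsers: an unparseable size becomes None
        return None

# example: _torrent_size_sketch('1.5', 'GB') -> 1500000000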
def response(resp):
    dom = html.fromstring(resp.text)
    search_res = dom.xpath('.//td[@class="x-item"]')

    if not search_res:
        return list()

    results = list()
    for result in search_res:
        url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
        title = extract_text(result.xpath('.//a[@title]'))
        content = extract_text(result.xpath('.//div[@class="files"]'))
        files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
        filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER])
        magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'filesize': filesize,
            'magnetlink': magnetlink,
            'seed': 'N/A',
            'leech': 'N/A',
            'template': 'torrent.html',
        })

    return results
def response(resp): results = [] dom = html.fromstring(resp.text) for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'): url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None) if url is None: continue url = parse_url(url) title = eval_xpath_getindex(result, './/h3/a', 0, default=None) if title is None: continue offset = len(extract_text(title.xpath('span'))) title = extract_text(title)[offset:] content = eval_xpath_getindex( result, './/div[contains(@class, "compText")]', 0, default='' ) if content: content = extract_text(content) # append result results.append({'url': url, 'title': title, 'content': content}) for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]'): # append suggestion results.append({'suggestion': extract_text(suggestion)}) return results
def response(resp):
    if resp.url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []
    dom = html.fromstring(resp.content.decode())

    for result_element in eval_xpath_list(dom, '//div[@data-dot="results"]/div'):
        result_data = eval_xpath_getindex(result_element, './/div[contains(@class, "bec586")]', 0, default=None)
        if result_data is None:
            continue
        title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
        results.append({
            'url': title_element.get('href'),
            'title': extract_text(title_element),
            'content': extract_text(eval_xpath(result_data, './/div[@class="_3eded7"]')),
        })

    return results
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    results_list = eval_xpath(dom, '//section[contains(@class, "search-results")]')

    for result in results_list:
        titles = eval_xpath(result, '//article//header//h2')
        contents = eval_xpath(result, '//article//p')
        urls = eval_xpath(result, '//header/a/@href')
        published_dates = eval_xpath(result, '//article/div/div/time/@datetime')

        for (title, content, url, published_date) in zip(titles, contents, urls, published_dates):
            results.append({
                'url': url,
                'publishedDate': datetime.strptime(published_date, '%Y-%m-%dT%H:%M:%SZ'),
                'title': extract_text(title),
                'content': extract_text(content),
            })

    return results
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(base_url, link.attrib.get('href'))
        # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this...
        title = escape(extract_text(link))

        thumbnail_tags = result.xpath(thumbnail_xpath)
        thumbnail = None
        if len(thumbnail_tags) > 0:
            thumbnail = extract_text(thumbnail_tags[0])
            if thumbnail[0] == '/':
                thumbnail = base_url + thumbnail

        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href,
                        'title': title,
                        'img_src': thumbnail,
                        'content': content})

    # return results
    return results
def response(resp):
    '''post-response callback

    resp: requests response object
    '''
    results = []
    tree = html.fromstring(resp.text)
    search_results = tree.xpath('//li[contains(@class, "searchresult")]')

    for result in search_results:
        link = result.xpath('.//div[@class="itemurl"]/a')[0]
        result_id = parse_qs(urlparse(link.get('href')).query)["search_item_id"][0]
        title = result.xpath('.//div[@class="heading"]/a/text()')
        date = dateparse(result.xpath('//div[@class="released"]/text()')[0].replace("released ", ""))
        content = result.xpath('.//div[@class="subhead"]/text()')

        new_result = {
            "url": extract_text(link),
            "title": extract_text(title),
            "content": extract_text(content),
            "publishedDate": date,
        }

        thumbnail = result.xpath('.//div[@class="art"]/img/@src')
        if thumbnail:
            new_result['thumbnail'] = thumbnail[0]

        if "album" in result.classes:
            new_result["embedded"] = embedded_url.format(type='album', result_id=result_id)
        elif "track" in result.classes:
            new_result["embedded"] = embedded_url.format(type='track', result_id=result_id)

        results.append(new_result)

    return results
def response(resp):
    results = []
    doc = fromstring(resp.text)

    # parse results
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title, 'content': content, 'url': res_url})

    # parse correction
    for correction in eval_xpath(doc, correction_xpath):
        # append correction
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.text) # parse results for result in dom.xpath('//div[@class="g"]'): title = extract_text(result.xpath('.//h3')) url = result.xpath('.//div[@class="r"]/a/@href')[0] content = extract_text(result.xpath('.//span[@class="st"]')) # get thumbnails script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text) ids = result.xpath('.//div[@class="s"]//img/@id') if len(ids) > 0: thumbnails_data = \ re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0], script) tmp = [] if len(thumbnails_data) != 0: tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0]) thumbnail = '' if len(tmp) != 0: thumbnail = tmp[-1] # append result results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail, 'template': 'videos.html'}) return results
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    is_onion = True if 'onions' in categories else False

    if results_xpath:
        for result in eval_xpath(dom, results_xpath):
            url = extract_url(eval_xpath(result, url_xpath), search_url)
            title = extract_text(eval_xpath(result, title_xpath))
            content = extract_text(eval_xpath(result, content_xpath))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url + extract_text(result.xpath(cached_xpath))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                    (extract_url(x, search_url) for x in dom.xpath(url_xpath)),
                    map(extract_text, dom.xpath(title_xpath)),
                    map(extract_text, dom.xpath(content_xpath)),
                    map(extract_text, dom.xpath(cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                    (extract_url(x, search_url) for x in dom.xpath(url_xpath)),
                    map(extract_text, dom.xpath(title_xpath)),
                    map(extract_text, dom.xpath(content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if not suggestion_xpath:
        return results

    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
def response(resp):
    results = []

    raise_for_httperror(resp)
    dom = fromstring(resp.text)
    word = extract_text(dom.xpath(word_xpath))

    definitions = []

    for dict_src in dict_xpath:
        for src in dom.xpath(dict_src):
            src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()

            src_defs = []
            for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
                if def_item.xpath('./div[@class="znacz"]'):
                    sub_defs = []
                    for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
                        def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
                        sub_defs.append(def_sub_text)
                    src_defs.append((word, sub_defs))
                else:
                    def_text = extract_text(def_item).strip()
                    def_link = def_item.xpath('./span/a/@href')
                    if 'doroszewski' in def_link[0]:
                        def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
                    src_defs.append((def_text, ''))

            definitions.append((src_text, src_defs))

    if not definitions:
        return results

    infobox = ''
    for src in definitions:
        infobox += f"<div><small>{src[0]}</small>"
        infobox += "<ul>"
        for (def_text, sub_def) in src[1]:
            infobox += f"<li>{def_text}</li>"
            if sub_def:
                infobox += "<ol>"
                for sub_def_text in sub_def:
                    infobox += f"<li>{sub_def_text}</li>"
                infobox += "</ol>"
        infobox += "</ul></div>"

    results.append({
        'infobox': word,
        'content': infobox,
    })

    return results
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # still useful ?
        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ days? ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            except:
                publishedDate = datetime.now()

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    # return results
    return results
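# The chain of re.match() branches above converts relative strings like "21 minutes ago" into an
# absolute datetime. The newer Yahoo News parser further down does the same job with a compiled
# pattern (AGO_RE) and a timedelta lookup table (AGO_TIMEDELTA). The sketch below illustrates that
# table-driven approach with hypothetical names; the real constants may cover more units.
import re
from datetime import datetime, timedelta

AGO_RE_SKETCH = re.compile(r'([0-9]+)\s*(minute|hour|day)s? ago')
AGO_TIMEDELTA_SKETCH = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
}

def _parse_ago_sketch(pub_date_str):
    """Return an absolute datetime for a relative '... ago' string, or None if it does not match."""
    match = AGO_RE_SKETCH.search(pub_date_str)
    if not match:
        return None
    number = int(match.group(1))
    return datetime.now() - AGO_TIMEDELTA_SKETCH[match.group(2)] * number

# example: _parse_ago_sketch('3 hours ago') is roughly datetime.now() - timedelta(hours=3)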
def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None,
            results=None, link_type=None, only_first=True):
    links = []

    # wiki links don't have property in wikidata page
    if link_type and 'wiki' in link_type:
        links.append(get_wikilink(result, link_type))
    else:
        dom_element = id_cache.get(property_id, None)
        if dom_element is not None:
            if not default_label:
                label = extract_text(eval_xpath(dom_element, label_xpath))
                label = label[0].upper() + label[1:]

            if link_type == 'geo':
                links.append(get_geolink(dom_element))
            elif link_type == 'imdb':
                links.append(get_imdblink(dom_element, url_prefix))
            else:
                url_results = eval_xpath(dom_element, url_xpath)
                for link in url_results:
                    if link is not None:
                        if url_prefix:
                            link = url_prefix + extract_text(link)
                        else:
                            link = extract_text(link)
                        links.append(link)

    # append urls
    for url in links:
        if url is not None:
            u = {'title': default_label or label, 'url': url}
            if property_id == 'P856':
                u['official'] = True
                u['domain'] = url.split('/')[2]
            urls.append(u)
            if results is not None:
                results.append(u)
            if only_first:
                break
def response(resp):
    results = []
    doc = fromstring(resp.text)

    # parse results
    # Quickhits
    for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
        try:
            res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))

        # append result
        results.append({'title': title,
                        'content': "",
                        'url': base_url + res_url})

    # Search results
    for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
        try:
            if r.tag == "dt":
                res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
                title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
            elif r.tag == "dd":
                content = extract_text(eval_xpath(r, '.'))

                # append result
                results.append({'title': title,
                                'content': content,
                                'url': base_url + res_url})
        except:
            continue

        if not res_url:
            continue

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.text) for result in eval_xpath(dom, '//table[@class="result"]//td[@class="record"]'): url = eval_xpath(result, './a/@href')[0] title = extract_text(eval_xpath(result, './a//text()')) content = extract_text( eval_xpath(result, './/div[@class="text"]//text()')) results.append({'url': url, 'title': title, 'content': content}) return results
def response(resp): results = [] dom = html.fromstring(resp.text) for res in dom.xpath('//div[@class="List-item MainListing"]'): # processed start and end of link link = res.xpath('//a')[0] url = urljoin(base_url, link.attrib.get('href')) title = extract_text(link) thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src']) # TODO: get image with higher resolution img_src = thumbnail_src # append result results.append({ 'url': url, 'title': title, 'img_src': img_src, 'content': '', 'thumbnail_src': thumbnail_src, 'template': 'images.html' }) # return results return results
def response(resp):
    '''post-response callback

    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        number_of_results_string = \
            re.sub('[^0-9]', '',
                   eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0])
        results.append({'number_of_results': int(number_of_results_string)})
    except:
        logger.debug("Couldn't read number of results.")
        pass

    for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'):
        try:
            url = eval_xpath(result, './/h2/a')[0].get('href')
            url = urljoin(base_url, url)
            title = eval_xpath(result, 'string(.//h2/a)').strip()
            content = extract_text(eval_xpath(result, './/p'))
            # append result
            results.append({'url': url, 'title': title, 'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    return results
def response(resp):
    '''post-response callback

    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    number_of_results_element = \
        eval_xpath_getindex(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()',
                            0, default=None)
    if number_of_results_element is not None:
        number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
        results.append({'number_of_results': int(number_of_results_string)})

    for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
        url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
        url = urljoin(base_url, url)
        title = eval_xpath(result, 'string(.//h2/a)').strip()
        content = extract_text(eval_xpath(result, './/p'))
        # append result
        results.append({'url': url, 'title': title, 'content': content})

    return results
def response(resp): results = [] dom = html.fromstring(resp.text) for result in dom.xpath('//div[@class="results"]/dl'): name_cell = result.xpath('./dt')[0] title = extract_text(name_cell) # skip rows that do not contain a link to a torrent links = name_cell.xpath('./a') if len(links) != 1: continue # extract url and remove a slash in the beginning link = links[0].attrib.get('href').lstrip('/') seed = 0 leech = 0 try: seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', '')) leech = int( result.xpath('./dd/span[5]/text()')[0].replace(',', '')) except: pass params = { 'url': base_url + link, 'title': title, 'seed': seed, 'leech': leech, 'template': 'torrent.html' } # let's try to calculate the torrent size try: filesize_info = result.xpath('./dd/span[3]/text()')[0] filesize, filesize_multiplier = filesize_info.split() filesize = get_torrent_size(filesize, filesize_multiplier) params['filesize'] = filesize except: pass # does our link contain a valid SHA1 sum? if re.compile('[0-9a-fA-F]{40}').match(link): # add a magnet link to the result params['magnetlink'] = 'magnet:?xt=urn:btih:' + link # extract and convert creation date try: date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title') date = datetime.fromtimestamp(float(date_ts)) params['publishedDate'] = date except: pass results.append(params) return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False):
    attribute = id_cache.get(property_id, None)
    if attribute is not None:

        if default_label:
            label = default_label
        else:
            label = extract_text(eval_xpath(attribute, label_xpath))
            label = label[0].upper() + label[1:]

        if date:
            trim = True
            # remove calendar name
            calendar_name = eval_xpath(attribute, calendar_name_xpath)
            for calendar in calendar_name:
                calendar.getparent().remove(calendar)

        concat_values = ""
        values = []
        first_value = None
        for row in eval_xpath(attribute, property_row_xpath):
            if not first_value or not trim or eval_xpath(row, preferred_rank_xpath):
                value = eval_xpath(row, value_xpath)
                if not value:
                    continue

                value = extract_text(value)

                # save first value in case no ranked row is found
                if trim and not first_value:
                    first_value = value
                else:
                    # to avoid duplicate values
                    if value not in values:
                        concat_values += value + ", "
                        values.append(value)

        if trim and not values:
            attributes.append({'label': label,
                               'value': first_value})
        else:
            attributes.append({'label': label,
                               'value': concat_values[:-2]})
def result_to_text(url, text, htmlResult):
    # TODO : remove result ending with "Meaning" or "Category"
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        return extract_text(a[0])
    else:
        return text
def response(resp):
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
        link = eval_xpath(result, './/h3/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        content = extract_text(eval_xpath(result, './/p'))

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # parse results again if nothing is found yet
    for result in eval_xpath(dom, '//li[@class="b_algo"]'):
        link = eval_xpath(result, './/h2/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        content = extract_text(eval_xpath(result, './/p'))

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:
            # Remove the part "from-to" for paginated request ...
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:]

        result_len_container = re.sub('[^0-9]', '', result_len_container)

        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:
        logger.debug('result error :\n%s', e)
        pass

    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
def _fetch_supported_languages(resp):
    # Startpage's language selector is a mess.
    # Each option has a displayed name and a value, either of which may represent the language name
    # in the native script, the language name in English, an English transliteration of the native name,
    # the English name of the writing script used by the language, or occasionally something else entirely.
    # These cases are so special they need to be hardcoded; a couple of them are misspellings.
    language_names = {
        'english_uk': 'en-GB',
        'fantizhengwen': ['zh-TW', 'zh-HK'],
        'hangul': 'ko',
        'malayam': 'ml',
        'norsk': 'nb',
        'sinhalese': 'si',
        'sudanese': 'su'
    }

    # get the English name of every language known by babel
    language_names.update({
        name.lower(): lang_code
        for lang_code, name in Locale('en')._data['languages'].items()
    })

    # get the native name of every language known by babel
    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
        native_name = Locale(lang_code).get_language_name().lower()
        # add native name exactly as it is
        language_names[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if the result is ascii (otherwise "normalization" didn't work)
            language_names[unaccented_name] = lang_code

    dom = html.fromstring(resp.text)
    sp_lang_names = []
    for option in dom.xpath('//form[@id="settings-form"]//select[@name="language"]/option'):
        sp_lang_names.append((option.get('value'), extract_text(option).lower()))

    supported_languages = {}
    for sp_option_value, sp_option_text in sp_lang_names:
        lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
        if isinstance(lang_code, str):
            supported_languages[lang_code] = {'alias': sp_option_value}
        elif isinstance(lang_code, list):
            for lc in lang_code:
                supported_languages[lc] = {'alias': sp_option_value}
        else:
            print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))

    return supported_languages
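# _fetch_supported_languages() above strips accents by NFKD-decomposing the native language name,
# dropping combining marks, and keeping the result only if it is pure ASCII. A small standalone
# demonstration of that technique (standard library only; the helper name is illustrative):
from unicodedata import combining, normalize

def _unaccent_sketch(name):
    """Drop combining marks after NFKD decomposition, e.g. 'français' -> 'francais'."""
    unaccented = ''.join(c for c in normalize('NFKD', name) if not combining(c))
    # only usable as a lookup key if the decomposition removed every non-ASCII character
    return unaccented if len(unaccented) == len(unaccented.encode()) else None

# examples: _unaccent_sketch('français') == 'francais'; _unaccent_sketch('日本語') is None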
def response(resp): results = [] dom = html.fromstring(resp.text) search_res = dom.xpath('//div[@class="one_result"]') # return empty array if nothing is found if not search_res: return [] # parse results for result in search_res: link = result.xpath('.//div[@class="torrent_name"]//a')[0] href = urljoin(url, link.attrib.get('href')) title = extract_text(link) excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0] content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False) # it is better to emit <br/> instead of |, but html tags are verboten content = content.strip().replace('\n', ' | ') content = ' '.join(content.split()) filesize = result.xpath( './/span[@class="torrent_size"]/text()')[0].split()[0] filesize_multiplier = result.xpath( './/span[@class="torrent_size"]/text()')[0].split()[1] files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0] # convert filesize to byte if possible filesize = get_torrent_size(filesize, filesize_multiplier) # convert files to int if possible try: files = int(files) except: files = None magnetlink = result.xpath( './/div[@class="torrent_magnet"]//a')[0].attrib['href'] # append result results.append({ 'url': href, 'title': title, 'content': content, 'filesize': filesize, 'files': files, 'magnetlink': magnetlink, 'template': 'torrent.html' }) # return results sorted by seeder return results
def response(resp): results = [] doc = fromstring(resp.text) # parse results for i, r in enumerate(eval_xpath(doc, result_xpath)): if i >= 30: break try: res_url = eval_xpath(r, url_xpath)[-1] except: continue if not res_url: continue title = extract_text(eval_xpath(r, title_xpath)) content = extract_text(eval_xpath(r, content_xpath)) # append result results.append({"title": title, "content": content, "url": res_url}) if eval_xpath(doc, answer_title_xpath) != []: answer_title = eval_xpath(doc, answer_title_xpath) answer_link = eval_xpath(doc, answer_link_xpath) answer_content = eval_xpath(doc, answer_content_xpath) results.append({ "answer": "stackoverflow", "data": { "title": answer_title, "link": answer_link, "content": answer_content, }, "url": answer_link, }) # parse correction for correction in eval_xpath(doc, correction_xpath): # append correction results.append({"correction": extract_text(correction)}) # return results return results
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):

        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
        if url is None:
            continue
        url = parse_url(url)
        title = extract_text(result.xpath('.//h4/a'))
        content = extract_text(result.xpath('.//p'))
        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)

        item = {
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src
        }

        pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
        ago = AGO_RE.search(pub_date)
        if ago:
            number = int(ago.group(1))
            delta = AGO_TIMEDELTA[ago.group(2)]
            pub_date = datetime.now() - delta * number
        else:
            try:
                pub_date = parser.parse(pub_date)
            except parser.ParserError:
                pub_date = None

        if pub_date is not None:
            item['publishedDate'] = pub_date
        results.append(item)

    for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
        results.append({'suggestion': extract_text(suggestion)})

    return results
def response(resp): results = [] dom = html.fromstring(resp.text) for app in dom.xpath('//a[@class="package-header"]'): app_url = app.xpath('./@href')[0] app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()')) app_content = extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip() \ + ' - ' + extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip() app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0] results.append({'url': app_url, 'title': app_title, 'content': app_content, 'img_src': app_img_src}) return results
def response(resp): """Get response from google's search request""" results = [] detect_google_sorry(resp) # convert the text to dom dom = html.fromstring(resp.text) vidthumb_imgdata = scrap_out_thumbs(dom) # parse results for result in eval_xpath_list(dom, results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) url = eval_xpath_getindex(result, href_xpath, 0) c_node = eval_xpath_getindex(result, content_xpath, 0) # <img id="vidthumb1" ...> img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None) if img_id is None: continue img_src = vidthumb_imgdata.get(img_id, None) if not img_src: logger.error("no vidthumb imgdata for: %s" % img_id) img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0) length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]')) content = extract_text(eval_xpath(c_node, './/div[2]/span')) pub_info = extract_text(eval_xpath(c_node, './/div[2]/div')) results.append({ 'url': url, 'title': title, 'content': content, 'length': length, 'author': pub_info, 'thumbnail': img_src, 'template': 'videos.html', }) # parse suggestion for suggestion in eval_xpath_list(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) return results