def response(resp):
    """Parse a DuckDuckGo HTML result page into searx result dicts.

    Relies on the module-level XPath selectors (``result_xpath``,
    ``url_xpath``, ``title_xpath``, ``content_xpath``) and lxml's
    ``fromstring``.
    """
    results = []

    doc = fromstring(resp.text)

    # parse results
    for r in doc.xpath(result_xpath):
        try:
            res_url = r.xpath(url_xpath)[-1]
        except IndexError:
            # was a bare ``except:``; only an empty XPath result set is
            # expected here, so catch exactly that and skip the entry
            continue

        if not res_url:
            continue

        title = html_to_text(''.join(r.xpath(title_xpath)))
        content = html_to_text(''.join(r.xpath(content_xpath)))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # return results
    return results
def response(resp):
    """Parse a Qwant JSON API response into searx result dicts.

    The result template depends on the engine category (web, images,
    social media or news), resolved through the module-level
    ``category_to_keyword`` mapping.
    """
    results = []

    search_results = loads(resp.text)

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    data = search_results.get('data', {})
    res = data.get('result', {})

    # the category keyword is constant for the whole response -> look it
    # up once instead of four times per result item
    keyword = category_to_keyword.get(categories[0], '')

    # parse results
    for result in res.get('items', {}):
        title = html_to_text(result['title'])
        res_url = result['url']
        content = html_to_text(result['desc'])

        if keyword == 'web':
            results.append({'title': title,
                            'content': content,
                            'url': res_url})

        elif keyword == 'images':
            thumbnail_src = result['thumbnail']
            img_src = result['media']
            results.append({'template': 'images.html',
                            'url': res_url,
                            'title': title,
                            'content': '',
                            'thumbnail_src': thumbnail_src,
                            'img_src': img_src})

        elif keyword == 'social':
            # 'date' is a unix timestamp; fromtimestamp(..., None) keeps
            # the datetime naive (local time)
            published_date = datetime.fromtimestamp(result['date'], None)
            img_src = result.get('img', None)
            results.append({'url': res_url,
                            'title': title,
                            'publishedDate': published_date,
                            'content': content,
                            'img_src': img_src})

        elif keyword == 'news':
            published_date = datetime.fromtimestamp(result['date'], None)
            media = result.get('media', [])
            if len(media) > 0:
                img_src = media[0].get('pict', {}).get('url', None)
            else:
                img_src = None
            results.append({'url': res_url,
                            'title': title,
                            'publishedDate': published_date,
                            'content': content,
                            'img_src': img_src})

    return results
def response(resp):
    """Turn the JSON search response into a list of searx result dicts."""
    parsed = loads(resp.text)
    # one result dict per raw entry; title and content are stripped of HTML
    return [{
        'url': _get_url(entry),
        'title': html_to_text(entry['e']['dn']),
        'content': html_to_text(_get_content(entry)),
    } for entry in parsed['results']]
def test_html_to_text(self):
    """html_to_text() keeps only visible text from nested markup."""
    # Markup mixing nested spans, an image and surrounding whitespace;
    # only the 'Test text' inside span.titi is visible text.
    # NOTE(review): the exact indentation inside this literal is
    # presumably normalized away by html_to_text — verify against utils.
    html = """
    <a href="/testlink" class="link_access_account">
        <span class="toto">
            <span>
                <img src="test.jpg" />
            </span>
        </span>
        <span class="titi">
            Test text
        </span>
    </a>
    """
    # ``unicode`` marks this as Python 2 era test code
    self.assertIsInstance(utils.html_to_text(html), unicode)
    self.assertIsNotNone(utils.html_to_text(html))
    self.assertEqual(utils.html_to_text(html), "Test text")
def response(resp):
    """Parse a JSON search response into searx result dicts.

    Result URLs are built from the module-level ``url`` base; a relative
    'logo' path is also prefixed with it.
    """
    results = []
    search_res = loads(resp.text)

    # return empty array if there are no results.
    # default the missing key to 0: ``search_res.get('total')`` could
    # return None, and ``None < 1`` raises TypeError on Python 3
    if search_res.get('total', 0) < 1:
        return []

    # parse results
    for result in search_res['results']:
        if 'id' not in result:
            continue

        # is it thumbnail or img_src??
        thumbnail = None
        if 'logo' in result:
            thumbnail = result['logo']
            if thumbnail[0] == '/':
                # relative path -> prefix with the engine base url
                thumbnail = url + thumbnail

        content = None
        if 'highlights' in result:
            content = result['highlights'][0]['value']

        # append result
        results.append({'url': url + 'structure/' + result['id'],
                        'title': result['label'],
                        # 'thumbnail': thumbnail,
                        'img_src': thumbnail,
                        'content': html_to_text(content)})

    # return results
    return results
def response(resp):
    """Extract results from the JSONP-style response body.

    The payload wraps a JSON array: decode from the first '[{' and drop
    the trailing wrapper characters, then skip the last (non-result)
    element of the decoded array.
    """
    text = resp.text
    entries = loads(text[text.find('[{'):-2])[:-1]

    hits = []
    for entry in entries:
        # entries without a 't' (title) field are not real results
        if not entry.get('t'):
            continue
        hits.append({'title': entry['t'],
                     'content': html_to_text(entry['a']),
                     'url': entry['u']})
    return hits
def response(resp):
    """Parse a DuckDuckGo HTML result page into searx result dicts.

    XPath selectors are local to this function (one deep-web result
    block per '.results_links_deep' div).
    """
    result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
    url_xpath = './/a[@class="large"]/@href'
    title_xpath = './/a[@class="large"]//text()'
    content_xpath = './/div[@class="snippet"]//text()'
    results = []

    doc = fromstring(resp.text)

    for r in doc.xpath(result_xpath):
        try:
            res_url = r.xpath(url_xpath)[-1]
        except IndexError:
            # was a bare ``except:``; only an empty XPath result set is
            # expected here, so catch exactly that and skip the entry
            continue

        if not res_url:
            continue

        title = html_to_text(''.join(r.xpath(title_xpath)))
        content = html_to_text(''.join(r.xpath(content_xpath)))

        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    return results
def extract_text(xpath_results):
    """Recursively convert an lxml XPath result into plain text.

    Accepts a list of results (concatenated recursively), a string
    result, or an element (whose text content is stripped of HTML).
    Raises Exception on an empty result list, as callers expect.
    """
    if isinstance(xpath_results, list):
        # it's a list of results: concatenate everything recursively.
        # isinstance instead of ``type(...) ==``; ''.join instead of
        # repeated '+' (which is quadratic in the number of pieces)
        if not xpath_results:
            raise Exception('Empty url resultset')
        return ''.join(extract_text(e) for e in xpath_results)
    elif isinstance(xpath_results, _ElementStringResult):
        # it's a string
        return ''.join(xpath_results)
    else:
        # it's an element
        return html_to_text(xpath_results.text_content())
def response(resp):
    """Build searx results from the JSON feed (first channel's items)."""
    raw = loads(resp.text)

    # no payload at all -> nothing to return
    if not raw:
        return []

    channels = raw.get('channels', [])
    if len(channels) == 0:
        return []

    out = []
    for item in channels[0].get('items', []):
        if item.get('image'):
            # image result: prefer 'url', fall back to 'link', else skip
            if 'url' in item:
                target = item['url']
            elif 'link' in item:
                target = item['link']
            else:
                continue
            out.append({'url': target,
                        'title': item['title'],
                        'content': '',
                        'img_src': item['image'],
                        'template': 'images.html'})
        else:
            # general result with a parsed publication date
            out.append({'url': item['link'],
                        'title': item['title'],
                        'content': html_to_text(item['description']),
                        'publishedDate': parser.parse(item['pubDate'])})

    # TODO parse video, audio and file results
    return out
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    if not request.args and not request.form:
        return render('index.html')

    try:
        search = Search(request)
    except Exception:
        # was a bare ``except:``; keep the fall-back to the plain index
        # page, but stop trapping SystemExit/KeyboardInterrupt
        return render('index.html')

    # TODO moar refactor - do_search integration into Search class
    search.results, search.suggestions = do_search(search.query,
                                                   request,
                                                   search.engines,
                                                   search.pageno,
                                                   search.lang)

    for result in search.results:
        # enable paging as soon as one engine in the result set supports it
        if not search.paging and engines[result['engine']].paging:
            search.paging = True

        if search.request_data.get('format', 'html') == 'html':
            if 'content' in result:
                result['content'] = highlight_content(result['content'], search.query.encode('utf-8'))  # noqa
            result['title'] = highlight_content(result['title'], search.query.encode('utf-8'))
        else:
            if 'content' in result:
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(html_to_text(result['title'])
                                       .strip().split())

        # shorten over-long urls for display
        if len(result['url']) > 74:
            url_parts = result['url'][:35], result['url'][-35:]
            result['pretty_url'] = u'{0}[...]{1}'.format(*url_parts)
        else:
            result['pretty_url'] = result['url']

        for engine in result['engines']:
            if engine in favicons:
                result['favicon'] = engine

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            if result['publishedDate'].replace(tzinfo=None)\
                    >= datetime.now() - timedelta(days=1):
                # recent result: show a relative "x minutes/hours ago"
                timedifference = datetime.now() - result['publishedDate']\
                    .replace(tzinfo=None)
                minutes = int((timedifference.seconds / 60) % 60)
                hours = int(timedifference.seconds / 60 / 60)
                if hours == 0:
                    result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)  # noqa
                else:
                    result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
            else:
                result['pubdate'] = result['publishedDate']\
                    .strftime('%a, %d %b %Y %H:%M:%S %z')
                result['publishedDate'] = format_date(result['publishedDate'])

    if search.request_data.get('format') == 'json':
        return Response(json.dumps({'query': search.query,
                                    'results': search.results}),
                        mimetype='application/json')
    elif search.request_data.get('format') == 'csv':
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        if search.results:
            csv.writerow(keys)
            for row in search.results:
                row['host'] = row['parsed_url'].netloc
                csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif search.request_data.get('format') == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=search.results,
            q=search.request_data['q'],
            number_of_results=len(search.results),
            base_url=get_base_url()
        )
        return Response(response_rss, mimetype='text/xml')

    return render(
        'results.html',
        results=search.results,
        q=search.request_data['q'],
        selected_categories=search.categories,
        paging=search.paging,
        pageno=search.pageno,
        base_url=get_base_url(),
        suggestions=search.suggestions
    )
def index():
    """Render the index/results page.

    Supported outputs: html, json, csv, rss.
    """
    global categories

    if request.method == 'POST':
        request_data = request.form
    else:
        request_data = request.args
    if not request_data.get('q'):
        return render('index.html')

    selected_categories = []

    query, selected_engines = parse_query(request_data['q'].encode('utf-8'))

    # no engines chosen explicitly in the query -> derive them from the
    # selected categories (form fields, then cookie, then 'general')
    if not len(selected_engines):
        for pd_name, pd in request_data.items():
            if pd_name.startswith('category_'):
                category = pd_name[9:]
                # idiomatic membership test (was ``not category in ...``)
                if category not in categories:
                    continue
                selected_categories.append(category)
        if not len(selected_categories):
            cookie_categories = request.cookies.get('categories', '')
            cookie_categories = cookie_categories.split(',')
            for ccateg in cookie_categories:
                if ccateg in categories:
                    selected_categories.append(ccateg)
        if not len(selected_categories):
            selected_categories = ['general']

        for categ in selected_categories:
            selected_engines.extend({'category': categ, 'name': x.name}
                                    for x in categories[categ])

    results, suggestions = search(query, request, selected_engines)

    featured_results = []

    for result in results:
        if request_data.get('format', 'html') == 'html':
            if 'content' in result:
                result['content'] = highlight_content(result['content'],
                                                      query)
            result['title'] = highlight_content(result['title'], query)
        else:
            if 'content' in result:
                result['content'] = html_to_text(result['content']).strip()
            result['title'] = html_to_text(result['title']).strip()
        # shorten over-long urls for display
        if len(result['url']) > 74:
            url_parts = result['url'][:35], result['url'][-35:]
            result['pretty_url'] = '{0}[...]{1}'.format(*url_parts)
        else:
            result['pretty_url'] = result['url']

        for engine in result['engines']:
            if engine in favicons:
                result['favicon'] = engine

    if request_data.get('format') == 'json':
        return Response(json.dumps({'query': query, 'results': results}),
                        mimetype='application/json')
    elif request_data.get('format') == 'csv':
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        if len(results):
            csv.writerow(keys)
            for row in results:
                row['host'] = row['parsed_url'].netloc
                csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        content_disp = 'attachment;Filename=searx_-_{0}.csv'.format(query)
        response.headers.add('Content-Disposition', content_disp)
        return response
    elif request_data.get('format') == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            q=request_data['q'],
            number_of_results=len(results),
            base_url=get_base_url()
        )
        return Response(response_rss, mimetype='text/xml')

    return render(
        'results.html',
        results=results,
        q=request_data['q'],
        selected_categories=selected_categories,
        number_of_results=len(results) + len(featured_results),
        featured_results=featured_results,
        suggestions=suggestions
    )
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    if not request.args and not request.form:
        return render('index.html', )

    try:
        search = Search(request)
    except Exception:
        # was a bare ``except:``; keep the fall-back to the plain index
        # page, but stop trapping SystemExit/KeyboardInterrupt
        return render('index.html', )

    if plugins.call('pre_search', request, locals()):
        search.search(request)

    plugins.call('post_search', request, locals())

    for result in search.results:
        plugins.call('on_result', request, locals())
        if not search.paging and engines[result['engine']].paging:
            search.paging = True

        if search.request_data.get('format', 'html') == 'html':
            if 'content' in result:
                result['content'] = highlight_content(
                    result['content'], search.query.encode('utf-8'))  # noqa
            result['title'] = highlight_content(result['title'],
                                                search.query.encode('utf-8'))
        else:
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(
                html_to_text(result['title']).strip().split())

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            result['pubdate'] = result['publishedDate'].strftime(
                '%Y-%m-%d %H:%M:%S%z')
            if result['publishedDate'].replace(
                    tzinfo=None) >= datetime.now() - timedelta(days=1):
                # recent result: show relative "x minutes/hours ago"
                timedifference = datetime.now(
                ) - result['publishedDate'].replace(tzinfo=None)
                minutes = int((timedifference.seconds / 60) % 60)
                hours = int(timedifference.seconds / 60 / 60)
                if hours == 0:
                    result['publishedDate'] = gettext(
                        u'{minutes} minute(s) ago').format(
                        minutes=minutes)  # noqa
                else:
                    result['publishedDate'] = gettext(
                        u'{hours} hour(s), {minutes} minute(s) ago').format(
                        hours=hours, minutes=minutes)  # noqa
            else:
                result['publishedDate'] = format_date(result['publishedDate'])

    if search.request_data.get('format') == 'json':
        return Response(json.dumps({
            'query': search.query,
            'results': search.results
        }), mimetype='application/json')
    elif search.request_data.get('format') == 'csv':
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        if search.results:
            csv.writerow(keys)
            for row in search.results:
                row['host'] = row['parsed_url'].netloc
                csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif search.request_data.get('format') == 'rss':
        response_rss = render('opensearch_response_rss.xml',
                              results=search.results,
                              q=search.request_data['q'],
                              number_of_results=len(search.results),
                              base_url=get_base_url())
        return Response(response_rss, mimetype='text/xml')

    return render('results.html',
                  results=search.results,
                  q=search.request_data['q'],
                  selected_categories=search.categories,
                  paging=search.paging,
                  pageno=search.pageno,
                  base_url=get_base_url(),
                  suggestions=search.suggestions,
                  answers=search.answers,
                  infoboxes=search.infoboxes,
                  theme=get_current_theme_name(),
                  favicons=global_favicons[themes.index(
                      get_current_theme_name())])
def response(resp):
    """Build answer/infobox results from a DuckDuckGo Instant Answer payload.

    Collects the heading, abstract/definition text, infobox attributes,
    related URLs and related topics, then appends either a plain result
    (only one known URL, nothing else) or a full infobox result.
    """
    results = []
    search_res = json.loads(resp.text)
    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer != '':
        results.append({'answer': html_to_text(answer)})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image', '')
    image = None if image == '' else image

    # attributes
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox', None)
        if 'content' in infobox:
            for info in infobox.get('content'):
                attributes.append({'label': info.get('label'),
                                   'value': info.get('value')})

    # urls
    for ddg_result in search_res.get('Results', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL', '')
            text = ddg_result.get('Text', '')
            urls.append({'title': text, 'url': firstURL})
            # each FirstURL also becomes a plain result under the heading
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            suggestion = result_to_text(ddg_result.get('FirstURL', None),
                                        ddg_result.get('Text', None),
                                        ddg_result.get('Result', None))
            if suggestion != heading:
                results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            # nested topic group: the suggestions list is shared with the
            # relatedTopics entry appended just below, then filled in place
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''),
                                  'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('FirstURL', None),
                                            topic_result.get('Text', None),
                                            topic_result.get('Result', None))
                if suggestion != heading:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstractURL})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = http_regex.sub('https:', infobox_id)

    # entity
    entity = search_res.get('Entity', None)
    # TODO continent / country / department / location / waterfall /
    # mountain range :
    # link to map search, get weather, near by locations
    # TODO musician : link to music search
    # TODO concert tour : ??
    # TODO film / actor / television / media franchise :
    # links to IMDB / rottentomatoes (or scrap result)
    # TODO music : link tu musicbrainz / last.fm
    # TODO book : ??
    # TODO artist / playwright : ??
    # TODO compagny : ??
    # TODO software / os : ??
    # TODO software engineer : ??
    # TODO prepared food : ??
    # TODO website : ??
    # TODO performing art : ??
    # TODO prepared food : ??
    # TODO programming language : ??
    # TODO file format : ??

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
                len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'],
                            'title': heading,
                            'content': content})
        else:
            results.append({'infobox': heading,
                            'id': infobox_id,
                            'entity': entity,
                            'content': content,
                            'img_src': image,
                            'attributes': attributes,
                            'urls': urls,
                            'relatedTopics': relatedTopics})

    return results
def response(resp):
    """Parse a Qwant JSON API response into searx result dicts.

    Raises SearxEngineCaptchaException on HTTP 429 and
    SearxEngineAPIException when the API reports a non-success status.
    """
    results = []

    # According to https://www.qwant.com/js/app.js
    if resp.status_code == 429:
        raise SearxEngineCaptchaException()

    # raise for other errors
    raise_for_httperror(resp)

    # load JSON result
    search_results = loads(resp.text)

    # check for an API error
    if search_results.get('status') != 'success':
        raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    data = search_results.get('data', {})
    res = data.get('result', {})

    # the category keyword is constant for the whole response -> look it
    # up once instead of three times per result item
    keyword = category_to_keyword.get(categories[0], '')

    # parse results
    for result in res.get('items', {}):
        title = html_to_text(result['title'])
        res_url = result['url']
        content = html_to_text(result['desc'])

        if keyword == 'web':
            results.append({'title': title,
                            'content': content,
                            'url': res_url})

        elif keyword == 'images':
            thumbnail_src = result['thumbnail']
            img_src = result['media']
            results.append({'template': 'images.html',
                            'url': res_url,
                            'title': title,
                            'content': '',
                            'thumbnail_src': thumbnail_src,
                            'img_src': img_src})

        elif keyword == 'news':
            # 'date' is a unix timestamp; fromtimestamp(..., None) keeps
            # the datetime naive (local time)
            published_date = datetime.fromtimestamp(result['date'], None)
            media = result.get('media', [])
            if len(media) > 0:
                img_src = media[0].get('pict', {}).get('url', None)
            else:
                img_src = None
            results.append({'url': res_url,
                            'title': title,
                            'publishedDate': published_date,
                            'content': content,
                            'img_src': img_src})

    return results
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    if not request.args and not request.form:
        return render("index.html")

    try:
        search = Search(request)
    except Exception:
        # was a bare ``except:``; keep the fall-back to the plain index
        # page, but stop trapping SystemExit/KeyboardInterrupt
        return render("index.html")

    if plugins.call("pre_search", request, locals()):
        search.search(request)

    plugins.call("post_search", request, locals())

    for result in search.result_container.get_ordered_results():
        plugins.call("on_result", request, locals())
        if not search.paging and engines[result["engine"]].paging:
            search.paging = True

        if search.request_data.get("format", "html") == "html":
            if "content" in result:
                result["content"] = highlight_content(result["content"], search.query.encode("utf-8"))  # noqa
            result["title"] = highlight_content(result["title"], search.query.encode("utf-8"))
        else:
            if result.get("content"):
                result["content"] = html_to_text(result["content"]).strip()
            # removing html content and whitespace duplications
            result["title"] = " ".join(html_to_text(result["title"]).strip().split())

        result["pretty_url"] = prettify_url(result["url"])

        # TODO, check if timezone is calculated right
        if "publishedDate" in result:
            try:
                # test if publishedDate >= 1900 (datetime module bug)
                result["pubdate"] = result["publishedDate"].strftime("%Y-%m-%d %H:%M:%S%z")
            except ValueError:
                result["publishedDate"] = None
            else:
                if result["publishedDate"].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
                    # recent result: show relative "x minutes/hours ago"
                    timedifference = datetime.now() - result["publishedDate"].replace(tzinfo=None)
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result["publishedDate"] = gettext(u"{minutes} minute(s) ago").format(minutes=minutes)
                    else:
                        result["publishedDate"] = gettext(u"{hours} hour(s), {minutes} minute(s) ago").format(
                            hours=hours, minutes=minutes
                        )  # noqa
                else:
                    result["publishedDate"] = format_date(result["publishedDate"])

    if search.request_data.get("format") == "json":
        return Response(
            json.dumps({"query": search.query, "results": search.result_container.get_ordered_results()}),
            mimetype="application/json",
        )
    elif search.request_data.get("format") == "csv":
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ("title", "url", "content", "host", "engine", "score")
        csv.writerow(keys)
        for row in search.result_container.get_ordered_results():
            row["host"] = row["parsed_url"].netloc
            csv.writerow([row.get(key, "") for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype="application/csv")
        cont_disp = "attachment;Filename=searx_-_{0}.csv".format(search.query.encode("utf-8"))
        response.headers.add("Content-Disposition", cont_disp)
        return response
    elif search.request_data.get("format") == "rss":
        response_rss = render(
            "opensearch_response_rss.xml",
            results=search.result_container.get_ordered_results(),
            q=search.request_data["q"],
            number_of_results=search.result_container.results_length(),
            base_url=get_base_url(),
        )
        return Response(response_rss, mimetype="text/xml")

    return render(
        "results.html",
        results=search.result_container.get_ordered_results(),
        q=search.request_data["q"],
        selected_categories=search.categories,
        paging=search.paging,
        pageno=search.pageno,
        base_url=get_base_url(),
        suggestions=search.result_container.suggestions,
        answers=search.result_container.answers,
        infoboxes=search.result_container.infoboxes,
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())],
    )
def response(resp):
    """Build answer/infobox results from a DuckDuckGo Instant Answer payload.

    Collects heading, abstract/definition text, infobox attributes
    (some converted to external URLs or OpenStreetMap links), related
    URLs and related topics, then appends either a plain result (only
    one known URL, nothing else) or a full infobox result.
    """
    results = []
    search_res = json.loads(resp.text)

    # search_res.get('Entity') possible values (not exhaustive) :
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * compagny

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
        # skip calculator and IP answers (handled elsewhere / not useful)
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer)})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        # relative image path -> absolutize against duckduckgo.com
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            # each FirstURL also becomes a plain result under the heading
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text,
                                            ddg_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            # nested topic group: the suggestions list is shared with the
            # relatedTopics entry appended just below, then filled in place
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''),
                                  'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'),
                                            topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstractURL,
                     'official': True})
        results.append({'url': abstractURL,
                        'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label,
                                 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already an URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label,
                                       'value': area_to_str(data_value),
                                       'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label,
                                           'value': data_value,
                                           'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label,
                                       'value': data_value})

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap',
                             'url': url,
                             'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
                len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'],
                            'title': heading,
                            'content': content})
        else:
            results.append({'infobox': heading,
                            'id': infobox_id,
                            'content': content,
                            'img_src': image,
                            'attributes': attributes,
                            'urls': urls,
                            'relatedTopics': relatedTopics})

    return results
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    paging = False
    lang = 'all'

    # language preference comes from the cookie, validated against the
    # known language codes
    if request.cookies.get('language')\
       and request.cookies['language'] in (x[0] for x in language_codes):
        lang = request.cookies['language']

    if request.method == 'POST':
        request_data = request.form
    else:
        request_data = request.args
    if not request_data.get('q'):
        return render('index.html')

    pageno_param = request_data.get('pageno', '1')
    if not pageno_param.isdigit() or int(pageno_param) < 1:
        return render('index.html')

    pageno = int(pageno_param)

    selected_categories = []

    query, selected_engines = parse_query(request_data['q'].encode('utf-8'))

    if len(selected_engines):
        selected_categories = list(set(engine['category']
                                       for engine in selected_engines))
    else:
        # no engines chosen explicitly -> derive them from the selected
        # categories (form fields, then cookie, then 'general')
        for pd_name, pd in request_data.items():
            if pd_name.startswith('category_'):
                category = pd_name[9:]
                # idiomatic membership test (was ``not category in ...``)
                if category not in categories:
                    continue
                selected_categories.append(category)
        if not len(selected_categories):
            cookie_categories = request.cookies.get('categories', '')
            cookie_categories = cookie_categories.split(',')
            for ccateg in cookie_categories:
                if ccateg in categories:
                    selected_categories.append(ccateg)
        if not len(selected_categories):
            selected_categories = ['general']

        for categ in selected_categories:
            selected_engines.extend({'category': categ, 'name': x.name}
                                    for x in categories[categ])

    results, suggestions = search(query,
                                  request,
                                  selected_engines,
                                  pageno,
                                  lang)

    for result in results:
        # enable paging as soon as one engine in the result set supports it
        if not paging and engines[result['engine']].paging:
            paging = True
        if request_data.get('format', 'html') == 'html':
            if 'content' in result:
                result['content'] = highlight_content(result['content'],
                                                      query)
            result['title'] = highlight_content(result['title'], query)
        else:
            if 'content' in result:
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(html_to_text(result['title'])
                                       .strip().split())
        # shorten over-long urls for display
        if len(result['url']) > 74:
            url_parts = result['url'][:35], result['url'][-35:]
            result['pretty_url'] = '{0}[...]{1}'.format(*url_parts)
        else:
            result['pretty_url'] = result['url']

        for engine in result['engines']:
            if engine in favicons:
                result['favicon'] = engine

    if request_data.get('format') == 'json':
        return Response(json.dumps({'query': query, 'results': results}),
                        mimetype='application/json')
    elif request_data.get('format') == 'csv':
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        if len(results):
            csv.writerow(keys)
            for row in results:
                row['host'] = row['parsed_url'].netloc
                csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        content_disp = 'attachment;Filename=searx_-_{0}.csv'.format(query)
        response.headers.add('Content-Disposition', content_disp)
        return response
    elif request_data.get('format') == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            q=request_data['q'],
            number_of_results=len(results),
            base_url=get_base_url()
        )
        return Response(response_rss, mimetype='text/xml')

    return render(
        'results.html',
        results=results,
        q=request_data['q'],
        selected_categories=selected_categories,
        paging=paging,
        pageno=pageno,
        suggestions=suggestions
    )
def response(resp):
    """Parse a Qwant JSON API response into searx result dicts.

    The result template depends on the engine category (web, images,
    social media or news), resolved through the module-level
    ``category_to_keyword`` mapping.
    """
    results = []

    search_results = loads(resp.text)

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    data = search_results.get('data', {})
    res = data.get('result', {})

    # the category keyword is constant for the whole response -> look it
    # up once instead of four times per result item
    keyword = category_to_keyword.get(categories[0], '')

    # parse results
    for result in res.get('items', {}):
        title = html_to_text(result['title'])
        res_url = result['url']
        content = html_to_text(result['desc'])

        if keyword == 'web':
            results.append({
                'title': title,
                'content': content,
                'url': res_url
            })

        elif keyword == 'images':
            thumbnail_src = result['thumbnail']
            img_src = result['media']
            results.append({
                'template': 'images.html',
                'url': res_url,
                'title': title,
                'content': '',
                'thumbnail_src': thumbnail_src,
                'img_src': img_src
            })

        elif keyword == 'social':
            # 'date' is a unix timestamp; fromtimestamp(..., None) keeps
            # the datetime naive (local time)
            published_date = datetime.fromtimestamp(result['date'], None)
            img_src = result.get('img', None)
            results.append({
                'url': res_url,
                'title': title,
                'publishedDate': published_date,
                'content': content,
                'img_src': img_src
            })

        elif keyword == 'news':
            published_date = datetime.fromtimestamp(result['date'], None)
            media = result.get('media', [])
            if len(media) > 0:
                img_src = media[0].get('pict', {}).get('url', None)
            else:
                img_src = None
            results.append({
                'url': res_url,
                'title': title,
                'publishedDate': published_date,
                'content': content,
                'img_src': img_src
            })

    return results
def test_html_to_text_invalid(self):
    """html_to_text must still extract text from malformed markup."""
    # mismatched tags on purpose: <b> is closed by </i>
    malformed_markup = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
    extracted = utils.html_to_text(malformed_markup)
    self.assertEqual(extracted, "Lorem ipsum")
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    # no query parameters at all -> plain start page
    if not request.args and not request.form:
        return render('index.html')
    try:
        search = Search(request)
    except:
        # NOTE(review): bare except silently hides any parse/validation
        # error in the query and just re-renders the start page
        return render('index.html')

    # TODO moar refactor - do_search integration into Search class
    search.results, search.suggestions = do_search(search.query,
                                                   request,
                                                   search.engines,
                                                   search.pageno,
                                                   search.lang)

    for result in search.results:

        # enable paging in the UI as soon as one engine supports it
        if not search.paging and engines[result['engine']].paging:
            search.paging = True

        # HTML output gets highlighted snippets; all other formats get
        # plain text with tags stripped
        if search.request_data.get('format', 'html') == 'html':
            if 'content' in result:
                result['content'] = highlight_content(result['content'],
                                                      search.query.encode('utf-8'))  # noqa
            result['title'] = highlight_content(result['title'],
                                                search.query.encode('utf-8'))
        else:
            if 'content' in result:
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(html_to_text(result['title'])
                                       .strip().split())

        # shorten long urls for display: keep first and last 35 chars
        if len(result['url']) > 74:
            url_parts = result['url'][:35], result['url'][-35:]
            result['pretty_url'] = u'{0}[...]{1}'.format(*url_parts)
        else:
            result['pretty_url'] = result['url']

        # use the first engine with a known favicon
        for engine in result['engines']:
            if engine in favicons:
                result['favicon'] = engine

    if search.request_data.get('format') == 'json':
        return Response(json.dumps({'query': search.query,
                                    'results': search.results}),
                        mimetype='application/json')
    elif search.request_data.get('format') == 'csv':
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        # header row is only written when there is at least one result
        if search.results:
            csv.writerow(keys)
            for row in search.results:
                row['host'] = row['parsed_url'].netloc
                csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif search.request_data.get('format') == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=search.results,
            q=search.request_data['q'],
            number_of_results=len(search.results),
            base_url=get_base_url()
        )
        return Response(response_rss, mimetype='text/xml')

    # default: full HTML results page
    return render(
        'results.html',
        results=search.results,
        q=search.request_data['q'],
        selected_categories=search.categories,
        paging=search.paging,
        pageno=search.pageno,
        base_url=get_base_url(),
        suggestions=search.suggestions
    )
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    if not request.args and not request.form:
        return render(
            'index.html',
        )

    try:
        search = Search(request)
    except:
        # NOTE(review): bare except hides any error raised while
        # building the Search object
        return render(
            'index.html',
        )

    search.results, search.suggestions,\
        search.answers, search.infoboxes = search.search(request)

    for result in search.results:

        if not search.paging and engines[result['engine']].paging:
            search.paging = True

        # check if HTTPS rewrite is required
        if settings['server']['https_rewrite']\
           and result['parsed_url'].scheme == 'http':

            skip_https_rewrite = False

            # check if HTTPS rewrite is possible
            for target, rules, exclusions in https_rules:

                # check if target regex match with url
                if target.match(result['url']):
                    # process exclusions
                    for exclusion in exclusions:

                        # check if exclusion match with url
                        if exclusion.match(result['url']):
                            skip_https_rewrite = True
                            break

                    # skip https rewrite if required
                    if skip_https_rewrite:
                        break

                    # process rules
                    for rule in rules:
                        try:
                            # TODO, precompile rule
                            p = re.compile(rule[0])

                            # rewrite url if possible
                            new_result_url = p.sub(rule[1], result['url'])
                        except:
                            # a broken rule pattern aborts rule processing
                            break

                        # parse new url
                        new_parsed_url = urlparse(new_result_url)

                        # continiue if nothing was rewritten
                        if result['url'] == new_result_url:
                            continue

                        # get domainname from result
                        # TODO, does only work correct with TLD's like
                        #  asdf.com, not for asdf.com.de
                        # TODO, using publicsuffix instead of this rewrite rule
                        old_result_domainname = '.'.join(
                            result['parsed_url'].hostname.split('.')[-2:])
                        new_result_domainname = '.'.join(
                            new_parsed_url.hostname.split('.')[-2:])

                        # check if rewritten hostname is the same,
                        # to protect against wrong or malicious rewrite rules
                        if old_result_domainname == new_result_domainname:
                            # set new url
                            result['url'] = new_result_url

                    # target has matched, do not search over the other rules
                    break

        if search.request_data.get('format', 'html') == 'html':
            if 'content' in result:
                result['content'] = highlight_content(result['content'],
                                                      search.query.encode('utf-8'))  # noqa
            result['title'] = highlight_content(result['title'],
                                                search.query.encode('utf-8'))
        else:
            if 'content' in result:
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(html_to_text(result['title'])
                                       .strip().split())

        # shorten long urls for display: keep first and last 35 chars
        if len(result['url']) > 74:
            url_parts = result['url'][:35], result['url'][-35:]
            result['pretty_url'] = u'{0}[...]{1}'.format(*url_parts)
        else:
            result['pretty_url'] = result['url']

        for engine in result['engines']:
            if engine in favicons:
                result['favicon'] = engine

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            # results newer than one day get a relative "x ago" label,
            # older ones a formatted date
            if result['publishedDate'].replace(tzinfo=None)\
               >= datetime.now() - timedelta(days=1):
                timedifference = datetime.now() - result['publishedDate']\
                    .replace(tzinfo=None)
                minutes = int((timedifference.seconds / 60) % 60)
                hours = int(timedifference.seconds / 60 / 60)
                if hours == 0:
                    result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)  # noqa
                else:
                    result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
            else:
                result['pubdate'] = result['publishedDate']\
                    .strftime('%a, %d %b %Y %H:%M:%S %z')
                result['publishedDate'] = format_date(result['publishedDate'])

    if search.request_data.get('format') == 'json':
        return Response(json.dumps({'query': search.query,
                                    'results': search.results}),
                        mimetype='application/json')
    elif search.request_data.get('format') == 'csv':
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        # header row only when there are results
        if search.results:
            csv.writerow(keys)
            for row in search.results:
                row['host'] = row['parsed_url'].netloc
                csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif search.request_data.get('format') == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=search.results,
            q=search.request_data['q'],
            number_of_results=len(search.results),
            base_url=get_base_url()
        )
        return Response(response_rss, mimetype='text/xml')

    # default: full HTML results page
    return render(
        'results.html',
        results=search.results,
        q=search.request_data['q'],
        selected_categories=search.categories,
        paging=search.paging,
        pageno=search.pageno,
        base_url=get_base_url(),
        suggestions=search.suggestions,
        answers=search.answers,
        infoboxes=search.infoboxes,
        theme=get_current_theme_name()
    )
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    if not request.args and not request.form:
        return render(
            'index.html',
        )

    try:
        search = Search(request)
    except:
        # NOTE(review): bare except hides any error raised while
        # building the Search object
        return render(
            'index.html',
        )

    # plugins may veto the search entirely from 'pre_search';
    # they receive locals() so they can mutate the local state
    if plugins.call('pre_search', request, locals()):
        search.search(request)

    plugins.call('post_search', request, locals())

    for result in search.result_container.get_ordered_results():

        # 'on_result' plugins see (and may mutate) 'result' via locals()
        plugins.call('on_result', request, locals())
        if not search.paging and engines[result['engine']].paging:
            search.paging = True

        if search.request_data.get('format', 'html') == 'html':
            if 'content' in result:
                result['content'] = highlight_content(result['content'],
                                                      search.query.encode('utf-8'))  # noqa
            result['title'] = highlight_content(result['title'],
                                                search.query.encode('utf-8'))
        else:
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(html_to_text(result['title']).strip().split())

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
            # results newer than one day get a relative "x ago" label
            if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
                timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
                minutes = int((timedifference.seconds / 60) % 60)
                hours = int(timedifference.seconds / 60 / 60)
                if hours == 0:
                    result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)
                else:
                    result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
            else:
                result['publishedDate'] = format_date(result['publishedDate'])

    if search.request_data.get('format') == 'json':
        return Response(json.dumps({'query': search.query,
                                    'results': search.result_container.get_ordered_results()}),
                        mimetype='application/json')
    elif search.request_data.get('format') == 'csv':
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        csv.writerow(keys)
        for row in search.result_container.get_ordered_results():
            row['host'] = row['parsed_url'].netloc
            csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif search.request_data.get('format') == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=search.result_container.get_ordered_results(),
            q=search.request_data['q'],
            number_of_results=search.result_container.results_length(),
            base_url=get_base_url()
        )
        return Response(response_rss, mimetype='text/xml')

    # default: full HTML results page
    return render(
        'results.html',
        results=search.result_container.get_ordered_results(),
        q=search.request_data['q'],
        selected_categories=search.categories,
        paging=search.paging,
        pageno=search.pageno,
        base_url=get_base_url(),
        suggestions=search.result_container.suggestions,
        answers=search.result_container.answers,
        infoboxes=search.result_container.infoboxes,
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())]
    )
def search(self, task):
    """Run the query in *task* against all selected engines.

    Builds one HTTP request per engine, fires them concurrently via
    threaded_requests(), then drains the results queue, separates
    suggestions/answers/infoboxes from plain results, scores and
    merges everything, and stores it on *self*.

    :param task: mapping with at least a 'query' key
    :returns: self (results, suggestions, answers, infoboxes populated)
    """
    global number_of_searches

    # init vars
    requests = []
    results_queue = Queue()
    results = {}

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    # start search-reqest for all selected engines
    for selected_engine in self.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # if paging is not supported, skip
        if self.pageno > 1 and not engine.paging:
            continue

        # if search-language is set and engine does not
        # provide language-support, skip
        if self.lang != 'all' and not engine.language_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = time()
        request_params['pageno'] = self.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = self.lang

        # try:
        #     # 0 = None, 1 = Moderate, 2 = Strict
        #     request_params['safesearch'] = int(request.cookies.get('safesearch'))
        # except Exception:
        request_params['safesearch'] = settings['search']['safe_search']

        # update request parameters dependent on
        # search-engine (contained in engines folder)
        engine.request(task['query'].encode('utf-8'), request_params)

        # update request parameters dependent on
        # search-engine (contained in engines folder)
        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        # create a callback wrapper for the search engine results
        callback = make_callback(
            selected_engine['name'],
            results_queue,
            engine.response,
            request_params)

        # create dictionary which contain all
        # informations about the request
        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=engine.timeout,
            verify=request_params['verify']
        )

        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        # append request to list
        requests.append((req, request_params['url'],
                         request_args,
                         selected_engine['name']))

    if not requests:
        return self
    # send all search-request
    threaded_requests(requests)

    while not results_queue.empty():
        engine_name, engine_results = results_queue.get_nowait()

        # TODO type checks
        # NOTE(review): these comprehensions are used for their side
        # effects -- they pull suggestion/answer entries out of
        # engine_results (remove() returns None, so the condition is
        # a trick to mutate the list during filtering)
        [self.suggestions.append(x['suggestion'])
         for x in list(engine_results)
         if 'suggestion' in x
         and engine_results.remove(x) is None]

        [self.answers.append(x['answer'])
         for x in list(engine_results)
         if 'answer' in x
         and engine_results.remove(x) is None]

        self.infoboxes.extend(x for x in list(engine_results)
                              if 'infobox' in x
                              and engine_results.remove(x) is None)

        results[engine_name] = engine_results

    # update engine-specific stats
    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)

    # score results and remove duplications
    self.results = score_results(results)

    # merge infoboxes according to their ids
    self.infoboxes = merge_infoboxes(self.infoboxes)

    # update engine stats, using calculated score
    for result in self.results:
        plugins.callAPI('on_result', self.plugins, locals())

        for res_engine in result['engines']:
            engines[result['engine']]\
                .stats['score_count'] += result['score']

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')

        if not self.paging and engines[result['engine']].paging:
            self.paging = True

        if 'content' in result:
            result['content_html'] = highlight_content(result['content'],
                                                       self.query.encode('utf-8'))  # noqa
        result['title_html'] = highlight_content(result['title'],
                                                 self.query.encode('utf-8'))

        if result.get('content'):
            result['content'] = html_to_text(result['content']).strip()
        # removing html content and whitespace duplications
        result['title'] = ' '.join(html_to_text(result['title']).strip().split())

    # return results, suggestions, answers and infoboxes
    return self
def response(resp):
    """Extract image results from a Flickr search page.

    The page embeds its data as a JSON "model export" inside the HTML;
    it is pulled out with ``modelexport_re``, decoded, and its 'legend'
    index structure is walked to locate each photo record.

    :param resp: HTTP response object with the Flickr page in ``text``
    :returns: list of image result dicts (template 'images.html')
    """
    results = []

    matches = modelexport_re.search(resp.text)
    if matches is None:
        return results

    match = matches.group(1)
    model_export = loads(match)

    if 'legend' not in model_export:
        return results

    legend = model_export['legend']

    # handle empty page
    if not legend or not legend[0]:
        return results

    for index in legend:
        # each legend entry is a 5-element path into the nested 'main'
        # structure; elements 1 and 4 are numeric indices stored as strings
        photo = model_export['main'][index[0]][int(
            index[1])][index[2]][index[3]][int(index[4])]
        author = ecma_unescape(photo.get('realname', ''))
        source = ecma_unescape(photo.get('username', '')) + ' @ Flickr'
        title = ecma_unescape(photo.get('title', ''))
        content = html_to_text(ecma_unescape(photo.get('description', '')))
        img_src = None

        # From the biggest to the lowest format
        for image_size in image_sizes:
            if image_size in photo['sizes']:
                size_info = photo['sizes'][image_size]
                img_src = size_info['url']
                img_format = 'jpg ' \
                    + str(size_info['width']) \
                    + 'x' \
                    + str(size_info['height'])
                break

        if not img_src:
            logger.debug('cannot find valid image size: {0}'.format(
                repr(photo)))
            continue

        # For a bigger thumbnail, keep only the url_z, not the url_n
        if 'n' in photo['sizes']:
            thumbnail_src = photo['sizes']['n']['url']
        elif 'z' in photo['sizes']:
            thumbnail_src = photo['sizes']['z']['url']
        else:
            thumbnail_src = img_src

        if 'ownerNsid' not in photo:
            # should not happen, disowned photo? Show it anyway
            url = img_src
        else:
            url = build_flickr_url(photo['ownerNsid'], photo['id'])

        # NOTE: the original wrapped the author/title/content assignments
        # in try/except, but plain assignments of already-computed locals
        # cannot raise -- the except branch was dead code and is removed
        results.append({
            'url': url,
            'img_src': img_src,
            'thumbnail_src': thumbnail_src,
            'source': source,
            'img_format': img_format,
            'template': 'images.html',
            'author': author,
            'title': title,
            'content': content
        })

    return results
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    # output_format
    output_format = request.form.get('format', 'html')
    if output_format not in ['html', 'csv', 'json', 'rss']:
        output_format = 'html'

    # check if there is query
    if request.form.get('q') is None:
        if output_format == 'html':
            return render('index.html', )
        else:
            return index_error(output_format, 'No query'), 400

    # search
    search_query = None
    result_container = None
    try:
        search_query = get_search_query_from_webapp(request.preferences,
                                                    request.form)
        # search = Search(search_query) #  without plugins
        search = SearchWithPlugins(search_query, request.user_plugins, request)
        result_container = search.search()
    except Exception as e:
        # log exception
        logger.exception('search error')

        # is it an invalid input parameter or something else ?
        if (issubclass(e.__class__, SearxParameterException)):
            return index_error(output_format, e.message), 400
        else:
            return index_error(output_format, gettext('search error')), 500

    # results
    results = result_container.get_ordered_results()
    number_of_results = result_container.results_number()
    # results_number() is an estimate; when it is below the count of
    # results actually fetched, suppress it
    if number_of_results < result_container.results_length():
        number_of_results = 0

    # UI
    advanced_search = request.form.get('advanced_search', None)

    # output
    for result in results:
        if output_format == 'html':
            # highlight only the first 1024 chars of the content
            if 'content' in result and result['content']:
                result['content'] = highlight_content(
                    escape(result['content'][:1024]), search_query.query)
            result['title'] = highlight_content(escape(result['title']
                                                       or u''),
                                                search_query.query)
        else:
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(
                html_to_text(result['title']).strip().split())

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result['pubdate'] = result['publishedDate'].strftime(
                    '%Y-%m-%d %H:%M:%S%z')
            except ValueError:
                result['publishedDate'] = None
            else:
                # results newer than one day get a relative "x ago" label
                if result['publishedDate'].replace(
                        tzinfo=None) >= datetime.now() - timedelta(days=1):
                    timedifference = datetime.now(
                    ) - result['publishedDate'].replace(tzinfo=None)
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result['publishedDate'] = gettext(
                            u'{minutes} minute(s) ago').format(minutes=minutes)
                    else:
                        result['publishedDate'] = gettext(
                            u'{hours} hour(s), {minutes} minute(s) ago'
                        ).format(hours=hours, minutes=minutes)  # noqa
                else:
                    result['publishedDate'] = format_date(
                        result['publishedDate'])

    if output_format == 'json':
        return Response(json.dumps(
            {
                'query': search_query.query.decode('utf-8'),
                'number_of_results': number_of_results,
                'results': results,
                'answers': list(result_container.answers),
                'corrections': list(result_container.corrections),
                'infoboxes': result_container.infoboxes,
                'suggestions': list(result_container.suggestions),
                'unresponsive_engines': list(
                    result_container.unresponsive_engines)
            },
            # sets are not JSON-serializable; dump them as lists
            default=lambda item: list(item) if isinstance(item, set) else item),
            mimetype='application/json')
    elif output_format == 'csv':
        csv = UnicodeWriter(StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        csv.writerow(keys)
        for row in results:
            row['host'] = row['parsed_url'].netloc
            csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(
            search_query.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif output_format == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            q=request.form['q'],
            number_of_results=number_of_results,
            base_url=get_base_url(),
            override_theme='__common__',
        )
        return Response(response_rss, mimetype='text/xml')

    # default: full HTML results page
    return render('results.html',
                  results=results,
                  q=request.form['q'],
                  selected_categories=search_query.categories,
                  pageno=search_query.pageno,
                  time_range=search_query.time_range,
                  number_of_results=format_decimal(number_of_results),
                  advanced_search=advanced_search,
                  suggestions=result_container.suggestions,
                  answers=result_container.answers,
                  corrections=result_container.corrections,
                  infoboxes=result_container.infoboxes,
                  paging=result_container.paging,
                  unresponsive_engines=result_container.unresponsive_engines,
                  current_language=match_language(
                      search_query.lang,
                      LANGUAGE_CODES,
                      fallback=settings['search']['language']),
                  base_url=get_base_url(),
                  theme=get_current_theme_name(),
                  favicons=global_favicons[themes.index(
                      get_current_theme_name())])
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    if request.form.get("q") is None:
        return render("index.html")

    # search
    search_query = None
    result_container = None
    try:
        search_query = get_search_query_from_webapp(request.preferences, request.form)
        # search = Search(search_query) #  without plugins
        search = SearchWithPlugins(search_query, request)
        result_container = search.search()
    except:
        # NOTE(review): bare except -- any failure (including bad input)
        # is reported as a generic "search error"
        request.errors.append(gettext("search error"))
        logger.exception("search error")
        return render("index.html")

    results = result_container.get_ordered_results()

    # UI
    advanced_search = request.form.get("advanced_search", None)
    output_format = request.form.get("format", "html")
    if output_format not in ["html", "csv", "json", "rss"]:
        output_format = "html"

    # output
    for result in results:
        if output_format == "html":
            # highlight only the first 1024 chars of the content
            if "content" in result and result["content"]:
                result["content"] = highlight_content(
                    escape(result["content"][:1024]), search_query.query.encode("utf-8")
                )
            result["title"] = highlight_content(escape(result["title"] or u""), search_query.query.encode("utf-8"))
        else:
            if result.get("content"):
                result["content"] = html_to_text(result["content"]).strip()
            # removing html content and whitespace duplications
            result["title"] = " ".join(html_to_text(result["title"]).strip().split())

        result["pretty_url"] = prettify_url(result["url"])

        # TODO, check if timezone is calculated right
        if "publishedDate" in result:
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result["pubdate"] = result["publishedDate"].strftime("%Y-%m-%d %H:%M:%S%z")
            except ValueError:
                result["publishedDate"] = None
            else:
                # results newer than one day get a relative "x ago" label
                if result["publishedDate"].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
                    timedifference = datetime.now() - result["publishedDate"].replace(tzinfo=None)
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result["publishedDate"] = gettext(u"{minutes} minute(s) ago").format(minutes=minutes)
                    else:
                        result["publishedDate"] = gettext(u"{hours} hour(s), {minutes} minute(s) ago").format(
                            hours=hours, minutes=minutes
                        )  # noqa
                else:
                    result["publishedDate"] = format_date(result["publishedDate"])

    number_of_results = result_container.results_number()
    # results_number() is an estimate; suppress it when below the count
    # of results actually fetched
    if number_of_results < result_container.results_length():
        number_of_results = 0

    if output_format == "json":
        return Response(
            json.dumps(
                {
                    "query": search_query.query,
                    "number_of_results": number_of_results,
                    "results": results,
                    "answers": list(result_container.answers),
                    "infoboxes": result_container.infoboxes,
                    "suggestions": list(result_container.suggestions),
                }
            ),
            mimetype="application/json",
        )
    elif output_format == "csv":
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ("title", "url", "content", "host", "engine", "score")
        csv.writerow(keys)
        for row in results:
            row["host"] = row["parsed_url"].netloc
            csv.writerow([row.get(key, "") for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype="application/csv")
        cont_disp = "attachment;Filename=searx_-_{0}.csv".format(search_query.query.encode("utf-8"))
        response.headers.add("Content-Disposition", cont_disp)
        return response
    elif output_format == "rss":
        response_rss = render(
            "opensearch_response_rss.xml",
            results=results,
            q=request.form["q"],
            number_of_results=number_of_results,
            base_url=get_base_url(),
        )
        return Response(response_rss, mimetype="text/xml")

    # default: full HTML results page
    return render(
        "results.html",
        results=results,
        q=request.form["q"],
        selected_categories=search_query.categories,
        pageno=search_query.pageno,
        time_range=search_query.time_range,
        number_of_results=format_decimal(number_of_results),
        advanced_search=advanced_search,
        suggestions=result_container.suggestions,
        answers=result_container.answers,
        infoboxes=result_container.infoboxes,
        paging=result_container.paging,
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())],
    )
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.
    """
    # output_format
    output_format = request.form.get('format', 'html')
    if output_format not in ['html', 'csv', 'json', 'rss']:
        output_format = 'html'

    # check if there is query
    if request.form.get('q') is None:
        if output_format == 'html':
            return render(
                'index.html',
            )
        else:
            return index_error(output_format, 'No query'), 400

    # search
    search_query = None
    result_container = None
    try:
        search_query = get_search_query_from_webapp(request.preferences, request.form)
        # search = Search(search_query) #  without plugins
        search = SearchWithPlugins(search_query, request.user_plugins, request)
        result_container = search.search()
    except Exception as e:
        # log exception
        logger.exception('search error')

        # is it an invalid input parameter or something else ?
        if (issubclass(e.__class__, SearxParameterException)):
            return index_error(output_format, e.message), 400
        else:
            return index_error(output_format, gettext('search error')), 500

    # results
    results = result_container.get_ordered_results()
    number_of_results = result_container.results_number()
    # results_number() is an estimate; suppress it when below the count
    # of results actually fetched
    if number_of_results < result_container.results_length():
        number_of_results = 0

    # UI
    advanced_search = request.form.get('advanced_search', None)

    # output
    for result in results:
        if output_format == 'html':
            # highlight only the first 1024 chars of the content
            if 'content' in result and result['content']:
                result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
            result['title'] = highlight_content(escape(result['title'] or u''), search_query.query)
        else:
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(html_to_text(result['title']).strip().split())

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
            except ValueError:
                result['publishedDate'] = None
            else:
                # results newer than one day get a relative "x ago" label
                if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
                    timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)
                    else:
                        result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
                else:
                    result['publishedDate'] = format_date(result['publishedDate'])

    if output_format == 'json':
        return Response(json.dumps({'query': search_query.query.decode('utf-8'),
                                    'number_of_results': number_of_results,
                                    'results': results,
                                    'answers': list(result_container.answers),
                                    'corrections': list(result_container.corrections),
                                    'infoboxes': result_container.infoboxes,
                                    'suggestions': list(result_container.suggestions),
                                    'unresponsive_engines': list(result_container.unresponsive_engines)},
                                   # sets are not JSON-serializable; dump them as lists
                                   default=lambda item: list(item) if isinstance(item, set) else item),
                        mimetype='application/json')
    elif output_format == 'csv':
        csv = UnicodeWriter(StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        csv.writerow(keys)
        for row in results:
            row['host'] = row['parsed_url'].netloc
            csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif output_format == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            q=request.form['q'],
            number_of_results=number_of_results,
            base_url=get_base_url(),
            override_theme='__common__',
        )
        return Response(response_rss, mimetype='text/xml')

    # default: full HTML results page
    return render(
        'results.html',
        results=results,
        q=request.form['q'],
        selected_categories=search_query.categories,
        pageno=search_query.pageno,
        time_range=search_query.time_range,
        number_of_results=format_decimal(number_of_results),
        advanced_search=advanced_search,
        suggestions=result_container.suggestions,
        answers=result_container.answers,
        corrections=result_container.corrections,
        infoboxes=result_container.infoboxes,
        paging=result_container.paging,
        unresponsive_engines=result_container.unresponsive_engines,
        current_language=match_language(search_query.lang,
                                        LANGUAGE_CODES,
                                        fallback=settings['search']['language']),
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())]
    )
def search():
    """Search query in q and return results.

    Supported outputs: html, json, csv, rss.

    Reads the query and options from ``request.form``; renders the result
    page (or the empty index page when no query is given), or serializes the
    results for the machine-readable output formats.
    """
    # output_format: anything unknown silently falls back to 'html'
    output_format = request.form.get('format', 'html')
    if output_format not in ['html', 'csv', 'json', 'rss']:
        output_format = 'html'

    # check if there is query (not None and not an empty string)
    if not request.form.get('q'):
        if output_format == 'html':
            # no query: show the plain index page with the user's saved prefs
            return render(
                'index.html',
                advanced_search=request.preferences.get_value('advanced_search'),
                selected_categories=get_selected_categories(request.preferences, request.form),
            )
        else:
            # machine-readable formats get an explicit error payload
            return index_error(output_format, 'No query'), 400

    # search
    search_query = None
    raw_text_query = None
    result_container = None
    try:
        search_query, raw_text_query, _, _ = get_search_query_from_webapp(
            request.preferences, request.form)
        # search = Search(search_query) # without plugins
        search = SearchWithPlugins(search_query, request.user_plugins, request)
        result_container = search.search()
    except SearxParameterException as e:
        # invalid user-supplied parameter: report the message with a 400
        logger.exception('search error: SearxParameterException')
        return index_error(output_format, e.message), 400
    except Exception as e:
        # any other failure is logged and reported generically (500)
        logger.exception('search error')
        return index_error(output_format, gettext('search error')), 500

    # results
    results = result_container.get_ordered_results()
    number_of_results = result_container.results_number()
    # a reported total smaller than the results actually held is bogus;
    # zero it out rather than display a misleading count
    if number_of_results < result_container.results_length():
        number_of_results = 0

    # checking for an external bang (e.g. "!!g query" redirects directly)
    if result_container.redirect_url:
        return redirect(result_container.redirect_url)

    # Server-Timing header
    request.timings = result_container.get_timings()

    # output: per-result post-processing depends on the output format
    for result in results:
        if output_format == 'html':
            # HTML output: escape, truncate and highlight the query terms
            if 'content' in result and result['content']:
                result['content'] = highlight_content(
                    escape(result['content'][:1024]), search_query.query)
            if 'title' in result and result['title']:
                result['title'] = highlight_content(
                    escape(result['title'] or ''), search_query.query)
        else:
            # non-HTML output: strip markup instead of highlighting
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(
                html_to_text(result['title']).strip().split())

        if 'url' in result:
            result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        # do not try to get a date from an empty string or a None type
        if result.get('publishedDate'):
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result['pubdate'] = result['publishedDate'].strftime(
                    '%Y-%m-%d %H:%M:%S%z')
            except ValueError:
                result['publishedDate'] = None
            else:
                # anything newer than one day ago is shown as a relative age
                if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
                    timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result['publishedDate'] = gettext(
                            '{minutes} minute(s) ago').format(minutes=minutes)
                    else:
                        result['publishedDate'] = gettext(
                            '{hours} hour(s), {minutes} minute(s) ago').format(
                                hours=hours, minutes=minutes)  # noqa
                else:
                    # older results get a locale-formatted absolute date
                    result['publishedDate'] = format_date(
                        result['publishedDate'])

    if output_format == 'json':
        # sets (e.g. answers/suggestions) are not JSON-serializable;
        # the default= hook converts them to lists
        return Response(
            json.dumps(
                {
                    'query': search_query.query,
                    'number_of_results': number_of_results,
                    'results': results,
                    'answers': list(result_container.answers),
                    'corrections': list(result_container.corrections),
                    'infoboxes': result_container.infoboxes,
                    'suggestions': list(result_container.suggestions),
                    'unresponsive_engines': __get_translated_errors(
                        result_container.unresponsive_engines)
                },  # noqa
                default=lambda item: list(item) if isinstance(item, set) else item),
            mimetype='application/json')
    elif output_format == 'csv':
        csv = UnicodeWriter(StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
        csv.writerow(keys)
        # plain results carry all columns; answers/suggestions/corrections
        # are emitted as rows with only 'title' and a distinguishing 'type'
        for row in results:
            row['host'] = row['parsed_url'].netloc
            row['type'] = 'result'
            csv.writerow([row.get(key, '') for key in keys])
        for a in result_container.answers:
            row = {'title': a, 'type': 'answer'}
            csv.writerow([row.get(key, '') for key in keys])
        for a in result_container.suggestions:
            row = {'title': a, 'type': 'suggestion'}
            csv.writerow([row.get(key, '') for key in keys])
        for a in result_container.corrections:
            row = {'title': a, 'type': 'correction'}
            csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(
            search_query.query)
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif output_format == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            answers=result_container.answers,
            corrections=result_container.corrections,
            suggestions=result_container.suggestions,
            q=request.form['q'],
            number_of_results=number_of_results,
            base_url=get_base_url(),
            override_theme='__common__',
        )
        return Response(response_rss, mimetype='text/xml')

    # HTML output format

    # suggestions: use RawTextQuery to get the suggestion URLs with the same bang
    suggestion_urls = list(
        map(
            lambda suggestion: {
                'url': raw_text_query.changeQuery(suggestion).getFullQuery(),
                'title': suggestion
            },
            result_container.suggestions))

    correction_urls = list(
        map(
            lambda correction: {
                'url': raw_text_query.changeQuery(correction).getFullQuery(),
                'title': correction
            },
            result_container.corrections))

    # render the regular HTML results page
    return render(
        'results.html',
        results=results,
        q=request.form['q'],
        selected_categories=search_query.categories,
        pageno=search_query.pageno,
        time_range=search_query.time_range,
        number_of_results=format_decimal(number_of_results),
        suggestions=suggestion_urls,
        answers=result_container.answers,
        corrections=correction_urls,
        infoboxes=result_container.infoboxes,
        engine_data=result_container.engine_data,
        paging=result_container.paging,
        unresponsive_engines=__get_translated_errors(
            result_container.unresponsive_engines),
        current_language=match_language(
            search_query.lang,
            LANGUAGE_CODES,
            fallback=request.preferences.get_value("language")),
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())],
        timeout_limit=request.form.get('timeout_limit', None))
def response(resp):
    """Parse a DuckDuckGo Instant Answer API JSON response.

    Produces, depending on what the payload contains: an 'answer' result,
    plain link results, 'suggestion' results, and either a single normal
    result or an 'infobox' result built from the abstract/definition data.
    """
    results = []

    search_res = json.loads(resp.text)

    # accumulated infobox fields
    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer != '':
        results.append({'answer': html_to_text(answer)})

    # add infobox: concatenate Definition and Abstract into the content text
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image: normalize empty string to None so the infobox check below works
    image = search_res.get('Image', '')
    image = None if image == '' else image

    # attributes: label/value pairs from the Infobox section
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox', None)
        if 'content' in infobox:
            for info in infobox.get('content'):
                attributes.append({'label': info.get('label'),
                                   'value': info.get('value')})

    # urls: each primary result is also appended as a plain result
    # (titled with the page heading)
    for ddg_result in search_res.get('Results', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL', '')
            text = ddg_result.get('Text', '')
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics: flat entries become suggestions, grouped 'Topics'
    # entries are collected for the infobox's relatedTopics section
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            suggestion = result_to_text(ddg_result.get('FirstURL', None),
                                        ddg_result.get('Text', None),
                                        ddg_result.get('Result', None))
            # skip a suggestion identical to the heading itself
            if suggestion != heading:
                results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''),
                                  'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('FirstURL', None),
                                            topic_result.get('Text', None),
                                            topic_result.get('Result', None))
                if suggestion != heading:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstractURL})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})

    # to merge with wikidata's infobox: force the id to https
    if infobox_id:
        infobox_id = http_regex.sub('https:', infobox_id)

    # entity
    entity = search_res.get('Entity', None)
    # TODO continent / country / department / location / waterfall /
    #      mountain range :
    #      link to map search, get weather, near by locations
    # TODO musician : link to music search
    # TODO concert tour : ??
    # TODO film / actor / television / media franchise :
    #      links to IMDB / rottentomatoes (or scrap result)
    # TODO music : link to musicbrainz / last.fm
    # TODO book : ??
    # TODO artist / playwright : ??
    # TODO company : ??
    # TODO software / os : ??
    # TODO software engineer : ??
    # TODO prepared food : ??
    # TODO website : ??
    # TODO performing art : ??
    # TODO prepared food : ??
    # TODO programming language : ??
    # TODO file format : ??

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        # when the payload carries nothing but a single link and no content,
        # emit a normal result instead of a (near-empty) infobox
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
           len(relatedTopics) == 0 and len(content) == 0:
            results.append({
                'url': urls[0]['url'],
                'title': heading,
                'content': content
            })
        else:
            results.append({
                'infobox': heading,
                'id': infobox_id,
                'entity': entity,
                'content': content,
                'img_src': image,
                'attributes': attributes,
                'urls': urls,
                'relatedTopics': relatedTopics
            })

    return results
def index():
    """Render index page.

    Supported outputs: html, json, csv, rss.

    When ``q`` is absent, renders the bare index page. Otherwise runs the
    search and renders/serializes the results according to the requested
    output format.
    """
    if request.form.get('q') is None:
        return render(
            'index.html',
        )

    # search
    search_query = None
    result_container = None
    try:
        search_query = get_search_query_from_webapp(request.preferences,
                                                    request.form)
        # search = Search(search_query) # without plugins
        search = SearchWithPlugins(search_query, request)
        result_container = search.search()
    # NOTE: narrowed from a bare `except:` — a bare except also swallows
    # KeyboardInterrupt / SystemExit / GeneratorExit and can prevent a
    # clean shutdown; search failures are still caught and logged here.
    except Exception:
        request.errors.append(gettext('search error'))
        logger.exception('search error')
        return render(
            'index.html',
        )

    results = result_container.get_ordered_results()

    # UI
    advanced_search = request.form.get('advanced_search', None)
    output_format = request.form.get('format', 'html')
    if output_format not in ['html', 'csv', 'json', 'rss']:
        output_format = 'html'

    # output: per-result post-processing depends on the output format
    for result in results:
        if output_format == 'html':
            # HTML output: truncate content and highlight the query terms
            if 'content' in result and result['content']:
                result['content'] = highlight_content(
                    result['content'][:1024],
                    search_query.query.encode('utf-8'))
            result['title'] = highlight_content(
                result['title'], search_query.query.encode('utf-8'))
        else:
            # non-HTML output: strip markup instead of highlighting
            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # removing html content and whitespace duplications
            result['title'] = ' '.join(
                html_to_text(result['title']).strip().split())

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            try:  # test if publishedDate >= 1900 (datetime module bug)
                result['pubdate'] = result['publishedDate'].strftime(
                    '%Y-%m-%d %H:%M:%S%z')
            except ValueError:
                result['publishedDate'] = None
            else:
                # anything newer than one day ago is shown as a relative age
                if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
                    timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
                    minutes = int((timedifference.seconds / 60) % 60)
                    hours = int(timedifference.seconds / 60 / 60)
                    if hours == 0:
                        result['publishedDate'] = gettext(
                            u'{minutes} minute(s) ago').format(minutes=minutes)
                    else:
                        result['publishedDate'] = gettext(
                            u'{hours} hour(s), {minutes} minute(s) ago').format(
                                hours=hours, minutes=minutes)  # noqa
                else:
                    # older results get a locale-formatted absolute date
                    result['publishedDate'] = format_date(
                        result['publishedDate'])

    number_of_results = result_container.results_number()
    # a reported total smaller than the results actually held is bogus;
    # zero it out rather than display a misleading count
    if number_of_results < result_container.results_length():
        number_of_results = 0

    if output_format == 'json':
        return Response(json.dumps({'query': search_query.query,
                                    'number_of_results': number_of_results,
                                    'results': results}),
                        mimetype='application/json')
    elif output_format == 'csv':
        csv = UnicodeWriter(cStringIO.StringIO())
        keys = ('title', 'url', 'content', 'host', 'engine', 'score')
        csv.writerow(keys)
        for row in results:
            row['host'] = row['parsed_url'].netloc
            csv.writerow([row.get(key, '') for key in keys])
        csv.stream.seek(0)
        response = Response(csv.stream.read(), mimetype='application/csv')
        cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(
            search_query.query.encode('utf-8'))
        response.headers.add('Content-Disposition', cont_disp)
        return response
    elif output_format == 'rss':
        response_rss = render(
            'opensearch_response_rss.xml',
            results=results,
            q=request.form['q'],
            number_of_results=number_of_results,
            base_url=get_base_url()
        )
        return Response(response_rss, mimetype='text/xml')

    # HTML output format
    return render(
        'results.html',
        results=results,
        q=request.form['q'],
        selected_categories=search_query.categories,
        pageno=search_query.pageno,
        time_range=search_query.time_range,
        number_of_results=format_decimal(number_of_results),
        advanced_search=advanced_search,
        suggestions=result_container.suggestions,
        answers=result_container.answers,
        infoboxes=result_container.infoboxes,
        paging=result_container.paging,
        base_url=get_base_url(),
        theme=get_current_theme_name(),
        favicons=global_favicons[themes.index(get_current_theme_name())]
    )