def response(resp):
    results = []

    # First retrieve notice of each result
    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
                              + 'db=pubmed&retmode=xml&id={pmids_string}'

    pmids_results = etree.XML(resp.content)
    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
    pmids_string = ''

    for item in pmids:
        pmids_string += item.text + ','

    retrieve_notice_args = dict(pmids_string=pmids_string)

    retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)

    search_results_xml = get(retrieve_url_encoded).content
    search_results = etree.XML(search_results_xml).xpath(
        '//PubmedArticleSet/PubmedArticle/MedlineCitation')

    for entry in search_results:
        title = entry.xpath('.//Article/ArticleTitle')[0].text
        pmid = entry.xpath('.//PMID')[0].text
        url = pubmed_url + pmid

        try:
            content = entry.xpath('.//Abstract/AbstractText')[0].text
        except:
            content = gettext('No abstract is available for this publication.')

        # If a DOI is available, add it to the snippet
        try:
            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
        except:
            pass

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        res_dict = {'url': url,
                    'title': title,
                    'content': content}

        try:
            publishedDate = datetime.strptime(entry.xpath('.//DateCreated/Year')[0].text
                                              + '-' + entry.xpath('.//DateCreated/Month')[0].text
                                              + '-' + entry.xpath('.//DateCreated/Day')[0].text,
                                              '%Y-%m-%d')
            res_dict['publishedDate'] = publishedDate
        except:
            pass

        results.append(res_dict)

    return results
def debug_explain_wikidata_query(query, method='GET'):
    if method == 'GET':
        http_response = get(SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query}), headers=get_headers())
    else:
        http_response = post(SPARQL_EXPLAIN_URL, data={'query': query}, headers=get_headers())
    http_response.raise_for_status()
    return http_response.content
def response(resp):
    results = []
    htmlparser = etree.HTMLParser()
    html = fromstring(resp.content.decode("utf-8"), parser=htmlparser)
    search_results = eval_xpath(html, wikidata_ids_xpath)

    if resp.search_params['language'].split('-')[0] == 'all':
        language = 'en'
    else:
        language = match_language(resp.search_params['language'],
                                  supported_languages, language_aliases).split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for search_result in search_results[:result_count]:
        wikidata_id = search_result.split('/')[-1]
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.content.decode("utf-8"))
        results += getDetail(jsonresponse, wikidata_id, language,
                             resp.search_params['language'], htmlparser)

    return results
def duckduckgo(query):
    # duckduckgo autocompleter
    url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'

    resp = loads(get(url.format(urlencode(dict(q=query)))).text)
    if len(resp) > 1:
        return resp[1]
    return []
def wikipedia(query):
    # wikipedia autocompleter
    url = 'https://en.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'  # noqa

    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
    if len(resp) > 1:
        return resp[1]
    return []
def get_vqd(query, headers):
    query_url = site_url.format(query=urlencode({'q': query}))
    res = get(query_url, headers=headers)
    content = res.text
    if content.find('vqd=\'') == -1:
        raise Exception('Request failed')
    vqd = content[content.find('vqd=\'') + 5:]
    vqd = vqd[:vqd.find('\'')]
    return vqd
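# Usage sketch (an assumption for illustration, not taken from the engine
# source): DuckDuckGo's JSON endpoints expect this per-query vqd token as a
# request parameter, so a request() hook would typically attach it to the
# follow-up URL. 'images_api_url' is a hypothetical placeholder.
def _example_build_request_url(query, headers, images_api_url):
    vqd = get_vqd(query, headers)
    return images_api_url + '?' + urlencode({'q': query, 'vqd': vqd})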
def send_wikidata_query(query, method='GET'):
    if method == 'GET':
        # query will be cached by wikidata
        http_response = get(SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query}), headers=get_headers())
    else:
        # query won't be cached by wikidata
        http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers())
    if http_response.status_code != 200:
        logger.debug('SPARQL endpoint error %s', http_response.content.decode())
    logger.debug('request time %s', str(http_response.elapsed))
    http_response.raise_for_status()
    return loads(http_response.content.decode())
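# Usage sketch (illustrative only): assuming the module-level SPARQL_ENDPOINT_URL
# and get_headers() used above are configured, the caller gets back the standard
# SPARQL 1.1 JSON results layout (results -> bindings). The query string here is
# an arbitrary example, not one the engine itself issues.
def _example_wikidata_lookup():
    data = send_wikidata_query('SELECT ?item WHERE { ?item wdt:P31 wd:Q5 } LIMIT 1')
    return [binding['item']['value'] for binding in data['results']['bindings']]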
def request(query, params):
    response_index = get(base_url, headers=params['headers'], raise_for_httperror=True)
    dom = html.fromstring(response_index.text)

    url_params = {'q': query}
    for e in eval_xpath_list(dom, '//input[@type="hidden"]'):
        name = e.get('name')
        value = e.get('value')
        url_params[name] = value

    params['url'] = base_url + '?' + urlencode(url_params)
    params['cookies'] = response_index.cookies
    return params
def google(query):
    # google autocompleter
    autocomplete_url = 'http://suggestqueries.google.com/complete/search?client=toolbar&'  # noqa

    response = get(autocomplete_url + urlencode(dict(q=query)))

    results = []

    if response.ok:
        dom = etree.fromstring(response.text)
        results = dom.xpath('//suggestion/@data')

    return results
def dbpedia(query):
    # dbpedia autocompleter
    autocomplete_url = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?'  # noqa

    response = get(autocomplete_url + urlencode(dict(QueryString=query)))

    results = []

    if response.ok:
        dom = etree.fromstring(response.content)
        results = dom.xpath('//a:Result/a:Label//text()',
                            namespaces={'a': 'http://lookup.dbpedia.org/'})

    return results
def response(resp):
    if resp.status_code == 303:
        return []

    # ping
    headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
    get(url_ping, headers=headers_ping)

    # parse the response
    results = []
    doc = fromstring(resp.text)
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # parse correction
    for correction in eval_xpath(doc, correction_xpath):
        # append correction
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def response(resp):
    results = []
    html = fromstring(resp.text)
    wikidata_ids = html.xpath(wikidata_ids_xpath)

    language = resp.search_params['language'].split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for wikidata_id in wikidata_ids[:result_count]:
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.text)
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results
def response(resp):
    results = []
    html = fromstring(resp.text)
    wikidata_ids = html.xpath(wikidata_ids_xpath)

    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for wikidata_id in wikidata_ids[:result_count]:
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.text)
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results
def _is_url_image(image_url):
    if not isinstance(image_url, str):
        return False

    if image_url.startswith('//'):
        image_url = 'https:' + image_url

    if image_url.startswith('data:'):
        return image_url.startswith('data:image/')

    if not _is_url(image_url):
        return False

    retry = 2

    while retry > 0:
        a = time()
        try:
            poolrequests.set_timeout_for_thread(10.0, time())
            r = poolrequests.get(image_url,
                                 timeout=10.0,
                                 allow_redirects=True,
                                 headers={
                                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
                                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                                     'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                                     'Accept-Encoding': 'gzip, deflate, br',
                                     'DNT': '1',
                                     'Connection': 'keep-alive',
                                     'Upgrade-Insecure-Requests': '1',
                                     'Sec-GPC': '1',
                                     'Cache-Control': 'max-age=0'
                                 })
            if r.headers["content-type"].startswith('image/'):
                return True
            return False
        except requests.exceptions.Timeout:
            logger.error('Timeout for %s: %i', image_url, int(time() - a))
            retry -= 1
        except requests.exceptions.RequestException:
            logger.exception('Exception for %s', image_url)
            return False
    # all retries timed out
    return False
def response(resp):
    results = []
    html = fromstring(resp.text)
    search_results = html.xpath(wikidata_ids_xpath)

    if resp.search_params['language'].split('-')[0] == 'all':
        language = 'en'
    else:
        language = match_language(resp.search_params['language'],
                                  supported_languages, language_aliases).split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for search_result in search_results[:result_count]:
        wikidata_id = search_result.split('/')[-1]
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.text)
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results
def response(resp):
    results = []
    search_res = json.loads(resp.text)

    wikidata_ids = set()
    for r in search_res.get('query', {}).get('search', {}):
        wikidata_ids.add(r.get('title', ''))

    language = resp.search_params['language'].split('_')[0]
    if language == 'all':
        language = 'en'

    url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids),
                                             'languages': language + '|en'}))

    htmlresponse = get(url)
    jsonresponse = json.loads(htmlresponse.content)
    for wikidata_id in wikidata_ids:
        results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results
def get_vqd(query):
    res = get(site_url.format(query=urlencode({'q': query})))
    content = res.text
    vqd = content[content.find('vqd=\'') + 5:]
    vqd = vqd[:vqd.find('\'')]
    return vqd
def load_engine(engine_data):
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warn('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
        logger.exception('Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name, param_value in engine_data.items():
        if param_name == 'engine':
            pass
        elif param_name == 'categories':
            if param_value == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, param_value.split(',')))
        elif param_name == 'proxies':
            engine.proxies = get_proxy_cycles(param_value)
        else:
            setattr(engine, param_name, param_value)

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in ENGINES_LANGUAGES:
        setattr(engine, 'supported_languages', ENGINES_LANGUAGES[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(engine, 'fetch_supported_languages',
                lambda: engine._fetch_supported_languages(get(engine.supported_languages_url)))

    engine.stats = {
        'sent_search_count': 0,  # sent search
        'search_count': 0,  # successful search
        'result_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    engine_type = getattr(engine, 'engine_type', 'online')
    if engine_type != 'offline':
        engine.stats['page_load_time'] = 0
        engine.stats['page_load_count'] = 0

    # tor related settings
    if settings['outgoing'].get('using_tor_proxy'):
        # use onion url if using tor.
        if hasattr(engine, 'onion_url'):
            engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
    elif 'onions' in engine.categories:
        # exclude onion engines if not using tor.
        return None

    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
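# Illustrative input for load_engine() (a sketch: only the keys actually read
# above are certain, the concrete values are made up). Each engine entry,
# presumably taken from the application's settings, arrives as a plain dict;
# keys without special handling, such as 'shortcut', become module attributes
# via setattr().
_example_engine_data = {
    'name': 'wikidata',       # required; must be lowercase, no underscore
    'engine': 'wikidata',     # module name, loaded as '<engine>.py'
    'categories': 'general',  # comma-separated string, split into a list
    'shortcut': 'wd',         # later registered in engine_shortcuts
}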
def get_google_nid_cookie(google_hostname):
    global nid_cookie
    if google_hostname not in nid_cookie:
        resp = get('https://' + google_hostname)
        nid_cookie[google_hostname] = resp.cookies.get("NID", None)
    return nid_cookie[google_hostname]
def get_google_pref_cookie():
    global pref_cookie
    if pref_cookie == '':
        resp = get('https://www.google.com/ncr', allow_redirects=False)
        pref_cookie = resp.cookies["PREF"]
    return pref_cookie
def init(engine_settings=None):  # pylint: disable=unused-argument
    parse_extra_param(get(base_url + extra_param_path).text)
def load_engine(engine_data):
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warn('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError) as e:
        logger.exception('Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(engine, 'fetch_supported_languages',
                lambda: engine._fetch_supported_languages(get(engine.supported_languages_url)))

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    if not engine.offline:
        engine.stats['page_load_time'] = 0
        engine.stats['page_load_count'] = 0

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
def init(engine_settings=None):
    parse_extra_param(
        get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text)