def test_get_host():
    """get_host() must return only the hostname component of a full URL."""
    expectations = {
        'https://en.wikipedia.org/wiki/Searx': 'en.wikipedia.org',
        'https://www.wikidata.org/wiki/Wikidata:Main_Page': 'www.wikidata.org',
        'https://en.wikipedia.org/wiki/Metasearch_engine': 'en.wikipedia.org',
    }
    for url, expected_host in expectations.items():
        assert http.get_host(url) == expected_host
async def fetch_one(instance_url):
    """Probe one searx instance over HTTP and collect basic details.

    Returns a ``(instance_url, detail)`` tuple; ``instance_url`` may have been
    rewritten to the redirect target (with a trailing slash) and ``detail``
    holds network type, HTTP status, version, timing and TLS information.
    """
    detail = dict()
    # FIX: `error` was previously unbound (NameError) when new_client()
    # timed out before get() could assign it — initialize it up front.
    error = None
    # FIX: hoisted out of the try block so it is always bound for the
    # NetworkType.NORMAL check below (get_network_type only inspects the URL).
    network_type = get_network_type(instance_url)
    # no cookie ( cookies=DEFAULT_COOKIES, )
    try:
        async with new_client(network_type=network_type) as session:
            response, error = await get(session, instance_url, headers=DEFAULT_HEADERS, timeout=10)
            if response is not None:
                version = await get_searx_version(response)
                detail = {
                    'network_type': network_type.name.lower(),
                    'http': {
                        'status_code': response.status_code,
                        'error': error
                    },
                    'version': version,
                    'timing': {
                        'initial': response.elapsed.total_seconds()
                    },
                    'alternativeUrls': {},
                }
                response_url = str(response.url)
                # add trailing slash
                if not response_url.endswith('/'):
                    response_url = response_url + '/'
                # redirect: remember the original URL and follow the new one
                if response_url != instance_url:
                    # NOTE(review): 'redirect_from' is initialized but never
                    # appended to anywhere in this block — kept for output
                    # compatibility; looks like leftover code, confirm intent.
                    if 'redirect_from' not in detail:
                        detail['redirect_from'] = []
                    detail['alternativeUrls'][instance_url] = 'redirect'
                    instance_url = response_url
            else:
                detail = {
                    'network_type': network_type.name.lower(),
                    'http': {
                        'status_code': None,
                        'error': error
                    },
                    'version': None,
                    'timing': {},
                    'alternativeUrls': {},
                }
    except concurrent.futures.TimeoutError:
        # This exception occurs on new_client()
        detail['error'] = 'Timeout error'
    if error is not None:
        detail['error'] = error
    if network_type == NetworkType.NORMAL:
        detail['tls'] = get_ssl_info(get_host(instance_url))
    return instance_url, detail
async def fetch_one(instance_url: str, private: bool) -> dict:
    """Probe one searx instance and collect HTTP, version, timing and TLS details.

    Despite the annotation, this actually returns a ``(instance_url, detail)``
    tuple; ``instance_url`` may be rewritten to the redirect target.
    TLS info is only gathered for NORMAL-network instances that either
    report a searx version or are marked ``private``.
    """
    # no cookie ( cookies=DEFAULT_COOKIES, )
    network_type = get_network_type(instance_url)
    detail = {
        'network_type': network_type.name.lower(),
        'http': {},
        'version': None,
    }
    try:
        async with new_client(network_type=network_type) as session:
            response, error = await get(session, instance_url, headers=DEFAULT_HEADERS, timeout=10)
            # record the status even when the request produced an error
            status_code = response.status_code if response is not None else None
            detail['http'] = {
                'status_code': status_code,
                'error': error,
            }
            if response is not None:
                response_url = str(response.url)
                # add trailing slash
                if not response_url.endswith('/'):
                    response_url = response_url + '/'
                # redirect: remember the requested URL, then follow the final one
                if 'alternativeUrls' not in detail:
                    detail['alternativeUrls'] = dict()
                if response_url != instance_url:
                    detail['alternativeUrls'][instance_url] = 'redirect from'
                    instance_url = response_url
                # get the searx version (small pause before the extra request)
                if error is None:
                    await asyncio.sleep(0.5)
                    await set_searx_version(detail, session, response_url, response)
                # set initial response time
                detail['timing'] = {}
                response_time_stats = ResponseTimeStats()
                response_time_stats.add_response(response)
                detail['timing']['initial'] = response_time_stats.get()
    except concurrent.futures.TimeoutError:
        # This exception occurs on new_client(); `error` is rebound so the
        # reporting below records the timeout.
        error = 'Timeout error'
    if (detail['version'] is not None or private) and network_type == NetworkType.NORMAL:
        detail['tls'] = get_ssl_info(get_host(instance_url))
    if error is not None:
        detail['http']['error'] = error
        detail['error'] = error
    return instance_url, detail
def normalize_url(url):
    """Canonicalize an instance URL, or return None when it must be skipped.

    Badge/cert-checker links and ``.i2p`` hosts are rejected (None);
    otherwise a trailing ``/search`` path is stripped and the URL is
    normalized to end with exactly one slash.
    """
    skip_prefixes = (
        'https://www.ssllabs.com/',
        'https://hstspreload.org/',
        'https://geti2p.net/',
    )
    if url.startswith(skip_prefixes) or url.endswith('/cert/'):
        return None
    normalized = url
    if normalized.endswith('/'):
        normalized = normalized[:-1]
    if normalized.endswith('/search'):
        normalized = normalized[:-7]
    # Remove .i2p (keep .onion URL)
    if get_host(normalized).endswith('.i2p'):
        return None
    return normalized + '/'
async def get_instance_urls():
    """Scrape the instance wiki page and return the sorted, deduplicated URL list.

    FIX: the final ``return instance_urls`` had been fused into a comment,
    so the function silently returned None — restored.
    """
    instance_urls = []
    # fetch html page
    async with new_client() as session:
        response = await session.get(SEARX_INSTANCES_URL, headers=DEFAULT_HEADERS,
                                     cookies=DEFAULT_COOKIES, timeout=10)
        html = await html_fromstring(response.text)
    # remove content before MARKDOWN_ELEMENTS_XPATH
    for element in MARKDOWN_ELEMENTS_XPATH(html)[0].getchildren():
        text = stringify_children(element)
        if text.lower().find(REMOVE_BEFORE_LOWER_CASE) >= 0:
            break
        element.clear()
    # check all links
    for aelement in INSTANCES_XPATH(html):
        ahref = aelement.get('href')
        # skip badge / cert-checker / i2p-project links
        if ahref.startswith('https://www.ssllabs.com/') or \
           ahref.startswith('https://hstspreload.org/') or \
           ahref.startswith('https://geti2p.net/') or \
           ahref.endswith('/cert/'):
            continue
        if ahref.endswith('/'):
            ahref = ahref[:-1]
        if ahref.endswith('/search'):
            ahref = ahref[:-7]
        # Remove .i2p (keep .onion URL)
        host = get_host(ahref)
        if host.endswith('.i2p'):
            continue
        ahref = ahref + '/'
        instance_urls.append(ahref)
    # remove duplicates and sort
    return sorted(set(instance_urls))
async def fetch_one(url: str) -> dict:
    """Run the TLS grade analysis for one instance host and return its grade info."""
    host = get_host(url)
    grade, grade_url = await analyze(host)
    print('📄 {0:30} {1}'.format(host, grade))
    return {'grade': grade, 'gradeUrl': grade_url}
def fetch_one(searx_stats_result: SearxStatisticsResult, url: str, detail):
    """Attach AS/network information for one instance to its detail dict."""
    instance_host = get_host(url)
    network_detail = get_network_info(searx_stats_result, instance_host)
    detail['network'] = network_detail
    # FIX: the emoji was mojibake ('ЁЯМП' = the UTF-8 bytes of '🌏' decoded
    # as cp866); restored to match the sibling fetchers' output.
    print('🌏 {0:30} {1}'.format(instance_host, network_detail.get('error', '')))
def fetch_one(url: str) -> dict:
    """Run DNSSEC validation for one instance host and return the result."""
    host = get_host(url)
    result = validate(host)
    print('🌏 {0:30} {1}'.format(host, result))
    return result