async def _find_similar_instances(searx_stats_result: SearxStatisticsResult):
    # group instance urls per ip set
    all_ips_set = dict()
    for url, detail in searx_stats_result.iter_instances(valid_or_private=True,
                                                         network_type=NetworkType.NORMAL):
        ips = set(detail.get('network', {}).get('ips', {}).keys())
        # at least one IP
        if len(ips) > 0:
            # frozenset so it can be used as a key of all_ips_set
            ips = frozenset(ips)
            urls = all_ips_set.setdefault(ips, set())
            urls.add(url)
    # set alternativeUrls
    for ips, urls in all_ips_set.items():
        if len(urls) > 1:
            # only if there are two or more instances sharing the same ips
            for url in urls:
                # for each url, create a reference to all other urls
                detail = searx_stats_result.get_instance(url)
                if 'alternativeUrls' not in detail:
                    detail['alternativeUrls'] = dict()
                for url2 in urls:
                    if url2 != url and url2 not in detail['alternativeUrls']:
                        detail['alternativeUrls'][url2] = 'same IP'
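# A minimal, self-contained sketch of the grouping trick above: a frozenset is
# hashable, so the set of IPs itself can serve as the dict key that buckets
# every URL resolving to the same addresses. The sample data is hypothetical.
def _group_urls_by_ip_set_example():
    instances = {
        'https://searx.example.org/': {'192.0.2.10', '2001:db8::1'},
        'https://searx.example.com/': {'192.0.2.10', '2001:db8::1'},
        'https://other.example.net/': {'192.0.2.20'},
    }
    groups = dict()
    for url, ips in instances.items():
        if ips:
            groups.setdefault(frozenset(ips), set()).add(url)
    # keep only the buckets where two or more URLs share one IP set
    return {ips: urls for ips, urls in groups.items() if len(urls) > 1}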
async def fetch(searx_stats_result: SearxStatisticsResult):
    url_to_deleted = []
    url_to_update = OrderedDict()

    # fetch and store the changes in url_to_deleted and url_to_update
    # do not modify searx_stats_result.instances here, to avoid
    # changing the dict while it is being iterated
    async def fetch_and_store_change(url: str, detail, *_, **__):
        if 'version' not in detail:
            r_url, r_detail = await fetch_one_display(url, searx_stats_result.private)
            dict_merge(r_detail, detail)
            if r_url != url:
                # r_url is the URL after following a HTTP redirect
                # in this case the searx_stats_result.instances[url] must be deleted.
                url_to_deleted.append(url)
            url_to_update[r_url] = r_detail

    instance_iterator = searx_stats_result.iter_instances(only_valid=False, valid_or_private=False)
    await for_each(instance_iterator, fetch_and_store_change, limit=1)

    # apply the changes
    for url in url_to_deleted:
        del searx_stats_result.instances[url]
    for url, detail in url_to_update.items():
        searx_stats_result.update_instance(url, detail)
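# Why fetch() defers its writes: mutating a dict while iterating over it raises
# "RuntimeError: dictionary changed size during iteration". A standalone sketch
# of the same collect-then-apply pattern, with hypothetical data:
def _collect_then_apply_example():
    instances = {'http://a.example/': {'old': True}, 'http://b.example/': {}}
    to_delete, to_update = [], {}
    for url, detail in instances.items():
        if detail.get('old'):
            to_delete.append(url)                     # recorded, not applied yet
            to_update['https://a.example/'] = detail  # e.g. the URL after a redirect
    for url in to_delete:                             # safe: iteration has finished
        del instances[url]
    instances.update(to_update)
    return instances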
def fetch(searx_stats_result: SearxStatisticsResult):
    ressource_hashes = {
        'index': 0
    }

    for network_type in NetworkType:
        fetch_instances(searx_stats_result, network_type, ressource_hashes)

    # create searx_json['hashes']
    searx_stats_result.hashes = [None] * ressource_hashes['index']
    for ressource_hash, ressource_desc in ressource_hashes.items():
        if ressource_hash != 'index':
            i = ressource_desc['index']
            del ressource_desc['index']
            ressource_desc['hash'] = ressource_hash
            searx_stats_result.hashes[i] = ressource_desc

    # detect forks using the static files
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        ressources = detail.get('html', {}).get('ressources')
        if ressources:
            found_forks = find_forks(detail['html']['ressources'],
                                     searx_stats_result.hashes,
                                     searx_stats_result.forks)
            if found_forks and detail['git_url'] not in found_forks:
                detail['git_url'] = found_forks[0]

    # get grade
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        if 'html' in detail:
            html = detail['html']
            html['grade'] = get_grade(html['ressources'], searx_stats_result.hashes)
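# The 'index' key of ressource_hashes is a running counter: each new hash is
# assigned the next position, and the dict is then flattened into a list
# ordered by that position. A standalone sketch of the trick (the way
# fetch_ressource_hashes assigns indexes is assumed to match):
def _hashes_to_list_example():
    ressource_hashes = {'index': 0}
    for content_hash in ('abc123', 'def456'):
        if content_hash not in ressource_hashes:
            ressource_hashes[content_hash] = {'index': ressource_hashes['index']}
            ressource_hashes['index'] += 1
    ordered = [None] * ressource_hashes['index']
    for content_hash, desc in ressource_hashes.items():
        if content_hash != 'index':
            i = desc.pop('index')
            desc['hash'] = content_hash
            ordered[i] = desc
    return ordered  # [{'hash': 'abc123'}, {'hash': 'def456'}]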
async def fetch_from_urls(searx_result: SearxStatisticsResult, instances: list):
    results = OrderedDict()
    for instance in instances:
        # basic checks
        # url may be different because of redirect
        url, detail = await fetch_one(instance)
        searx_result.update_instance(url, detail)

        # output
        http_status_code = detail.get('http', {}).get('status_code', '') or ''
        searx_version = detail.get('version', '') or ''
        timing = detail.get('timing', {}).get('initial') or None
        cert_orgname = (detail.get('tls') or {}).get('certificate', {}).get('organizationName', '')
        error = detail.get('error', '')
        if error != '':
            icon = '❌'
        elif searx_version == '':
            icon = '👽'
        else:
            icon = '🍰'
        if timing:
            timing = '{:.3f}'.format(timing)
        else:
            timing = '     '  # keep the columns aligned when there is no timing
        print('{0:3} {1} {2:20} {3} {4:60} {5:30} {6:50}'.format(
            http_status_code, icon, searx_version, timing, url, cert_orgname, error))
    return results
async def get_searx_stats_result_from_list(instance_urls: list, private: bool) -> SearxStatisticsResult:
    """Fetch searx instances from the given instance_urls list."""
    searx_stats_result = SearxStatisticsResult(private=private)
    for url in instance_urls:
        url = add_slash(url)
        searx_stats_result.update_instance(url, {
            'comments': [],
            'alternativeUrls': dict()
        })
    return searx_stats_result
def get_address_info(searx_stats_result: SearxStatisticsResult, address: str, field_type: str, https_port: bool):
    reverse_dns, reverse_dns_error = dns_query_reverse(address)
    whois_info, whois_info_error = get_whois(address)
    result = {
        'reverse': reverse_dns,
        'field_type': field_type,
    }
    if whois_info is not None:
        asn = whois_info['asn']
        del whois_info['asn']
        result['asn'] = asn
        searx_stats_result.asns[asn] = whois_info
    if reverse_dns_error is not None:
        result['reverse_error'] = reverse_dns_error
    if whois_info_error is not None:
        result['whois_error'] = whois_info_error
    # check https ports
    if https_port:
        https_port, https_port_error = check_https_port(address)
        result['https_port'] = https_port
        if https_port_error is not None:
            result['https_port_error'] = https_port_error
    return result
async def fetch(instance_urls: list, selected_fetchers: list) -> SearxStatisticsResult:
    searx_stats_result = SearxStatisticsResult()

    # initial fetch
    await basic.fetch_from_urls(searx_stats_result, instance_urls)

    # fetch using the selected fetchers
    await fetch_using_fetchers(searx_stats_result, selected_fetchers)

    return searx_stats_result
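# A hypothetical invocation of the orchestrator above: asyncio.run() is the
# standard entry point for coroutines; the instance URL and the empty fetcher
# list are illustrative only.
import asyncio

def _run_fetch_example():
    return asyncio.run(fetch(['https://searx.example.org/'], selected_fetchers=[]))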
def fetch(searx_stats_result: SearxStatisticsResult):
    ressource_hashes = {'index': 0}

    for network_type in NetworkType:
        fetch_instances(searx_stats_result, network_type, ressource_hashes)

    # create searx_json['hashes']
    searx_stats_result.hashes = [None] * ressource_hashes['index']
    for ressource_hash, ressource_desc in ressource_hashes.items():
        if ressource_hash != 'index':
            i = ressource_desc['index']
            del ressource_desc['index']
            ressource_desc['hash'] = ressource_hash
            searx_stats_result.hashes[i] = ressource_desc

    # get grade
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        if 'html' in detail:
            html = detail['html']
            html['grade'] = get_grade(html['ressources'], searx_stats_result.hashes)
async def _check_connectivity(searx_stats_result: SearxStatisticsResult):
    async def get_ip(url):
        async with new_client() as session:
            response, error = await get(session, url)
            if error is None:
                return response.text, None
            else:
                return False, error

    ipv4, ipv4_error = await get_ip(URL_IPV4)
    ipv6, ipv6_error = await get_ip(URL_IPV6)
    searx_stats_result.metadata['ips'] = {}
    if ipv4:
        searx_stats_result.metadata['ips'][ipv4] = get_address_info(searx_stats_result, ipv4, 'A', False)
    else:
        print('⚠️ No IPv4 connectivity ', ipv4_error)
    if ipv6:
        searx_stats_result.metadata['ips'][ipv6] = get_address_info(searx_stats_result, ipv6, 'AAAA', False)
        searx_stats_result.metadata['ipv6'] = True
    else:
        searx_stats_result.metadata['ipv6'] = False
        print('⚠️ No IPv6 connectivity ', ipv6_error)
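# A sketch of the same connectivity probe with plain httpx (new_client() and
# get() above are project helpers); the service URL is an assumption, any
# "what is my IP" endpoint returning the address as plain text would do.
import httpx

async def _probe_ip_example(url='https://ifconfig.co/ip'):  # hypothetical probe URL
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
            return response.text.strip(), None
    except httpx.HTTPError as ex:
        return False, str(ex)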
async def get_searx_stats_result_from_repository() -> SearxStatisticsResult:
    """Fetch searx instances from https://github.com/searx/searx-instances/"""
    searx_stats_result = SearxStatisticsResult(private=False)
    searx_instances = load_searx_instances()
    for url, instance in searx_instances.items():
        url = add_slash(url)
        searx_stats_result.update_instance(url, {
            'comments': instance.comments,
            'alternativeUrls': copy_dict_slash(instance.additional_urls),
            'main': True,
        })
        # each additional URL references the main URL and the other additional URLs
        for aurl, comment in instance.additional_urls.items():
            aurl = add_slash(aurl)
            a_aurls = copy_dict_slash(instance.additional_urls)
            a_aurls[url] = ''
            if aurl in a_aurls:
                del a_aurls[aurl]
            searx_stats_result.update_instance(aurl, {
                'comments': [comment],
                'alternativeUrls': a_aurls
            })
    return searx_stats_result
def get_address_info(searx_stats_result: SearxStatisticsResult, address: str, field_type: str, https_port: bool):
    reverse_dns, reverse_dns_error = dns_query_reverse(address)
    whois_info, whois_info_error = get_whois(address)
    result = {
        'reverse': reverse_dns,
        'field_type': field_type,
    }
    if whois_info is not None:
        # asn_cidr
        asn_cidr = whois_info['asn_cidr']
        del whois_info['asn_cidr']
        # fall back to the network name when there is no ASN description
        if whois_info['asn_description'] is None:
            whois_info['asn_description'] = whois_info['network_name']
        del whois_info['network_name']
        # overwrite the network_country with ip2location
        if MMDB_DATABASE:
            try:
                mmdb_country = MMDB_DATABASE.country(address)
                whois_info['network_country'] = mmdb_country.country.iso_code
            except (ValueError, geoip2.errors.AddressNotFoundError):
                pass
            except Exception as ex:
                print('MMDB Error', exception_to_str(ex))
        # store the CIDR and its whois description
        result['asn_cidr'] = asn_cidr
        if asn_cidr not in searx_stats_result.cidrs:
            searx_stats_result.cidrs[asn_cidr] = whois_info
        else:
            if whois_info != searx_stats_result.cidrs[asn_cidr]:
                print('different asn info\n', whois_info, '\n', searx_stats_result.cidrs[asn_cidr])
    if reverse_dns_error is not None:
        result['reverse_error'] = reverse_dns_error
    if whois_info_error is not None:
        result['whois_error'] = whois_info_error
    # check https ports
    if https_port:
        https_port, https_port_error = check_https_port(address)
        result['https_port'] = https_port
        if https_port_error is not None:
            result['https_port_error'] = https_port_error
    return result
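# A sketch of how the MMDB_DATABASE reader used above might be opened; the
# database path is an assumption, the geoip2 calls are the library's
# documented API (https://github.com/maxmind/GeoIP2-python).
import geoip2.database
import geoip2.errors

def _open_mmdb_example(path='/usr/share/GeoIP/GeoLite2-Country.mmdb'):  # hypothetical path
    reader = geoip2.database.Reader(path)
    try:
        return reader.country('93.184.216.34').country.iso_code  # e.g. 'US'
    except geoip2.errors.AddressNotFoundError:
        return None
    finally:
        reader.close()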
async def fetch(searx_stats_result: SearxStatisticsResult):
    seen_git_url = set()
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        git_url = normalize_git_url(detail['git_url'])
        if git_url and git_url not in seen_git_url:
            try:
                await fetch_hashes_from_url(git_url)
            except Exception as ex:
                print(exception_to_str(ex))
            else:
                if git_url not in searx_stats_result.forks:
                    searx_stats_result.forks.append(git_url)
            seen_git_url.add(git_url)
def get_address_info(searx_stats_result: SearxStatisticsResult, address: str, field_type: str, https_port: bool):
    reverse_dns, reverse_dns_error = dns_query_reverse(address)
    whois_info, whois_info_error = get_whois(address)
    result = {
        'reverse': reverse_dns,
        'field_type': field_type,
    }
    if whois_info is not None:
        # asn_cidr
        asn_cidr = whois_info['asn_cidr']
        del whois_info['asn_cidr']
        # fall back
        if whois_info['asn_description'] is None:
            whois_info['asn_description'] = whois_info['network_name']
        del whois_info['network_name']
        result['asn_cidr'] = asn_cidr
        if asn_cidr not in searx_stats_result.cidrs:
            searx_stats_result.cidrs[asn_cidr] = whois_info
        else:
            if whois_info != searx_stats_result.cidrs[asn_cidr]:
                print('different asn info\n', whois_info, '\n', searx_stats_result.cidrs[asn_cidr])
    if reverse_dns_error is not None:
        result['reverse_error'] = reverse_dns_error
    if whois_info_error is not None:
        result['whois_error'] = whois_info_error
    # check https ports
    if https_port:
        https_port, https_port_error = check_https_port(address)
        result['https_port'] = https_port
        if https_port_error is not None:
            result['https_port_error'] = https_port_error
    return result
def fetch_instances(searx_stats_result: SearxStatisticsResult, network_type: NetworkType, ressource_hashes):
    driver = new_driver(network_type=network_type)
    try:
        for url, detail in searx_stats_result.iter_instances(only_valid=True, network_type=network_type):
            if get_network_type(url) == network_type:
                ressources = fetch_ressource_hashes(driver, url, ressource_hashes, searx_stats_result.forks)
                if 'error' in ressources:
                    # don't reuse the browser if there was an error
                    driver.quit()
                    driver = new_driver(network_type=network_type)
                # temporary storage
                detail['html'] = {
                    'ressources': ressources
                }
                # output progress
                external_js = len(ressources.get('script', []))
                inline_js = len(ressources.get('inline_script', []))
                error_msg = ressources.get('error', '').strip()
                print('🔗 {0:60} {1:3} loaded js {2:3} inline js {3}'.format(url, external_js, inline_js, error_msg))
    finally:
        driver.quit()
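# The restart-on-error pattern above, sketched independently of searxstats;
# make_driver() and fetch_page() are hypothetical stand-ins for new_driver()
# and fetch_ressource_hashes().
def _restart_driver_on_error_example(urls, make_driver, fetch_page):
    driver = make_driver()
    try:
        for url in urls:
            result = fetch_page(driver, url)
            if 'error' in result:
                driver.quit()           # a failed page may leave the browser unusable
                driver = make_driver()  # so start a fresh one for the next URL
    finally:
        driver.quit()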
async def fetch(searx_stats_result: SearxStatisticsResult):
    url_to_deleted = []

    async def fetch_and_set_async(url: str, detail, *_, **__):
        if 'version' not in detail:
            r_url, r_detail = await fetch_one_display(url, searx_stats_result.private)
            dict_merge(r_detail, detail)
            if r_url != url:
                # r_url is the URL after following a HTTP redirect;
                # no other instance can yield this url as its r_url,
                # so it is safe to delete the old entry afterwards
                url_to_deleted.append(url)
            searx_stats_result.update_instance(r_url, r_detail)

    instance_iterator = searx_stats_result.iter_instances(only_valid=False, valid_or_private=False)
    await for_each(instance_iterator, fetch_and_set_async, limit=1)

    for url in url_to_deleted:
        del searx_stats_result.instances[url]
async def _fetch_network(searx_stats_result: SearxStatisticsResult):
    await for_each(searx_stats_result.iter_instances(only_valid=False, network_type=NetworkType.NORMAL),
                   fetch_one, searx_stats_result)
async def fetch(searx_stats_result: SearxStatisticsResult):
    await for_each(searx_stats_result.iter_instances(only_valid=True), fetch_one, searx_stats_result)
async def _fetch_network(searx_stats_result: SearxStatisticsResult):
    await for_each(
        searx_stats_result.iter_instances(valid_or_private=True, network_type=NetworkType.NORMAL),
        fetch_one, searx_stats_result)