Ejemplo n.º 1
0
async def _find_similar_instances(searx_stats_result: SearxStatisticsResult):
    """Cross-link instances that resolve to the same set of IP addresses.

    Every group of two or more instances sharing an identical IP set gets
    each member recorded in the others' 'alternativeUrls' as 'same IP'.
    """
    # group instance urls per (frozen) IP set, so the set can be a dict key
    urls_by_ipset = {}
    for url, detail in searx_stats_result.iter_instances(
            valid_or_private=True, network_type=NetworkType.NORMAL):
        ip_set = frozenset(detail.get('network', {}).get('ips', {}).keys())
        # only instances with at least one IP are considered
        if ip_set:
            urls_by_ipset.setdefault(ip_set, set()).add(url)
    # set alternativeUrls for every group sharing the same IPs
    for urls in urls_by_ipset.values():
        if len(urls) < 2:
            continue
        for url in urls:
            # for each url, create a reference to all other urls
            detail = searx_stats_result.get_instance(url)
            alternatives = detail.setdefault('alternativeUrls', dict())
            for other_url in urls:
                if other_url != url and other_url not in alternatives:
                    alternatives[other_url] = 'same IP'
Ejemplo n.º 2
0
async def fetch(searx_stats_result: SearxStatisticsResult):
    """Fetch display info for every instance without a version.

    Deletions and updates are buffered so searx_stats_result.instances is
    not modified while it is being iterated; they are applied at the end.
    """
    deleted_urls = []
    pending_updates = OrderedDict()

    async def fetch_and_store_change(url: str, detail, *_, **__):
        # only fetch instances that have no version information yet
        if 'version' in detail:
            return
        r_url, r_detail = await fetch_one_display(
            url, searx_stats_result.private)
        dict_merge(r_detail, detail)
        if r_url != url:
            # r_url is the URL after following a HTTP redirect; the entry
            # stored under the original url must be deleted afterwards.
            deleted_urls.append(url)
        pending_updates[r_url] = r_detail

    await for_each(
        searx_stats_result.iter_instances(only_valid=False,
                                          valid_or_private=False),
        fetch_and_store_change, limit=1)

    # apply the buffered changes
    for url in deleted_urls:
        del searx_stats_result.instances[url]
    for url, detail in pending_updates.items():
        searx_stats_result.update_instance(url, detail)
Ejemplo n.º 3
0
def fetch(searx_stats_result: SearxStatisticsResult):
    """Collect static-file hashes, detect forks and grade each instance."""
    ressource_hashes = {'index': 0}

    for network_type in NetworkType:
        fetch_instances(searx_stats_result, network_type, ressource_hashes)

    # build searx_stats_result.hashes from the collected descriptions;
    # 'index' is the running counter, every other key is an actual hash
    searx_stats_result.hashes = [None] * ressource_hashes['index']
    for a_hash, desc in ressource_hashes.items():
        if a_hash == 'index':
            continue
        position = desc.pop('index')
        desc['hash'] = a_hash
        searx_stats_result.hashes[position] = desc

    # detect fork using the static files
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        if detail.get('html', {}).get('ressources'):
            found_forks = find_forks(detail['html']['ressources'],
                                     searx_stats_result.hashes,
                                     searx_stats_result.forks)
            if found_forks and detail['git_url'] not in found_forks:
                detail['git_url'] = found_forks[0]

    # get grade
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        html = detail.get('html')
        if html is not None:
            html['grade'] = get_grade(html['ressources'],
                                      searx_stats_result.hashes)
Ejemplo n.º 4
0
async def fetch_from_urls(searx_result: SearxStatisticsResult,
                          instances: list):
    """Fetch each instance in *instances*, store it in *searx_result* and
    print a one-line status summary per instance.

    Returns an OrderedDict (currently always empty, kept for interface
    compatibility with callers that expect a mapping back).
    """
    results = OrderedDict()
    for instance in instances:
        # basic checks
        # url may be different because of redirect
        url, detail = await fetch_one(instance)
        searx_result.update_instance(url, detail)

        # output
        # FIX: detail.get('http') returns None when the key is missing,
        # so chaining .get(...) on it raised AttributeError; guard with an
        # empty dict fallback.
        http_status_code = (detail.get('http') or {}).get('status_code', '') or ''
        searx_version = detail.get('version', '') or ''
        timing = detail.get('timing', {}).get('initial') or None
        cert_orgname = (detail.get('tls')
                        or {}).get('certificate',
                                   {}).get('organizationName', '')
        error = detail.get('error', '')
        # pick an icon: error, no detected searx version, or all good
        if error != '':
            icon = '❌'
        elif searx_version == '':
            icon = '👽'
        else:
            icon = '🍰'
        if timing:
            timing = '{:.3f}'.format(timing)
        else:
            timing = '     '
        print('{0:3} {1} {2:20} {3} {4:60} {5:30} {6:50}'.format(
            http_status_code, icon, searx_version, timing, url, cert_orgname,
            error))
    return results
Ejemplo n.º 5
0
async def get_searx_stats_result_from_list(
        instance_urls: list, private: bool) -> SearxStatisticsResult:
    """
    Fetch searx instances from instance_urls given parameter.
    """
    result = SearxStatisticsResult(private=private)
    for instance_url in instance_urls:
        # normalize: every instance URL ends with a slash
        result.update_instance(add_slash(instance_url), {
            'comments': [],
            'alternativeUrls': dict(),
        })
    return result
Ejemplo n.º 6
0
def get_address_info(searx_stats_result: SearxStatisticsResult, address: str, field_type: str, https_port: bool):
    """Gather reverse-DNS, whois and (optionally) HTTPS-port information
    for *address*; ASN details are stored once in searx_stats_result.asns.
    """
    reverse_dns, reverse_dns_error = dns_query_reverse(address)
    whois_info, whois_info_error = get_whois(address)

    result = {
        'reverse': reverse_dns,
        'field_type': field_type,
    }

    if whois_info is not None:
        # keep only the ASN number in the per-address result; the full
        # whois record is shared via searx_stats_result.asns
        asn = whois_info.pop('asn')
        result['asn'] = asn
        searx_stats_result.asns[asn] = whois_info

    if reverse_dns_error is not None:
        result['reverse_error'] = reverse_dns_error
    if whois_info_error is not None:
        result['whois_error'] = whois_info_error

    # probe the HTTPS port when requested
    if https_port:
        port_open, https_port_error = check_https_port(address)
        result['https_port'] = port_open
        if https_port_error is not None:
            result['https_port_error'] = https_port_error

    return result
Ejemplo n.º 7
0
async def fetch(instance_urls: list,
                selected_fetchers: list) -> SearxStatisticsResult:
    """Build a SearxStatisticsResult for *instance_urls*: an initial basic
    fetch followed by every fetcher in *selected_fetchers*."""
    result = SearxStatisticsResult()
    # initial fetch
    await basic.fetch_from_urls(result, instance_urls)
    # enrich the result with the selected fetchers
    await fetch_using_fetchers(result, selected_fetchers)
    return result
Ejemplo n.º 8
0
def fetch(searx_stats_result: SearxStatisticsResult):
    """Collect static-file hashes for every network type and grade the
    valid instances from them."""
    ressource_hashes = {'index': 0}

    for network_type in NetworkType:
        fetch_instances(searx_stats_result, network_type, ressource_hashes)

    # build searx_stats_result.hashes; 'index' is the running counter,
    # every other key of ressource_hashes is an actual hash
    searx_stats_result.hashes = [None] * ressource_hashes['index']
    for a_hash, desc in ressource_hashes.items():
        if a_hash == 'index':
            continue
        position = desc.pop('index')
        desc['hash'] = a_hash
        searx_stats_result.hashes[position] = desc

    # get grade
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        html = detail.get('html')
        if html is not None:
            html['grade'] = get_grade(html['ressources'],
                                      searx_stats_result.hashes)
Ejemplo n.º 9
0
async def _check_connectivity(searx_stats_result: SearxStatisticsResult):
    """Record the fetcher's own IPv4/IPv6 addresses and connectivity.

    Address details go to metadata['ips']; metadata['ipv6'] records whether
    IPv6 connectivity is available.
    """
    async def get_ip(url):
        # ask an external "what is my IP" endpoint; returns (ip, None) on
        # success or (False, error) on failure
        async with new_client() as session:
            response, error = await get(session, url)
        if error is None:
            return response.text, None
        return False, error

    ipv4, ipv4_error = await get_ip(URL_IPV4)
    ipv6, ipv6_error = await get_ip(URL_IPV6)
    searx_stats_result.metadata['ips'] = {}
    if ipv4:
        searx_stats_result.metadata['ips'][ipv4] = get_address_info(searx_stats_result, ipv4, 'A', False)
    else:
        # FIX: the warning emoji was mojibake ('тЪая╕П', a mis-decoded
        # UTF-8 sequence); restored the intended '⚠️'
        print('⚠️ No IPv4 connectivity ', ipv4_error)
    if ipv6:
        searx_stats_result.metadata['ips'][ipv6] = get_address_info(searx_stats_result, ipv6, 'AAAA', False)
        searx_stats_result.metadata['ipv6'] = True
    else:
        searx_stats_result.metadata['ipv6'] = False
        print('⚠️ No IPv6 connectivity ', ipv6_error)
Ejemplo n.º 10
0
async def get_searx_stats_result_from_repository() -> SearxStatisticsResult:
    """
    Fetch searx instances from https://github.com/searx/searx-instances/
    """
    result = SearxStatisticsResult(private=False)
    for url, instance in load_searx_instances().items():
        url = add_slash(url)
        result.update_instance(
            url, {
                'comments': instance.comments,
                'alternativeUrls': copy_dict_slash(instance.additional_urls),
                'main': True,
            })
        # register every additional URL as its own instance, pointing back
        # at the main URL and at the other additional URLs (not itself)
        for additional_url, comment in instance.additional_urls.items():
            additional_url = add_slash(additional_url)
            alternatives = copy_dict_slash(instance.additional_urls)
            alternatives[url] = ''
            alternatives.pop(additional_url, None)
            result.update_instance(additional_url, {
                'comments': [comment],
                'alternativeUrls': alternatives
            })

    return result
Ejemplo n.º 11
0
def get_address_info(searx_stats_result: SearxStatisticsResult, address: str,
                     field_type: str, https_port: bool):
    """Gather reverse-DNS, whois (shared per CIDR) and optional HTTPS-port
    information for *address*; the country is refined via the MMDB database
    when available."""
    reverse_dns, reverse_dns_error = dns_query_reverse(address)
    whois_info, whois_info_error = get_whois(address)

    result = {
        'reverse': reverse_dns,
        'field_type': field_type,
    }

    if whois_info is not None:
        # the CIDR is kept in the per-address result; the rest of the
        # whois record is shared via searx_stats_result.cidrs
        asn_cidr = whois_info.pop('asn_cidr')

        # fall back to the network name when there is no ASN description
        if whois_info['asn_description'] is None:
            whois_info['asn_description'] = whois_info['network_name']
        del whois_info['network_name']

        # overwrite the network_country with ip2location when available
        if MMDB_DATABASE:
            try:
                mmdb_country = MMDB_DATABASE.country(address)
                whois_info['network_country'] = mmdb_country.country.iso_code
            except (ValueError, geoip2.errors.AddressNotFoundError):
                # unknown address: keep the whois country
                pass
            except Exception as ex:
                print('MMDB Error', exception_to_str(ex))

        result['asn_cidr'] = asn_cidr
        if asn_cidr not in searx_stats_result.cidrs:
            searx_stats_result.cidrs[asn_cidr] = whois_info
        elif whois_info != searx_stats_result.cidrs[asn_cidr]:
            # same CIDR seen with different whois data: report, keep first
            print('different asn info\n', whois_info, '\n',
                  searx_stats_result.cidrs[asn_cidr])

    if reverse_dns_error is not None:
        result['reverse_error'] = reverse_dns_error
    if whois_info_error is not None:
        result['whois_error'] = whois_info_error

    # probe the HTTPS port when requested
    if https_port:
        port_open, https_port_error = check_https_port(address)
        result['https_port'] = port_open
        if https_port_error is not None:
            result['https_port_error'] = https_port_error

    return result
Ejemplo n.º 12
0
async def fetch(searx_stats_result: SearxStatisticsResult):
    """Fetch commit hashes for every distinct git repository used by the
    valid instances; each reachable repository is registered as a fork."""
    processed_git_urls = set()
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        git_url = normalize_git_url(detail['git_url'])
        if not git_url or git_url in processed_git_urls:
            continue
        processed_git_urls.add(git_url)
        try:
            await fetch_hashes_from_url(git_url)
        except Exception as ex:
            # best effort: an unreachable repository is only reported
            print(exception_to_str(ex))
        else:
            if git_url not in searx_stats_result.forks:
                searx_stats_result.forks.append(git_url)
Ejemplo n.º 13
0
def get_address_info(searx_stats_result: SearxStatisticsResult, address: str,
                     field_type: str, https_port: bool):
    """Gather reverse-DNS, whois (shared per CIDR) and optional HTTPS-port
    information for *address*."""
    reverse_dns, reverse_dns_error = dns_query_reverse(address)
    whois_info, whois_info_error = get_whois(address)

    result = {
        'reverse': reverse_dns,
        'field_type': field_type,
    }

    if whois_info is not None:
        # the CIDR is kept in the per-address result; the rest of the
        # whois record is shared via searx_stats_result.cidrs
        asn_cidr = whois_info.pop('asn_cidr')

        # fall back to the network name when there is no ASN description
        if whois_info['asn_description'] is None:
            whois_info['asn_description'] = whois_info['network_name']
        del whois_info['network_name']

        result['asn_cidr'] = asn_cidr
        if asn_cidr not in searx_stats_result.cidrs:
            searx_stats_result.cidrs[asn_cidr] = whois_info
        elif whois_info != searx_stats_result.cidrs[asn_cidr]:
            # same CIDR seen with different whois data: report, keep first
            print('different asn info\n', whois_info, '\n',
                  searx_stats_result.cidrs[asn_cidr])

    if reverse_dns_error is not None:
        result['reverse_error'] = reverse_dns_error
    if whois_info_error is not None:
        result['whois_error'] = whois_info_error

    # probe the HTTPS port when requested
    if https_port:
        port_open, https_port_error = check_https_port(address)
        result['https_port'] = port_open
        if https_port_error is not None:
            result['https_port_error'] = https_port_error

    return result
Ejemplo n.º 14
0
def fetch_instances(searx_stats_result: SearxStatisticsResult, network_type: NetworkType, ressource_hashes):
    """Fetch static-file hashes for every valid instance of *network_type*
    with a selenium driver, storing them under detail['html']['ressources'].
    """
    driver = new_driver(network_type=network_type)
    try:
        for url, detail in searx_stats_result.iter_instances(only_valid=True, network_type=network_type):
            if get_network_type(url) != network_type:
                continue
            ressources = fetch_ressource_hashes(driver, url, ressource_hashes, searx_stats_result.forks)
            if 'error' in ressources:
                # don't reuse the browser if there was an error
                driver.quit()
                driver = new_driver(network_type=network_type)
            # temporary storage
            detail['html'] = {'ressources': ressources}
            # output progress
            print('🔗 {0:60} {1:3} loaded js {2:3} inline js  {3}'.format(
                url,
                len(ressources.get('script', [])),
                len(ressources.get('inline_script', [])),
                ressources.get('error', '').strip()))
    finally:
        driver.quit()
Ejemplo n.º 15
0
async def fetch(searx_stats_result: SearxStatisticsResult):
    """Fetch display info for every instance without a version, updating the
    result in place and dropping entries replaced by a redirect target."""
    redirected_urls = []

    async def fetch_and_set_async(url: str, detail, *_, **__):
        # only fetch instances that have no version information yet
        if 'version' in detail:
            return
        r_url, r_detail = await fetch_one_display(
            url, searx_stats_result.private)
        dict_merge(r_detail, detail)
        if r_url != url:
            # r_url is the result of following a HTTP redirect, so it can
            # never equal the original url here
            redirected_urls.append(url)
        searx_stats_result.update_instance(r_url, r_detail)

    await for_each(
        searx_stats_result.iter_instances(only_valid=False,
                                          valid_or_private=False),
        fetch_and_set_async, limit=1)

    # drop the entries whose URL was replaced by the redirect target
    for url in redirected_urls:
        del searx_stats_result.instances[url]
Ejemplo n.º 16
0
async def _fetch_network(searx_stats_result: SearxStatisticsResult):
    """Fetch network info for every NORMAL-network instance (even invalid)."""
    instances = searx_stats_result.iter_instances(
        only_valid=False, network_type=NetworkType.NORMAL)
    await for_each(instances, fetch_one, searx_stats_result)
Ejemplo n.º 17
0
async def fetch(searx_stats_result: SearxStatisticsResult):
    """Run fetch_one over every valid instance."""
    instances = searx_stats_result.iter_instances(only_valid=True)
    await for_each(instances, fetch_one, searx_stats_result)
Ejemplo n.º 18
0
async def _fetch_network(searx_stats_result: SearxStatisticsResult):
    """Fetch network info for the valid/private NORMAL-network instances."""
    instances = searx_stats_result.iter_instances(
        valid_or_private=True, network_type=NetworkType.NORMAL)
    await for_each(instances, fetch_one, searx_stats_result)