Example #1
def test_get_host():
    assert http.get_host('https://en.wikipedia.org/wiki/Searx') == 'en.wikipedia.org'
    assert http.get_host('https://www.wikidata.org/wiki/Wikidata:Main_Page') == 'www.wikidata.org'
    assert http.get_host('https://en.wikipedia.org/wiki/Metasearch_engine') == 'en.wikipedia.org'
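The http.get_host helper under test is not shown here. A minimal sketch of a compatible implementation, assuming it simply extracts the hostname with the standard library (the real helper may handle ports and edge cases differently):

from urllib.parse import urlsplit

def get_host(url):
    # 'https://en.wikipedia.org/wiki/Searx' -> 'en.wikipedia.org'
    return urlsplit(url).hostname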
Example #2
async def fetch_one(instance_url):
    detail = dict()
    error = None  # ensure "error" is bound even if new_client() times out below
    # no cookies (intentionally not passing cookies=DEFAULT_COOKIES)
    try:
        network_type = get_network_type(instance_url)
        async with new_client(network_type=network_type) as session:
            response, error = await get(session,
                                        instance_url,
                                        headers=DEFAULT_HEADERS,
                                        timeout=10)
            if response is not None:
                version = await get_searx_version(response)
                detail = {
                    'network_type': network_type.name.lower(),
                    'http': {
                        'status_code': response.status_code,
                        'error': error
                    },
                    'version': version,
                    'timing': {
                        'initial': response.elapsed.total_seconds()
                    },
                    'alternativeUrls': {},
                }
                response_url = str(response.url)
                # add trailing slash
                if not response_url.endswith('/'):
                    response_url = response_url + '/'
                # redirect
                if response_url != instance_url:
                    detail['alternativeUrls'][instance_url] = 'redirect'
                    instance_url = response_url
            else:
                detail = {
                    'network_type': network_type.name.lower(),
                    'http': {
                        'status_code': None,
                        'error': error
                    },
                    'version': None,
                    'timing': {},
                    'alternativeUrls': {},
                }
    except concurrent.futures.TimeoutError:
        # This exception occurs on new_client()
        detail['error'] = 'Timeout error'

    if error is not None:
        detail['error'] = error

    if network_type == NetworkType.NORMAL:
        detail['tls'] = get_ssl_info(get_host(instance_url))
    return instance_url, detail
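A minimal usage sketch for this coroutine; the instance URL is hypothetical, and the module's helpers (new_client, get, get_searx_version, ...) are assumed to be importable:

import asyncio

async def main():
    # fetch_one() returns the final (possibly redirected) URL plus a detail dict
    url, detail = await fetch_one('https://searx.example.org/')
    print(url, detail.get('http', {}).get('status_code'), detail.get('version'))

asyncio.run(main())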
Example #3
async def fetch_one(instance_url: str, private: bool) -> tuple:
    # no cookies (intentionally not passing cookies=DEFAULT_COOKIES)
    network_type = get_network_type(instance_url)
    detail = {
        'network_type': network_type.name.lower(),
        'http': {},
        'version': None,
    }
    try:
        async with new_client(network_type=network_type) as session:
            response, error = await get(session,
                                        instance_url,
                                        headers=DEFAULT_HEADERS,
                                        timeout=10)
            status_code = response.status_code if response is not None else None
            detail['http'] = {
                'status_code': status_code,
                'error': error,
            }
            if response is not None:
                response_url = str(response.url)
                # add trailing slash
                if not response_url.endswith('/'):
                    response_url = response_url + '/'
                # redirect
                if 'alternativeUrls' not in detail:
                    detail['alternativeUrls'] = dict()
                if response_url != instance_url:
                    detail['alternativeUrls'][instance_url] = 'redirect from'
                    instance_url = response_url

                # get the searx version
                if error is None:
                    await asyncio.sleep(0.5)
                    await set_searx_version(detail, session, response_url,
                                            response)

                # set initial response time
                detail['timing'] = {}
                response_time_stats = ResponseTimeStats()
                response_time_stats.add_response(response)
                detail['timing']['initial'] = response_time_stats.get()
    except concurrent.futures.TimeoutError:
        # This exception occurs on new_client()
        error = 'Timeout error'

    if (detail['version'] is not None
            or private) and network_type == NetworkType.NORMAL:
        detail['tls'] = get_ssl_info(get_host(instance_url))

    if error is not None:
        detail['http']['error'] = error
        detail['error'] = error

    return instance_url, detail
Example #4
def normalize_url(url):
    if url.startswith('https://www.ssllabs.com/') or \
       url.startswith('https://hstspreload.org/') or \
       url.startswith('https://geti2p.net/') or \
       url.endswith('/cert/'):
        return None
    if url.endswith('/'):
        url = url[:-1]
    if url.endswith('/search'):
        url = url[:-7]
    # Remove .i2p (keep .onion URL)
    host = get_host(url)
    if host.endswith('.i2p'):
        return None
    url = url + '/'
    return url
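A quick behavioural check of normalize_url with hypothetical URLs, assuming get_host() returns the URL's hostname:

assert normalize_url('https://searx.example.org') == 'https://searx.example.org/'
assert normalize_url('https://searx.example.org/search') == 'https://searx.example.org/'
assert normalize_url('http://instance.i2p/') is None                   # .i2p hosts are dropped
assert normalize_url('https://www.ssllabs.com/analyze.html') is None   # report links are skipped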
Example #5
async def get_instance_urls():
    instance_urls = []

    # fetch html page
    async with new_client() as session:
        response = await session.get(SEARX_INSTANCES_URL,
                                     headers=DEFAULT_HEADERS,
                                     cookies=DEFAULT_COOKIES,
                                     timeout=10)
    html = await html_fromstring(response.text)
    # remove content before MARKDOWN_ELEMENTS_XPATH
    for element in MARKDOWN_ELEMENTS_XPATH(html)[0].getchildren():
        text = stringify_children(element)
        if text.lower().find(REMOVE_BEFORE_LOWER_CASE) >= 0:
            break
        element.clear()
    # check all links
    for aelement in INSTANCES_XPATH(html):
        ahref = aelement.get('href')
        if ahref.startswith('https://www.ssllabs.com/') or \
           ahref.startswith('https://hstspreload.org/') or \
           ahref.startswith('https://geti2p.net/') or \
           ahref.endswith('/cert/'):
            continue
        if ahref.endswith('/'):
            ahref = ahref[:-1]
        if ahref.endswith('/search'):
            ahref = ahref[:-7]
        # Remove .i2p (keep .onion URL)
        host = get_host(ahref)
        if host.endswith('.i2p'):
            continue
        ahref = ahref + '/'
        instance_urls.append(ahref)

    # remove duplicates
    instance_urls = list(set(instance_urls))

    # sort list
    instance_urls.sort()

    return instance_urls
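A sketch of how these pieces could fit together, assuming the fetch_one() coroutine from Example #2 is available:

import asyncio

async def crawl():
    urls = await get_instance_urls()
    # query every instance concurrently; each task yields a (url, detail) pair
    results = await asyncio.gather(*(fetch_one(url) for url in urls))
    return dict(results)

instances = asyncio.run(crawl())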
Example #6
async def fetch_one(url: str) -> dict:
    instance_host = get_host(url)
    grade, grade_url = await analyze(instance_host)
    print('📄 {0:30} {1}'.format(instance_host, grade))
    return {'grade': grade, 'gradeUrl': grade_url}
Example #7
def fetch_one(searx_stats_result: SearxStatisticsResult, url: str, detail):
    instance_host = get_host(url)
    network_detail = get_network_info(searx_stats_result, instance_host)
    detail['network'] = network_detail
    print('🌏 {0:30} {1}'.format(instance_host, network_detail.get('error', '')))
Example #8
def fetch_one(url: str) -> dict:
    instance_host = get_host(url)
    dnssec_result = validate(instance_host)
    print('🌏 {0:30} {1}'.format(instance_host, dnssec_result))
    return dnssec_result