Example #1
def u_test_is_link():
    assert not is_link("anyCharactersThatDontEndWithBAD_TYPES", [], [])  # faulty URL
    assert not is_link("", [], [])  # empty string
    assert not is_link("     ", [], [])  # whitespace only
    assert not is_link(";&&#!!!!", [], [])  # special characters
    assert is_link("http://www.example.com", [], [])  # valid link
    assert is_link("https://github.com/viliau", [], [])  # valid link
Example #2
def extractor(url):
    """Extract details from the response body."""
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         proxies, user_agents, failed, processed)
    if clone:
        mirror(url, response)
    matches = rhref.findall(response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Check whether the URL should be crawled
        if is_link(link, processed, files):
            if link[:4] == 'http':
                if link.startswith(main_url):
                    verb('Internal page', link)
                    internal.add(link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:2] == '//':
                if link.split('/')[2].startswith(host):
                    verb('Internal page', link)
                    internal.add(schema + '://' + link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:1] == '/':
                verb('Internal page', link)
                internal.add(remove_file(url) + link)
            else:
                verb('Internal page', link)
                usable_url = remove_file(url)
                if usable_url.endswith('/'):
                    internal.add(usable_url + link)
                elif link.startswith('/'):
                    internal.add(usable_url + link)
                else:
                    internal.add(usable_url + '/' + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        matches = rentropy.findall(response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
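The key-detection step above treats any 16-45 character token whose Shannon entropy is at least 4 bits per character as a candidate secret, since random-looking API keys score much higher than natural-language words. A self-contained sketch of an entropy() consistent with that threshold (the sample body is illustrative, and the pattern mirrors the rentropy regex shown later in Example #3):

import math
import re
from collections import Counter

def entropy(string):
    """Shannon entropy of a string, in bits per character."""
    length = len(string)
    counts = Counter(string)
    return -sum((n / length) * math.log2(n / length) for n in counts.values())

# Candidate secrets: 16-45 word characters or dashes (assumed rentropy pattern)
rentropy = re.compile(r'[\w-]{16,45}')

body = 'token=xK9f2Lq7Rz0mW3pV8sJ1 plus ordinary english words'
for match in rentropy.findall(body):
    if entropy(match) >= 4:  # random-looking strings score high
        print('possible key:', match)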
Example #3
def extractor(url):
    """从响应体中提取具体的信息"""
    response = requester(
        url, main_url, delay, cook, headers, timeout, host, ninja, user_agents,
        failed, processed)  # uses the requester() function from core/requester.py

    matches = re.findall(r'<[aA].*(href|HREF)=([^\s>]+)', response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Check whether these URLs should be crawled
        if is_link(link, processed, files):  # is_link() from core/utils.py
            if link[:4] == 'http':
                if link.startswith(main_url):
                    internal.add(link)
                else:
                    external.add(link)
            elif link[:2] == '//':
                if link.split('/')[2].startswith(host):
                    internal.add(schema + link)
                else:
                    external.add(link)
            elif link[:1] == '/':
                internal.add(main_url + link)
            else:
                internal.add(main_url + '/' + link)

    if not only_urls:
        intel_extractor(response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)  # regxy() from core/utils.py
    if api:
        matches = re.findall(r'[\w-]{16,45}', response)
        for match in matches:
            if entropy(match) >= 4:  # entropy() from core/utils.py
                keys.add(url + ': ' + match)
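The four branches in extractor() implement a compact URL-normalization scheme: absolute http(s) links are split into internal/external on main_url, protocol-relative links are compared by host, and root-relative or bare paths are joined back onto the base. A self-contained sketch of that logic (classify_link, the schema handling, and the sample values are hypothetical, not part of Photon):

def classify_link(link, main_url, host, schema):
    """Mirror the branch order of extractor(); return
    ('internal' | 'external', normalized_url)."""
    if link[:4] == 'http':
        # Absolute URL: internal only when it lives under main_url
        return ('internal', link) if link.startswith(main_url) else ('external', link)
    if link[:2] == '//':
        # Protocol-relative URL: compare the host component
        if link.split('/')[2].startswith(host):
            return 'internal', schema + ':' + link
        return 'external', link
    if link[:1] == '/':
        # Root-relative path: join onto the site root
        return 'internal', main_url + link
    # Bare relative path: join onto the base with a separator
    return 'internal', main_url + '/' + link

# Hypothetical usage
for raw in ('https://example.com/about', '//cdn.other.net/app.js',
            '/login', 'docs/index.html'):
    print(classify_link(raw, 'https://example.com', 'example.com', 'https'))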