Example #1
    def evaluate_split(self, pre_dist, post_dist):
        pre = []
        for class_value, mass in pre_dist.items():
            pre.append(mass.weight)
        pre_entropy = utils.entropy(pre)

        dist_weights = []
        total_weight = 0.0
        for i in range(len(post_dist)):
            dist_weights.append(self.sum(post_dist[i]))
            total_weight += dist_weights[i]

        frac_count = 0
        for d in dist_weights:
            if d / total_weight > self._min_frac_weight_for_two_branches:
                frac_count += 1

        if frac_count < 2:
            return -np.inf

        post_entropy = 0
        for i in range(len(post_dist)):
            d = post_dist[i]
            post = []
            for class_value, mass in d.items():
                post.append(mass.weight)
            post_entropy += dist_weights[i] * utils.entropy(post)

        if total_weight > 0:
            post_entropy /= total_weight

        return pre_entropy - post_entropy
Example #2
    def evaluate_split(self, pre_dist, post_dist):
        pre = []
        for class_value, mass in pre_dist.items():
            pre.append(mass.weight)
        pre_entropy = utils.entropy(pre)

        dist_weights = []
        total_weight = 0.0
        for i in range(len(post_dist)):
            dist_weights.append(self.sum(post_dist[i]))
            total_weight += dist_weights[i]

        frac_count = 0
        for d in dist_weights:
            if d / total_weight > self._min_frac_weight_for_two_branches:
                frac_count += 1

        if frac_count < 2:
            return -math.inf

        post_entropy = 0
        for i in range(len(post_dist)):
            d = post_dist[i]
            post = []
            for class_value, mass in d.items():
                post.append(mass.weight)
            post_entropy += dist_weights[i] * utils.entropy(post)

        if total_weight > 0:
            post_entropy /= total_weight

        return pre_entropy - post_entropy
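
Both versions of evaluate_split above compute an information gain, pre_entropy - post_entropy, from class-weight vectors via utils.entropy, whose implementation is not part of the snippet. A minimal sketch of a plausible helper, assuming it computes Shannon entropy (in bits) over a vector of non-negative class weights:

import math

def entropy(weights):
    # Shannon entropy of a class-weight vector, normalized by the total weight.
    # Hypothetical stand-in for utils.entropy, which is not shown above.
    total = sum(weights)
    if total <= 0:
        return 0.0
    result = 0.0
    for w in weights:
        if w > 0:
            p = w / total
            result -= p * math.log2(p)
    return result

With a helper like this, evaluate_split returns how many bits of class uncertainty the candidate split removes; splits where fewer than two branches carry the minimum fraction of the total weight return negative infinity so they are never selected.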
Example #3
def evaluate(dataset, weakTokens, tokenDatabase, allTokens, insecureForms):
    done = []
    for i in dataset:
        for url, page in i.items():
            localTokens = set()
            for each in page.values():
                protected = False
                action = each['action']
                method = each['method']
                inputs = each['inputs']
                for inp in inputs:
                    name = inp['name']
                    value = inp['value']
                    if value and match(r'^[\w\-_]+$', value):
                        if entropy(value) > 10:
                            localTokens.add(value)
                            protected = True
                            break
                        elif name.lower() in commonNames:
                            weakTokens.append({url: {name: value}})
                if not protected and action not in done:
                    done.append(action)
                    insecureForms.append({url: each})
            for token in localTokens:
                allTokens.append(token)
            tokenDatabase.append({url: localTokens})
Example #4
def extractForms(url):
    response = requester(url, {}, headers, True, 0).text
    forms = zetanize(url, response)
    for each in forms.values():
        localTokens = set()
        inputs = each['inputs']
        for inp in inputs:
            value = inp['value']
            if value and match(r'^[\w\-_]+$', value):
                if entropy(value) > 10:
                    simTokens.append(value)
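
Examples #3 and #4 gate token collection on entropy(value) > 10, but the entropy() helper itself is not shown and the meaning of the threshold depends on how that project defines it. Purely as an illustrative sketch, a character-frequency Shannon entropy looks like this (note that this per-character form is bounded by log2 of the alphabet size, so a project using a threshold of 10 presumably scales it, for example by the token length):

import math
from collections import Counter

def entropy(string):
    # Character-frequency Shannon entropy in bits per character.
    # Hypothetical stand-in; the helper used by the examples is not shown.
    if not string:
        return 0.0
    length = len(string)
    return -sum((count / length) * math.log2(count / length)
                for count in Counter(string).values())

High-entropy input values are treated as anti-CSRF tokens, while low-entropy values whose field name appears in commonNames are flagged as weak tokens.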
Example #5
def extractor(url):
    """Extract details from the response body."""
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         proxies, user_agents, failed, processed)
    if clone:
        mirror(url, response)
    matches = rhref.findall(response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Checks if the URLs should be crawled
        if is_link(link, processed, files):
            if link[:4] == 'http':
                if link.startswith(main_url):
                    verb('Internal page', link)
                    internal.add(link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:2] == '//':
                if link.split('/')[2].startswith(host):
                    verb('Internal page', link)
                    internal.add(schema + '://' + link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:1] == '/':
                verb('Internal page', link)
                internal.add(remove_file(url) + link)
            else:
                verb('Internal page', link)
                usable_url = remove_file(url)
                if usable_url.endswith('/'):
                    internal.add(usable_url + link)
                elif link.startswith('/'):
                    internal.add(usable_url + link)
                else:
                    internal.add(usable_url + '/' + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        matches = rentropy.findall(response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
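
The precompiled patterns rhref and rentropy used above are defined elsewhere in the project and are not shown. Example #7 below inlines what appear to be the equivalent regexes, so plausible definitions would be:

import re

# Inferred from the inline patterns in Example #7; the project's actual
# definitions may differ.
rhref = re.compile(r'<[aA].*(href|HREF)=([^\s>]+)')  # <a> tags with an href attribute
rentropy = re.compile(r'[\w-]{16,45}')               # candidate API-key-like strings

Because rhref contains two groups, findall returns (attribute, value) tuples, which is why the loop takes link[1] before stripping quotes and in-page anchors.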
Example #7
def extractor(url):
    """从响应体中提取具体的信息"""
    response = requester(
        url, main_url, delay, cook, headers, timeout, host, ninja, user_agents,
        failed, processed)  # uses the requester() function from core/requester.py

    matches = re.findall(r'<[aA].*(href|HREF)=([^\s>]+)', response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Check whether these URLs should be crawled
        if is_link(link, processed,
                   files):  # uses the is_link() function from core/utils.py
            if link[:4] == 'http':
                if link.startswith(main_url):
                    internal.add(link)
                else:
                    external.add(link)
            elif link[:2] == '//':
                if link.split('/')[2].startswith(host):
                    internal.add(schema + link)
                else:
                    external.add(link)
            elif link[:1] == '/':
                internal.add(main_url + link)
            else:
                internal.add(main_url + '/' + link)

    if not only_urls:
        intel_extractor(response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex,
              custom)  # uses the regxy() function from core/utils.py
    if api:
        matches = re.findall(r'[\w-]{16,45}', response)
        for match in matches:
            if entropy(match) >= 4:  # uses the entropy() function from core/utils.py
                keys.add(url + ': ' + match)
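
The branching over 'http', '//', '/', and bare relative links in Examples #5 and #7 normalizes each href against the current page and sorts it into an internal or external set. The same classification can be sketched more compactly with the standard library; the function below is illustrative only and is not part of the original code:

from urllib.parse import urljoin, urlparse

def classify_link(page_url, link, host):
    # Resolve a raw href against the current page and decide whether it points
    # at the crawled host. Illustrative equivalent of the manual string slicing
    # in the examples above.
    link = link.strip('\'"').split('#')[0]  # drop quotes and in-page anchors
    if not link:
        return None
    absolute = urljoin(page_url, link)      # handles http, //, /, and relative forms
    netloc = urlparse(absolute).netloc
    kind = 'internal' if netloc.startswith(host) else 'external'
    return kind, absolute

For instance, classify_link('https://example.com/a/b.html', '../c', 'example.com') resolves to ('internal', 'https://example.com/c').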