def evaluate_split(self, pre_dist, post_dist):
    # Entropy of the class distribution before the split.
    pre = []
    for class_value, mass in pre_dist.items():
        pre.append(pre_dist[class_value].weight)
    pre_entropy = utils.entropy(pre)
    # Total weight falling into each candidate branch.
    dist_weights = []
    total_weight = 0.0
    for i in range(len(post_dist)):
        dist_weights.append(self.sum(post_dist[i]))
        total_weight += dist_weights[i]
    # Require at least two branches that carry a meaningful fraction of the weight.
    frac_count = 0
    for d in dist_weights:
        if d / total_weight > self._min_frac_weight_for_two_branches:
            frac_count += 1
    if frac_count < 2:
        return -np.Inf
    # Weighted average entropy of the class distributions after the split.
    post_entropy = 0
    for i in range(len(post_dist)):
        d = post_dist[i]
        post = []
        for class_value, mass in d.items():
            post.append(mass.weight)
        post_entropy += dist_weights[i] * utils.entropy(post)
    if total_weight > 0:
        post_entropy /= total_weight
    # Information gain: entropy reduction achieved by the split.
    return pre_entropy - post_entropy
def evaluate_split(self, pre_dist, post_dist):
    pre = []
    for class_value, mass in pre_dist.items():
        pre.append(pre_dist[class_value].weight)
    pre_entropy = utils.entropy(pre)
    dist_weights = []
    total_weight = 0.0
    for i in range(len(post_dist)):
        dist_weights.append(self.sum(post_dist[i]))
        total_weight += dist_weights[i]
    frac_count = 0
    for d in dist_weights:
        if d / total_weight > self._min_frac_weight_for_two_branches:
            frac_count += 1
    if frac_count < 2:
        return -math.inf
    post_entropy = 0
    for i in range(len(post_dist)):
        d = post_dist[i]
        post = []
        for class_value, mass in d.items():
            post.append(mass.weight)
        post_entropy += dist_weights[i] * utils.entropy(post)
    if total_weight > 0:
        post_entropy /= total_weight
    return pre_entropy - post_entropy
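# Both evaluate_split() variants above delegate the entropy computation to a
# utils.entropy() helper that is not shown here. As a minimal sketch (an
# assumption about that helper, not its confirmed implementation), Shannon
# entropy in bits over a list of non-negative class weights looks like this:
import math

def shannon_entropy(weights):
    """Shannon entropy, in bits, of a class-weight distribution."""
    total = sum(weights)
    if total <= 0:
        return 0.0
    return -sum((w / total) * math.log2(w / total) for w in weights if w > 0)

# With such a helper, a perfectly separating binary split over a 50/50 class
# distribution scores an information gain of 1.0 bit:
# shannon_entropy([10, 10])  -> 1.0 (pre-split entropy)
# shannon_entropy([10, 0])   -> 0.0 per branch, so gain = 1.0 - 0.0 = 1.0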
def evaluate(dataset, weakTokens, tokenDatabase, allTokens, insecureForms):
    done = []
    for i in dataset:
        for url, page in i.items():
            localTokens = set()
            for each in page.values():
                protected = False
                action = each['action']
                method = each['method']
                inputs = each['inputs']
                for inp in inputs:
                    name = inp['name']
                    value = inp['value']
                    if value and match(r'^[\w\-_]+$', value):
                        # A sufficiently random value is treated as a CSRF token.
                        if entropy(value) > 10:
                            localTokens.add(value)
                            protected = True
                            break
                    elif name.lower() in commonNames:
                        # A recognisable token field name carrying a weak value.
                        weakTokens.append({url: {name: value}})
                if not protected and action not in done:
                    done.append(action)  # remember this form action as already reported
                    insecureForms.append({url: each})
            for token in localTokens:
                allTokens.append(token)
            tokenDatabase.append({url: localTokens})
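# The entropy(value) > 10 check above accepts a value as a CSRF token only when
# it looks sufficiently random. The entropy() helper itself is not shown;
# per-character Shannon entropy tops out around 6-7 bits for printable tokens,
# so a threshold of 10 only makes sense if the score scales with token length.
# The sketch below assumes that convention (per-character entropy multiplied by
# length); the real helper may compute its score differently.
import math

def token_entropy(token):
    """Approximate total randomness of a token, in bits (assumed scoring scheme)."""
    if not token:
        return 0.0
    counts = {}
    for ch in token:
        counts[ch] = counts.get(ch, 0) + 1
    per_char = -sum((c / len(token)) * math.log2(c / len(token)) for c in counts.values())
    return per_char * len(token)

# token_entropy('a94f2c71b8e04d3f') is far above 10, while token_entropy('true')
# is about 8, so short or repetitive values are rejected.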
def extractForms(url):
    response = requester(url, {}, headers, True, 0).text
    forms = zetanize(url, response)
    for each in forms.values():
        localTokens = set()
        inputs = each['inputs']
        for inp in inputs:
            value = inp['value']
            if value and match(r'^[\w\-_]+$', value):
                if entropy(value) > 10:
                    simTokens.append(value)
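# extractForms() only works if zetanize() returns forms in the shape these two
# functions index: a dict of forms, each with 'action', 'method' and a list of
# 'inputs' carrying 'name'/'value' pairs. A hypothetical example of that
# structure, inferred from the lookups above rather than taken from the parser:
example_forms = {
    0: {
        'action': '/login',
        'method': 'POST',
        'inputs': [
            {'name': 'username', 'value': ''},
            {'name': 'csrf_token', 'value': 'a94f2c71b8e04d3f9c2e55d1'},
        ],
    },
}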
def extractor(url): """Extract details from the response body.""" response = requester(url, main_url, delay, cook, headers, timeout, host, proxies, user_agents, failed, processed) if clone: mirror(url, response) matches = rhref.findall(response) for link in matches: # Remove everything after a "#" to deal with in-page anchors link = link[1].replace('\'', '').replace('"', '').split('#')[0] # Checks if the URLs should be crawled if is_link(link, processed, files): if link[:4] == 'http': if link.startswith(main_url): verb('Internal page', link) internal.add(link) else: verb('External page', link) external.add(link) elif link[:2] == '//': if link.split('/')[2].startswith(host): verb('Internal page', link) internal.add(schema + '://' + link) else: verb('External page', link) external.add(link) elif link[:1] == '/': verb('Internal page', link) internal.add(remove_file(url) + link) else: verb('Internal page', link) usable_url = remove_file(url) if usable_url.endswith('/'): internal.add(usable_url + link) elif link.startswith('/'): internal.add(usable_url + link) else: internal.add(usable_url + '/' + link) if not only_urls: intel_extractor(url, response) js_extractor(response) if args.regex and not supress_regex: regxy(args.regex, response, supress_regex, custom) if api: matches = rentropy.findall(response) for match in matches: if entropy(match) >= 4: verb('Key', match) keys.add(url + ': ' + match)
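# This version of extractor() relies on two precompiled patterns, rhref and
# rentropy, defined elsewhere in the module. The annotated walkthrough below
# inlines equivalent expressions; a sketch of how the precompiled objects would
# be defined under that assumption (not copied from the module itself):
import re

rhref = re.compile(r'<[aA].*(href|HREF)=([^\s>]+)')  # anchor tags with an href value
rentropy = re.compile(r'[\w-]{16,45}')               # candidate API-key-like strings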
def extractor(url): """从响应体中提取具体的信息""" response = requester( url, main_url, delay, cook, headers, timeout, host, ninja, user_agents, failed, processed) # 这里涉及到 core/requester.py 文件中的 requester() 函数 matches = re.findall(r'<[aA].*(href|HREF)=([^\s>]+)', response) for link in matches: # 移除"#"后的所有内容以处理页内锚点 link = link[1].replace('\'', '').replace('"', '').split('#')[0] # 检查这些 URLs 是否应该被爬取 if is_link(link, processed, files): # 这里涉及到 core/utils.py 文件中的 is_link() 函数 if link[:4] == 'http': if link.startswith(main_url): internal.add(link) else: external.add(link) elif link[:2] == '//': if link.split('/')[2].startswith(host): internal.add(schema + link) else: external.add(link) elif link[:1] == '/': internal.add(main_url + link) else: internal.add(main_url + '/' + link) if not only_urls: intel_extractor(response) js_extractor(response) if args.regex and not supress_regex: regxy(args.regex, response, supress_regex, custom) # 这里涉及到 core/utils.py 文件中的 regxy() 函数 if api: matches = re.findall(r'[\w-]{16,45}', response) for match in matches: if entropy(match) >= 4: # 这里涉及到 core/utils.py 文件中的 entropy() 函数 keys.add(url + ': ' + match)
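# Both extractor() versions gate candidate API keys on entropy(match) >= 4,
# i.e. roughly four bits of Shannon entropy per character. A minimal sketch of
# such a helper in core/utils.py (an assumption about its behaviour, not a
# verbatim copy of the project's implementation):
import math

def entropy(string):
    """Per-character Shannon entropy of a string, in bits."""
    if not string:
        return 0.0
    probs = [string.count(ch) / len(string) for ch in set(string)]
    return -sum(p * math.log2(p) for p in probs)

# A random-looking alphanumeric key (digits plus mixed-case letters) typically
# scores above 4 bits per character, while ordinary English words stay well below it.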