def extractor(url):
    """Extract and classify links (plus intel/JS/keys) from a page.

    Fetches `url`, optionally mirrors the response to disk, then sorts every
    href found by `rhref` into the module-level `internal`/`external` sets.
    When `only_urls` is false it also runs the intel and JS extractors; when
    `args.regex`/`api` are set it runs the custom regex and entropy-based
    key detection respectively.

    Side effects only — mutates `internal`, `external`, `keys`, `failed`,
    `processed`; returns None.
    """
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         proxies, user_agents, failed, processed)
    if clone:
        # Save a local copy of the page when cloning is enabled
        mirror(url, response)
    matches = rhref.findall(response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Checks if the URLs should be crawled
        if is_link(link, processed, files):
            if link[:4] == 'http':
                # Absolute URL: internal only if it lives under main_url
                if link.startswith(main_url):
                    verb('Internal page', link)
                    internal.add(link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:2] == '//':
                # Protocol-relative URL: resolve against the current scheme.
                # Per RFC 3986 a network-path reference is completed with
                # "<scheme>:" + link (NOT "<scheme>://", which would yield
                # a malformed "https:////host/..." URL).
                if link.split('/')[2].startswith(host):
                    verb('Internal page', link)
                    internal.add(schema + ':' + link)
                else:
                    verb('External page', link)
                    external.add(link)
            elif link[:1] == '/':
                # Root/relative path: join onto the current page's directory
                verb('Internal page', link)
                internal.add(remove_file(url) + link)
            else:
                # Bare relative path (no scheme, no leading slash)
                verb('Internal page', link)
                usable_url = remove_file(url)
                # NOTE: the original also had an unreachable
                # `elif link.startswith('/')` branch here — impossible,
                # since `link[:1] == '/'` was handled above; removed.
                if usable_url.endswith('/'):
                    internal.add(usable_url + link)
                else:
                    internal.add(usable_url + '/' + link)
    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        # High-entropy tokens (>= 4 bits/char) are treated as potential keys
        matches = rentropy.findall(response)
        for match in matches:
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
def extractor(url):
    """Extract details from the response body.

    Older variant of the extractor: fetches `url` via requester() (see
    core/requester.py), classifies every <a href=...> link into the
    module-level `internal`/`external` sets, and optionally runs the intel,
    JS, custom-regex (core/utils.py regxy()) and entropy-based key scans.

    Side effects only — mutates `internal`, `external`, `keys`, `failed`,
    `processed`; returns None.
    """
    response = requester(
        url, main_url, delay, cook, headers, timeout, host, ninja,
        user_agents, failed, processed)
    matches = re.findall(r'<[aA].*(href|HREF)=([^\s>]+)', response)
    for link in matches:
        # Remove everything after a "#" to deal with in-page anchors
        link = link[1].replace('\'', '').replace('"', '').split('#')[0]
        # Check whether this URL should be crawled (is_link in core/utils.py)
        if is_link(link, processed, files):
            if link[:4] == 'http':
                # Absolute URL: internal only if it lives under main_url
                if link.startswith(main_url):
                    internal.add(link)
                else:
                    external.add(link)
            elif link[:2] == '//':
                # Protocol-relative URL: resolve against the current scheme.
                # Bug fix: the original used `schema + link`, producing a
                # separator-less "http//host/..." — per RFC 3986 a
                # network-path reference is completed with "<scheme>:".
                if link.split('/')[2].startswith(host):
                    internal.add(schema + ':' + link)
                else:
                    external.add(link)
            elif link[:1] == '/':
                # Root-relative path: join onto the site root
                internal.add(main_url + link)
            else:
                # Bare relative path (no scheme, no leading slash)
                internal.add(main_url + '/' + link)
    if not only_urls:
        intel_extractor(response)
        js_extractor(response)
    if args.regex and not supress_regex:
        # Custom user-supplied regex scan (regxy() in core/utils.py)
        regxy(args.regex, response, supress_regex, custom)
    if api:
        # Tokens of 16-45 word chars with entropy >= 4 are potential API keys
        # (entropy() in core/utils.py)
        matches = re.findall(r'[\w-]{16,45}', response)
        for match in matches:
            if entropy(match) >= 4:
                keys.add(url + ': ' + match)