def init_final(self):
    # Build base_url, dropping the port when it is the scheme's default (80/443)
    # or when the host string already carries an explicit port.
    if (self.scheme == 'http' and self.port == 80) or \
            (self.scheme == 'https' and self.port == 443):
        self.base_url = f'{self.scheme}://{self.host}'
    elif self.scheme != 'unknown' and self.host.find(':') >= 0:
        self.base_url = f'{self.scheme}://{self.host}'
    else:
        self.base_url = f'{self.scheme}://{self.host}:{self.port}'
    if not self.has_http:
        logger.log('DEBUG',
                   f'NO_HTTP_Scan {self.host}:{self.port}' if self.port
                   else f'Scan {self.host}')
    # Scripts
    if self.script:
        for script in self.user_scripts:
            self.url_list.append((script, '/'))
    if not self.has_http or self.args.scripts_only:
        # No HTTP service found, or the scan relies on plugins only
        return
    # TODO: for a URL like http://www.example.com with path '', max_depth = 1 + 5 = 6
    self.max_depth = cal_depth(self, self.path)[1] + 5
    self.check_404_existence()
    if self._404_status == -1:
        logger.log('DEBUG', f'HTTP 404 check failed {self.base_url}')
    elif self._404_status != 404:
        logger.log('DEBUG',
                   f'{self.base_url} has no HTTP 404, got {self._404_status}')
    _path, _depth = cal_depth(self, self.path)
    # Add to the queue
    self.enqueue('/')
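# A minimal standalone sketch of the base_url rule used above; build_base_url
# is a hypothetical helper for illustration, not part of the scanner class.
def build_base_url(scheme, host, port):
    # Default ports (80 for http, 443 for https) are dropped from the URL.
    if (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
        return f'{scheme}://{host}'
    # A host string that already carries an explicit port is kept as-is.
    if scheme != 'unknown' and ':' in host:
        return f'{scheme}://{host}'
    return f'{scheme}://{host}:{port}'

# build_base_url('http', 'www.example.com', 80)    -> 'http://www.example.com'
# build_base_url('https', 'www.example.com', 8443) -> 'https://www.example.com:8443'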
def init_final(self):
    # Build base_url, dropping the port when it is the scheme's default (80/443).
    if (self.scheme == 'http' and self.port == 80) or \
            (self.scheme == 'https' and self.port == 443):
        self.base_url = f'{self.scheme}://{self.host}'
    else:
        # self.base_url = f'{self.scheme}://{self.host}:{self.port}'
        self.base_url = f'{self.scheme}://{self.host}'
    # if self.has_http:
    #     logger.log('INFOR', f'Scan {self.base_url}')
    # else:
    #     logger.log('INFOR', 'NO_HTTP_Scan %s:%s' % (self.host, self.port) if self.port else 'Scan %s' % self.host)
    # Scripts
    if self.no_scripts != 1:
        # Not a duplicate target reached via an 80/443 redirect, so it does not need rescanning.
        # Run scripts when a plugin-only scan is enabled globally,
        # or scripts are not disabled for the current target.
        if self.args.scripts_only or not self.no_scripts:
            for script in self.user_scripts:
                self.url_list.append((script, '/'))
    if not self.has_http or self.args.scripts_only:
        # No HTTP service found, or the scan relies on plugins only
        return
    # TODO: for a URL like http://www.example.com with path '', max_depth = 1 + 5 = 6
    self.max_depth = cal_depth(self, self.path)[1] + 5
    self.check_404_existence()
    # if self._404_status == -1:
    #     logger.log('ALERT', 'HTTP 404 check failed <%s:%s>' % (self.host, self.port))
    # elif self._404_status != 404:
    #     logger.log('ALERT', '%s has no HTTP 404.' % self.base_url)
    _path, _depth = cal_depth(self, self.path)
    # Add to the queue
    self.enqueue('/')
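# check_404_existence() is not shown here. The hypothetical sketch below only
# illustrates the usual idea behind such a baseline check (request a path that
# should not exist and record how the server answers); it assumes a
# requests-based client and is not the project's implementation.
import uuid

import requests


def probe_404(base_url, timeout=10):
    # A random path is very unlikely to exist on the target.
    path = f'/{uuid.uuid4().hex}.html'
    try:
        resp = requests.get(base_url + path, timeout=timeout, allow_redirects=False)
        return resp.status_code, len(resp.content)
    except requests.RequestException:
        # Mirrors the _404_status == -1 "check failed" case above.
        return -1, 0

# status, size = probe_404('http://www.example.com')
# status == -1  -> the check itself failed
# status != 404 -> the server returns no real HTTP 404 for missing pages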
def crawl(self, path, do_not_process_links=False):
    try:
        status, headers, html_doc = self.http_request(path)
        if path == '/':
            self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            # Compute the MD5 of the index page; later pages are compared
            # against this digest to decide whether they are identical.
            self.index_md5 = hashlib.md5(self.index_html_doc.encode('utf-8')).hexdigest()
        if not do_not_process_links and html_doc:
            soup = BeautifulSoup(html_doc, "html.parser")
            # Walk every <a> tag and queue in-scope links
            for link in soup.find_all('a'):
                url = link.get('href', '').strip()
                if url.startswith('..'):
                    continue
                if not url.startswith('/') and url.find('//') < 0:
                    # Relative path
                    url = path + url
                url, depth = cal_depth(self, url)
                if depth <= self.max_depth:
                    self.enqueue(url)
            # Match the page body against the whitelist rules
            ret = self.find_text(html_doc)
            if ret:
                if '/' not in self.results:
                    self.results['/'] = []
                m = re.search('<title>(.*?)</title>', html_doc)
                title = m.group(1) if m else ''
                _ = {
                    'status': status,
                    'url': f'{self.base_url}{path}',
                    'title': title,
                    'vul_type': ret[1]
                }
                if _ not in self.results['/']:
                    self.results['/'].append(_)
    except Exception as e:
        logger.log('ERROR', f'[crawl Exception] {path} {str(e)}')
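# Standalone sketch of the two techniques crawl() combines: MD5 page
# fingerprinting and <a href> extraction with BeautifulSoup. It works on
# inline HTML instead of a live HTTP response and skips the project's
# cal_depth() depth scoping; fingerprint_and_links is a hypothetical helper.
import hashlib

from bs4 import BeautifulSoup


def fingerprint_and_links(html_doc, path='/'):
    # Pages sharing the same MD5 digest are treated as identical duplicates.
    page_md5 = hashlib.md5(html_doc.encode('utf-8')).hexdigest()
    links = []
    soup = BeautifulSoup(html_doc, 'html.parser')
    for link in soup.find_all('a'):
        url = link.get('href', '').strip()
        if url.startswith('..'):                              # parent-directory links are skipped
            continue
        if not url.startswith('/') and url.find('//') < 0:
            url = path + url                                  # resolve relative to the current path
        links.append(url)
    return page_md5, links

# fingerprint_and_links('<a href="/admin/">x</a><a href="login.php">y</a>')
# -> ('<md5 hex digest>', ['/admin/', '/login.php'])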