Example #1
    def init_final(self):
        if ((self.scheme == 'http' and self.port == 80)
                or (self.scheme == 'https' and self.port == 443)):
            self.base_url = f'{self.scheme}://{self.host}'
        elif self.scheme != 'unknown' and self.host.find(':') >= 0:  # host already carries ':<port>', no suffix needed
            self.base_url = f'{self.scheme}://{self.host}'
        else:
            self.base_url = f'{self.scheme}://{self.host}:{self.port}'

        if not self.has_http:
            logger.log(
                'DEBUG',
                f'NO_HTTP_Scan {self.host}:{self.port}' if self.port
                else f'Scan {self.host}')

        # Scripts
        if self.script:
            for _ in self.user_scripts:
                self.url_list.append((_, '/'))

        if not self.has_http or self.args.scripts_only:  # no HTTP service found, or scanning relies on plugins only
            return

        # TODO: for a URL like http://www.example.com, path is '', so max_depth = 1 + 5 = 6
        self.max_depth = cal_depth(self, self.path)[1] + 5

        self.check_404_existence()

        if self._404_status == -1:
            logger.log('DEBUG', f'HTTP 404 check failed {self.base_url}')
        elif self._404_status != 404:
            logger.log(
                'DEBUG',
                f'{self.base_url} has no HTTP 404, got {self._404_status}')

        _path, _depth = cal_depth(self, self.path)

        # add to the scan queue
        self.enqueue('/')
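
Both examples lean on a cal_depth helper that the snippets never show. Here is a minimal sketch of what it could look like, assuming it takes the scanner instance plus a link and returns a (normalized directory path, depth) pair, with a huge depth used to discard off-host links; the 10000 sentinel and the reliance on self.host are assumptions, not the project's actual code:

# Hypothetical sketch only: the real cal_depth is not shown in these examples.
from urllib.parse import urlparse

def cal_depth(self, url):
    """Return (normalized directory path, depth) for a link on the target."""
    if url.find('#') >= 0:
        url = url[:url.find('#')]          # strip fragment
    if url.find('?') >= 0:
        url = url[:url.find('?')]          # strip query string
    if url.startswith('//'):
        return '', 10000                   # scheme-relative external link: discard
    if url.lower().startswith(('http://', 'https://')):
        parsed = urlparse(url)
        if parsed.netloc != self.host:     # off-host link: discard via a huge depth
            return '', 10000
        url = parsed.path
    if not url.startswith('/'):
        url = '/' + url
    url = url[:url.rfind('/') + 1]         # keep only the directory part
    return url, url.count('/')

For http://www.example.com the parsed path is empty, so the function returns ('/', 1), which matches the max_depth = 1 + 5 = 6 noted in the TODO above.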
Example #2
    def init_final(self):
        if ((self.scheme == 'http' and self.port == 80)
                or (self.scheme == 'https' and self.port == 443)):
            self.base_url = f'{self.scheme}://{self.host}'
        else:
            # self.base_url = f'{self.scheme}://{self.host}:{self.port}'
            self.base_url = f'{self.scheme}://{self.host}'

        # if self.has_http:
        #     logger.log('INFOR', f'Scan { self.base_url}')
        # else:
        #     logger.log('INFOR', 'NO_HTTP_Scan %s:%s' % (self.host, self.port) if self.port else 'Scan %s' % self.host)

        # Scripts
        if self.no_scripts != 1:  # not a duplicate target from an 80/443 redirect, which would need no rescan
            # scripts-only mode enabled globally, or scripts not disabled for this target
            if self.args.scripts_only or not self.no_scripts:
                for _ in self.user_scripts:
                    self.url_list.append((_, '/'))

        if not self.has_http or self.args.scripts_only:  # no HTTP service found, or scanning relies on plugins only
            return

        # TODO: for a URL like http://www.example.com, path is '', so max_depth = 1 + 5 = 6
        self.max_depth = cal_depth(self, self.path)[1] + 5

        self.check_404_existence()

        # if self._404_status == -1:
        #     logger.log('ALERT', 'HTTP 404 check failed <%s:%s>' % (self.host, self.port))
        # elif self._404_status != 404:
        #     logger.log('ALERT', '%s has no HTTP 404.' % self.base_url)

        _path, _depth = cal_depth(self, self.path)

        # add to the scan queue
        self.enqueue('/')
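
check_404_existence is also left out of the snippets. Judging by the -1 failure branch checked in Example #1, a plausible sketch is to request a path that almost certainly does not exist and record how the server answers; the probe length, the _404_md5 attribute name, and the use of http_request here are assumptions:

# Hypothetical sketch of check_404_existence, not the project's actual code.
import hashlib
import random
import string

def check_404_existence(self):
    """Request a surely missing path and remember how the server responds."""
    probe = '/' + ''.join(random.choices(string.ascii_lowercase, k=12))
    try:
        self._404_status, headers, html_doc = self.http_request(probe)
        # fingerprint the body so soft-404 pages can be recognized later
        self._404_md5 = hashlib.md5(html_doc.encode('utf-8')).hexdigest()
    except Exception:
        self._404_status = -1              # the failure value init_final checks for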
Example #3
    def crawl(self, path, do_not_process_links=False):
        try:
            status, headers, html_doc = self.http_request(path)

            if path == '/':
                self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
                # compute the MD5 of the index page; comparing MD5 values later tells whether two pages are identical
                self.index_md5 = hashlib.md5(
                    self.index_html_doc.encode('utf-8')).hexdigest()

            if not do_not_process_links and html_doc:
                soup = BeautifulSoup(html_doc, "html.parser")
                # loop over every <a> tag found in the page
                for link in soup.find_all('a'):
                    url = link.get('href', '').strip()
                    if url.startswith('..'):
                        continue
                    if not url.startswith('/') and url.find('//') < 0:  # relative path
                        url = path + url
                    url, depth = cal_depth(self, url)

                    if depth <= self.max_depth:
                        self.enqueue(url)
                # match the page body against the whitelist rules
                ret = self.find_text(html_doc)
                if ret:
                    if '/' not in self.results:
                        self.results['/'] = []
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    _ = {
                        'status': status,
                        'url': '%s%s' % (self.base_url, path),
                        'title': title,
                        'vul_type': ret[1]
                    }
                    if _ not in self.results['/']:
                        self.results['/'].append(_)

        except Exception as e:
            logger.log('ERROR', f'[crawl Exception] {path} {e}')
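
Finally, enqueue appears in every example but is never defined. A minimal sketch under the assumption that the scanner deduplicates paths with a set before pushing them onto a work queue; urls_processed and url_queue are invented names:

# Hypothetical sketch of enqueue, not the project's actual code.
def enqueue(self, url):
    """Queue a path for scanning exactly once."""
    url = url.strip()
    if url in self.urls_processed:         # already queued or scanned
        return False
    self.urls_processed.add(url)
    self.url_queue.put(url)
    return True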