import glob
import importlib
import logging
import os
import re
import socket
import struct
import threading
import time
import traceback
import Queue

import urllib3
from bs4 import BeautifulSoup

# Project-local helpers referenced below (config, HTTPConnPool, HTTPSConnPool,
# parse_url, cal_depth, get_domain_sub, is_port_open, decode_response_text,
# print_msg, HEADERS, headers, headers_without_range, check_server, check_lang,
# check_lang_url, check_rewrite) come from the scanner's own lib/ package;
# the exact import paths depend on the project layout.


class Scanner(object):
    def __init__(self, q_results, timeout=600, args=None):
        self.q_results = q_results
        self.args = args
        self.start_time = time.time()
        self.time_out = timeout
        self.links_limit = 100  # max number of folders to scan
        self._init_rules()
        self._init_scripts()
        self.url_queue = Queue.Queue()  # all URLs to scan
        self.urls_processed = set()     # processed URLs
        self.urls_enqueued = set()      # URLs that entered the queue
        self.urls_crawled = set()
        self.lock = threading.Lock()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.scheme, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = ''
        self.base_url = ''
        self.max_depth = 0
        self.len_404_doc = 0
        self.has_http = None
        self.ports_open = None
        self.ports_closed = None
        self.no_scripts = None
        self.status_502_count = 0

    def print_msg(self, msg):
        self.q_results.put(msg)

    def reset_scanner(self):
        self.start_time = time.time()
        self.url_queue.queue.clear()
        self.urls_processed.clear()
        self.urls_enqueued.clear()
        self.urls_crawled.clear()
        self.results.clear()
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.scheme, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = ''
        self.base_url = ''
        self.status_502_count = 0

    # scan from a given URL
    def init_from_url(self, target):
        self.reset_scanner()
        self.scheme = target['scheme']
        self.host = target['host']
        self.port = target['port']
        self.path = target['path']
        self.has_http = target['has_http']
        self.ports_open = target['ports_open']
        self.ports_closed = target['ports_closed']
        self.no_scripts = target['no_scripts'] if 'no_scripts' in target else 0
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()
        return True

    def init_from_log_file(self, log_file):
        self.reset_scanner()
        self.log_file = log_file
        self.scheme, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            if self.host.find(':') > 0:
                _ret = self.host.split(':')
                self.host = _ret[0]
                self.port = _ret[1]
            elif self.scheme == 'https':
                self.port = 443
            elif self.scheme == 'http':
                self.port = 80
            else:
                self.port = None
            if not is_port_open(self.host, self.port):
                self.print_msg('[Port Not Open] %s:%s' % (self.host, self.port))
                return False
            self.has_http = True
            self.no_scripts = 1
            self.init_final()
            self.load_all_urls_from_log_file()
            return True
        else:
            host = os.path.basename(log_file).replace('.log', '')
            try:
                socket.gethostbyname(host)
                self.init_from_url(host)  # FIXME: init_from_url expects a target dict, not a bare host string
                return True
            except Exception as e:
                self.print_msg('[ERROR] Invalid host from log name: %s' % host)
                return False

    def init_final(self):
        try:
            if self.conn_pool:
                self.conn_pool.close()
        except Exception as e:
            pass

        if self.scheme == 'http' and self.port == 80 or self.scheme == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.scheme, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.scheme, self.host, self.port)

        if self.has_http:
            self.print_msg('Scan %s' % self.base_url)
        else:
            self.print_msg('Scan %s:%s' % (self.host, self.port) if self.port else 'Scan %s' % self.host)

        if self.has_http:
            if self.scheme == 'https':
                self.conn_pool = HTTPSConnPool(self.host, port=self.port, maxsize=self.args.t,
                                               headers=config.default_headers)
            else:
                self.conn_pool = HTTPConnPool(self.host, port=self.port, maxsize=self.args.t,
                                              headers=config.default_headers)
            if self.args.require_index_doc:
                self.crawl('/', do_not_process_links=True)

        if self.no_scripts != 1:  # not a duplicated target produced by an 80/443 redirect; no need to scan twice
            # scripts-only mode requested, or scripts are enabled for this target
            if self.args.scripts_only or not self.no_scripts:
                for _ in self.user_scripts:
                    self.url_queue.put((_, '/'))

        if not self.has_http or self.args.scripts_only:  # no HTTP service found, or scripts-only scan requested
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
        else:
            self.check_404_existence()
        if self._404_status == -1:
            self.print_msg('[Warning] HTTP 404 check failed <%s:%s>' % (self.host, self.port))
        elif self._404_status != 404:
            self.print_msg('[Warning] %s has no HTTP 404.' % self.base_url)
        _path, _depth = cal_depth(self, self.path)
        self.enqueue('/')
        if _path != '/' and not self.log_file:
            self.enqueue(_path)

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for _line in infile.xreadlines():
                _line = _line.strip()
                if _line and len(_line.split()) >= 3:
                    url = _line.split()[1]
                    break
        return parse_url(url)

    # load URLs from rules/*.txt
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()
        self.rules_set_root_only = set()

        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile(r'{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')

        _files = self.args.rule_files if self.args.rule_files else glob.glob('rules/*.txt')

        for rule_file in _files:
            with open(rule_file, 'r') as infile:
                vul_type = os.path.basename(rule_file)[:-4]
                for url in infile.xreadlines():
                    url = url.strip()
                    if url.startswith('/'):
                        _ = p_tag.search(url)
                        tag = _.group(1) if _ else ''
                        _ = p_status.search(url)
                        status = int(_.group(1)) if _ else 0
                        _ = p_content_type.search(url)
                        content_type = _.group(1) if _ else ''
                        _ = p_content_type_no.search(url)
                        content_type_no = _.group(1) if _ else ''
                        root_only = True if url.find('{root_only}') >= 0 else False
                        rule = (url.split()[0], tag, status, content_type, content_type_no, root_only, vul_type)
                        if root_only:
                            if rule not in self.rules_set_root_only:
                                self.rules_set_root_only.add(rule)
                            else:
                                self.print_msg('Duplicated root only rule: %s' % str(rule))
                        else:
                            if rule not in self.rules_set:
                                self.rules_set.add(rule)
                            else:
                                self.print_msg('Duplicated rule: %s' % str(rule))

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        file_path = 'rules/white.list'
        if not os.path.exists(file_path):
            self.print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_find.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

        file_path = 'rules/black.list'
        if not os.path.exists(file_path):
            self.print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_exclude.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_exclude.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:  # scripts disabled globally, nothing to import
            return
        for _script in glob.glob('scripts/*.py'):
            script_name_origin = os.path.basename(_script)
            script_name = script_name_origin.replace('.py', '')
            if self.args.script:  # import only the specified scripts
                if script_name not in self.args.script and script_name_origin not in self.args.script:
                    continue
            if script_name.startswith('_'):
                continue
            try:
                self.user_scripts.append(importlib.import_module('scripts.%s' % script_name))
            except Exception as e:
                self.print_msg('[ERROR] Fail to load script %s' % script_name)

    def http_request(self, url, headers=config.default_headers, timeout=20):
        try:
            if not url:
                url = '/'
            if not self.conn_pool:
                return -1, {}, ''
            if self.args.debug:
                self.print_msg('--> %s' % self.base_url + url)
            resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                          headers=headers, assert_same_host=False,
                                          redirect=False, timeout=timeout, retries=0)
            if resp.headers.get('content-type', '').find('text') >= 0 \
                    or resp.headers.get('content-type', '').find('html') >= 0 \
                    or int(resp.headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''

            if resp.status == 502:  # more than 3 responses with status 502: drop this site
                self.status_502_count += 1
                if self.status_502_count > 3:
                    self.url_queue.queue.clear()
                    try:
                        if self.conn_pool:
                            self.conn_pool.close()
                    except Exception as e:
                        pass
                    self.conn_pool = None
                    # self.print_msg('Website 502: %s' % self.base_url)

            return resp.status, resp.headers, html_doc
        except urllib3.exceptions.MaxRetryError as e:
            return -1, {}, ''
        except TypeError as e:
            return -1, {}, ''
        except Exception as e:
            self.print_msg(str(e))
            return -1, {}, ''

    # check existence of status 404
    def check_404_existence(self):
        try:
            try:
                self._404_status, _, html_doc = self.http_request('/BBScan-404-existence-check')
            except Exception as e:
                self.print_msg('[Warning] HTTP 404 check failed: %s' % self.base_url)
                self._404_status, _, html_doc = -1, {}, ''
            if self._404_status != 404:
                self.len_404_doc = len(html_doc)
        except Exception as e:
            self.print_msg('[Check_404] Exception %s %s' % (self.base_url, str(e)))

    #
    def enqueue(self, url):
        try:
            url = str(url)
        except Exception as e:
            return False
        try:
            url_pattern = re.sub(r'\d+', '{num}', url)
            if url_pattern in self.urls_processed or len(self.urls_processed) >= self.links_limit:
                return False
            self.urls_processed.add(url_pattern)
            # self.print_msg('Entered Queue: %s' % url)
            if not self.args.no_crawl:  # crawl enabled
                self.crawl(url)
            if self._404_status != -1:  # valid web service
                rule_set_to_process = [self.rules_set, self.rules_set_root_only] if url == '/' else [self.rules_set]
                for rule_set in rule_set_to_process:
                    for _ in rule_set:
                        if _[5] and url != '/':  # root only
                            continue
                        try:
                            full_url = url.rstrip('/') + _[0]
                        except Exception as e:
                            continue
                        if full_url in self.urls_enqueued:
                            continue
                        url_description = {'prefix': url.rstrip('/'), 'full_url': full_url}
                        item = (url_description, _[1], _[2], _[3], _[4], _[5], _[6])
                        self.url_queue.put(item)
                        self.urls_enqueued.add(full_url)
            if self.args.full_scan and url.count('/') >= 2:
                self.enqueue('/'.join(url.split('/')[:-2]) + '/')  # enqueue the parent folder
            if url != '/' and not self.no_scripts:
                for script in self.user_scripts:
                    self.url_queue.put((script, url))
            return True
        except Exception as e:
            self.print_msg('[_enqueue.exception] %s' % str(e))
            return False

    #
    def crawl(self, path, do_not_process_links=False):
        try:
            # increase allowed response body size to 200 KB
            request_headers = dict(config.default_headers, Range='bytes=0-204800')
            status, headers, html_doc = self.http_request(path, headers=request_headers)
            if path == '/':
                self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            if not self.args.no_crawl and not do_not_process_links and html_doc:
                soup = BeautifulSoup(html_doc, "html.parser")
                for link in soup.find_all('a'):
                    url = link.get('href', '').strip()
                    if url.startswith('..'):
                        continue
                    if not url.startswith('/') and url.find('//') < 0:  # relative path
                        url = path + url
                    url, depth = cal_depth(self, url)
                    # print url, depth
                    if depth <= self.max_depth:
                        self.enqueue(url)
            #
            ret = self.find_text(html_doc)
            if ret:
                if '/' not in self.results:
                    self.results['/'] = []
                m = re.search('<title>(.*?)</title>', html_doc)
                title = m.group(1) if m else ''
                _ = {'status': status, 'url': '%s%s' % (self.base_url, path), 'title': title, 'vul_type': ret[1]}
                if _ not in self.results['/']:
                    self.results['/'].append(_)
        except Exception as e:
            self.print_msg('[crawl Exception] %s %s' % (path, str(e)))

    #
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as infile:
                for _line in infile.xreadlines():
                    _ = _line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0 or _[2].find('^^^403') > 0
                                        or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self.enqueue(url)
        except Exception as e:
            self.print_msg('[load_all_urls_from_log_file] %s' % str(e))

    #
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True, 'Found [%s]' % _text
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True, 'Found Regex [%s]' % _regex.pattern
        return False

    #
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

    #
    def scan_worker(self):
        while True:
            if time.time() - self.start_time > self.time_out:
                self.url_queue.queue.clear()
                self.print_msg('[ERROR] Timed out task: %s' % self.base_url)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                return
            try:
                if len(item) == 2:  # script scan
                    check_func = getattr(item[0], 'do_check')
                    # self.print_msg('Begin %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    check_func(self, item[1])
                    # self.print_msg('End %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    continue
                else:
                    url_description, tag, status_to_match, content_type, content_type_no, root_only, vul_type = item
                    prefix = url_description['prefix']
                    url = url_description['full_url']
                    if url.find('{sub}') >= 0:
                        if not self.domain_sub:
                            continue
                        url = url.replace('{sub}', self.domain_sub)
            except Exception as e:
                self.print_msg('[scan_worker.1] %s' % str(e))
                self.print_msg(traceback.format_exc())
                continue
            if not item or not url:
                break

            try:
                status, headers, html_doc = self.http_request(url)
                cur_content_type = headers.get('content-type', '')
                cur_content_length = headers.get('content-length', len(html_doc))

                if self.find_exclude_text(html_doc):  # excluded text found
                    continue

                if 0 <= int(cur_content_length) <= 10:  # text too short
                    continue

                if cur_content_type.find('image/') >= 0:  # exclude images
                    continue

                if content_type != 'application/json' and cur_content_type.find('application/json') >= 0 \
                        and not url.endswith('.json'):  # invalid JSON
                    continue

                if content_type and cur_content_type.find(content_type) < 0 \
                        or content_type_no and cur_content_type.find(content_type_no) >= 0:
                    continue  # content type mismatch

                if tag and html_doc.find(tag) < 0:
                    continue  # tag mismatch

                if self.find_text(html_doc):
                    valid_item = True
                else:
                    # status code check
                    if status_to_match == 206 and status != 206:
                        continue
                    if status_to_match in (200, 206) and status in (200, 206):
                        valid_item = True
                    elif status_to_match and status != status_to_match:
                        continue
                    elif status in (403, 404) and status != status_to_match:
                        continue
                    else:
                        valid_item = True

                    if status == self._404_status and url != '/':
                        len_doc = len(html_doc)
                        len_sum = self.len_404_doc + len_doc
                        if len_sum == 0 or (0.4 <= float(len_doc) / len_sum <= 0.6):
                            continue

                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    self.lock.acquire()
                    # self.print_msg('[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host + url))
                    if prefix not in self.results:
                        self.results[prefix] = []
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, url),
                         'title': title, 'vul_type': vul_type}
                    if _ not in self.results[prefix]:
                        self.results[prefix].append(_)
                    self.lock.release()
            except Exception as e:
                self.print_msg('[scan_worker.2][%s] %s' % (url, str(e)))
                traceback.print_exc()

    #
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self.scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()

            for key in self.results.keys():
                # over 5 URLs found under this folder: keep the first one only
                if len(self.results[key]) > 5:
                    self.results[key] = self.results[key][:1]
            # str.lstrip()/rstrip() treat multi-char arguments as character sets,
            # so strip the exact 'unknown://' prefix and ':None' suffix
            base_url = self.base_url
            if base_url.startswith('unknown://'):
                base_url = base_url[len('unknown://'):]
            if base_url.endswith(':None'):
                base_url = base_url[:-len(':None')]
            return base_url, self.results
        except Exception as e:
            self.print_msg('[scan exception] %s' % str(e))
        self.conn_pool.close()
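
# ---------------------------------------------------------------------------
# Illustrative driver for the Scanner above -- a minimal sketch, not part of
# the original scanner code. It assumes the rules/ and scripts/ directories
# exist next to this file, and fakes the command-line options the class reads
# (args.t, args.no_crawl, ...) with an argparse.Namespace; demo_scan and its
# defaults are assumptions. The target dict carries exactly the keys that
# init_from_url() reads.
# ---------------------------------------------------------------------------
def demo_scan(host, port=80):
    import argparse
    q_results = Queue.Queue()
    args = argparse.Namespace(
        t=10, rule_files=None, require_index_doc=False, debug=False,
        no_crawl=False, no_check404=False, no_scripts=True,
        scripts_only=False, script=None, full_scan=False)
    scanner = Scanner(q_results, timeout=600, args=args)
    target = {'scheme': 'http', 'host': host, 'port': port, 'path': '/',
              'has_http': True, 'ports_open': [port], 'ports_closed': [],
              'no_scripts': 0}
    if scanner.init_from_url(target):
        base_url, results = scanner.scan(threads=args.t)
        while not q_results.empty():  # drain progress / warning messages
            print q_results.get()
        return base_url, results
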
class InfoDisScanner(object):
    def __init__(self, timeout=600, args=None):
        self.args = args
        self.START_TIME = time.time()
        self.TIME_OUT = timeout
        self.LINKS_LIMIT = 100  # max number of folders to scan
        self.full_scan = args.full_scan
        self._init_rules()
        self._init_scripts()
        self.url_queue = Queue.Queue()  # all URLs to scan
        self.urls_processed = set()     # processed URLs
        self.urls_enqueued = set()      # URLs that entered the queue
        self.lock = threading.Lock()

    # reset scanner
    def init_reset(self):
        self.START_TIME = time.time()
        self.url_queue.queue.clear()
        self.urls_processed = set()
        self.urls_enqueued = set()
        self.index_a_urls = set()
        self.scripts_enqueued = set()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.rewrite = False
        self.server = ''
        self.lang = ''

    # scan from a given URL
    def init_from_url(self, url):
        self.init_reset()
        if not url.find('://') > 0:
            self.url = 'http://' + url
        else:
            self.url = url
        self.schema, self.host, self.path = parse_url(url)
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()

    def init_from_log_file(self, log_file):
        self.init_reset()
        self.log_file = log_file
        self.schema, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            self.load_all_urls_from_log_file()
            self.init_final()
        else:
            self.init_from_url(os.path.basename(log_file).replace('.log', ''))

    #
    def init_final(self):
        try:
            self.conn_pool.close()
        except:
            pass
        default_port = 443 if self.schema.lower() == 'https' else 80
        self.host, self.port = self.host.split(':') if self.host.find(':') > 0 else (self.host, default_port)
        self.port = int(self.port)
        if self.schema == 'http' and self.port == 80 or self.schema == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.schema, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.schema, self.host, self.port)

        port_open = self.is_port_open()
        if port_open:
            if self.schema == 'https':
                self.conn_pool = HTTPSConnPool(self.host, port=self.port, maxsize=self.args.t * 2,
                                               headers=headers)
            else:
                self.conn_pool = HTTPConnPool(self.host, port=self.port, maxsize=self.args.t * 2,
                                              headers=headers)
        if not port_open:
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
            self.has_404 = True
        else:
            self.check_404()  # check existence of HTTP 404
        if not self.has_404:
            print_msg('[Warning] %s has no HTTP 404.' % self.host)
        self.request_index(self.path)
        self.gather_info()
        _path, _depth = cal_depth(self, self.path)
        self._enqueue('/')
        self._enqueue(_path)
        if not self.args.no_crawl and not self.log_file:
            self.crawl_index()

    def is_port_open(self):
        s = None
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(5.0)
            if s.connect_ex((self.host, int(self.port))) == 0:
                self.lock.acquire()
                print_msg('Scan web: %s' % self.base_url)
                self.lock.release()
                return True
            else:
                print_msg('[Warning] Fail to connect to %s:%s' % (self.host, self.port))
                return False
        except Exception as e:
            return False
        finally:
            if s:
                s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
                s.close()

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for line in infile.xreadlines():
                line = line.strip()
                if line and len(line.split()) >= 3:
                    url = line.split()[1]
                    break
        return parse_url(url)

    def _load_rules(self, rule_file):
        rules = []
        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile(r'{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')
        p_lang = re.compile('{lang="(.*?)"}')

        with open(rule_file, 'r') as infile:
            for url in infile.xreadlines():
                url = url.strip()
                if url.startswith('/'):
                    _ = p_tag.search(url)
                    tag = _.group(1) if _ else ''
                    _ = p_status.search(url)
                    status = int(_.group(1)) if _ else 0
                    _ = p_content_type.search(url)
                    content_type = _.group(1) if _ else ''
                    _ = p_content_type_no.search(url)
                    content_type_no = _.group(1) if _ else ''
                    _ = p_lang.search(url)
                    lang = _.group(1) if _ else ''
                    root_only = True if url.find('{root_only}') >= 0 else False
                    rewrite = True if url.find('{rewrite}') >= 0 else False
                    rule = (url.split()[0], tag, status, content_type, content_type_no,
                            root_only, lang, rewrite)
                    rules.append(rule)
        return rules

    # load URLs from rules/*.txt
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()

        for rule_file in glob.glob('rules/*.txt'):
            rules = self._load_rules(rule_file)
            for rule in rules:
                if rule not in self.rules_set:
                    self.rules_set.add(rule)
                else:
                    print 'Duplicated rule:', rule

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        _file_path = 'rules/white.list'
        if not os.path.exists(_file_path):
            return
        for line in open(_file_path):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            _m = re_text.search(line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(line)
                if _m:
                    self.regex_to_find.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

        _file_path = 'rules/black.list'
        if not os.path.exists(_file_path):
            return
        for line in open(_file_path):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            _m = re_text.search(line)
            if _m:
                self.text_to_exclude.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(line)
                if _m:
                    self.regex_to_exclude.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

    #
    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:  # user scripts scan disabled
            return
        for _script in glob.glob('scripts/*.py'):
            script_name = os.path.basename(_script).replace('.py', '')
            if script_name.startswith('_'):
                continue
            try:
                _ = importlib.import_module('scripts.%s' % script_name)
                self.user_scripts.append(_)
            except Exception as e:
                print e

    #
    def _http_request(self, url, timeout=30):
        try:
            if not url:
                url = '/'
            # print 'request', self.base_url + url
            resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                          redirect=False, timeout=timeout, retries=0)
            resp_headers = resp.headers
            status = resp.status
            if resp_headers.get('content-type', '').find('text') >= 0 \
                    or resp_headers.get('content-type', '').find('html') >= 0 \
                    or int(resp_headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''
            return status, resp_headers, html_doc
        except Exception as e:
            return -1, {}, ''

    #
    def check_404(self):
        try:
            try:
                self._404_status, headers, html_doc = self._http_request('/BBScan-404-existence-check')
            except:
                self._404_status, headers, html_doc = -1, {}, ''
            self.has_404 = (self._404_status == 404)
            if not self.has_404:
                self.len_404_doc = len(html_doc)
            return self.has_404
        except Exception as e:
            logging.error('[Check_404] Exception %s' % str(e))

    def _enqueue_request(self, prefix, full_url, rule):
        if self.args.scripts_only:
            return
        if full_url in self.urls_enqueued:
            return
        url_description = {'prefix': prefix, 'full_url': full_url}
        item = (url_description, rule[1], rule[2], rule[3], rule[4], rule[5], rule[6], rule[7])
        self.url_queue.put(item)
        self.urls_enqueued.add(full_url)

    def _enqueue_script(self, module, prefix):
        if self.args.no_scripts:
            return
        if not prefix:
            prefix = '/'
        if (module.__name__, prefix) in self.scripts_enqueued:
            return
        self.url_queue.put((module, prefix))
        self.scripts_enqueued.add((module.__name__, prefix))

    #
    def _enqueue(self, url):
        try:
            url = str(url)
            url_pattern = re.sub(r'\d+', '{num}', url)
            if url_pattern in self.urls_processed or len(self.urls_processed) >= self.LINKS_LIMIT:
                return False
            else:
                self.urls_processed.add(url_pattern)
            # print 'Entered Queue:', url
            for _ in self.rules_set:
                # rewrite & lang check
                if self.rewrite and not _[7]:
                    continue
                elif self.lang and self.lang != 'unknown':
                    if _[6] and self.lang != _[6]:
                        continue
                # root_only
                if _[5] and url != '/':
                    continue
                full_url = url.rstrip('/') + _[0]
                self._enqueue_request(url.rstrip('/'), full_url, _)
            if self.full_scan and url.count('/') >= 2:
                self._enqueue('/'.join(url.split('/')[:-2]) + '/')  # enqueue the parent folder
            for _ in self.user_scripts:
                self._enqueue_script(_, url.rstrip('/'))
            return True
        except Exception as e:
            print '[_enqueue.exception] %s' % str(e)
            return False

    #
    def request_index(self, path):
        try:
            status, headers, html_doc = self._http_request(path)
            if status != 200:
                try:
                    html_doc = self.conn_pool.urlopen('GET', self.url,
                                                      headers=headers_without_range, retries=1).data
                    html_doc = decode_response_text(html_doc)
                except Exception as e:
                    pass
            # save index content
            self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            soup = BeautifulSoup(self.index_html_doc, "html.parser")
            for link in soup.find_all('a'):
                url = link.get('href', '').strip()
                self.index_a_urls.add(url)
        except Exception as e:
            logging.error('[request_index Exception] %s' % str(e))
            traceback.print_exc()

    def gather_info(self):
        if not self.server:
            self.server = check_server(self.index_headers.get('server', ''))
        if not self.lang:
            self.lang, self.framework = check_lang(self.base_url, self.index_headers)
        if self.lang == 'unknown':
            for url in self.index_a_urls:
                url, depth = cal_depth(self, url)
                lang = check_lang_url(url)
                if lang != 'unknown':
                    self.lang = lang
                    break
        self.rewrite = check_rewrite(self.server, self.lang)

    def crawl_index(self):
        for url in self.index_a_urls:
            url, depth = cal_depth(self, url)
            if depth <= self.max_depth:
                self._enqueue(url)
        if self.find_text(self.index_html_doc):
            self.results['/'] = []
            m = re.search('<title>(.*?)</title>', self.index_html_doc)
            title = m.group(1) if m else ''
            _ = {'status': self.index_status, 'url': '%s%s' % (self.base_url, self.path), 'title': title}
            if _ not in self.results['/']:
                self.results['/'].append(_)

    #
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as infile:
                for line in infile.xreadlines():
                    _ = line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0 or _[2].find('^^^403') > 0
                                        or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self._enqueue(url)
        except Exception as e:
            logging.error('[load_all_urls_from_log_file Exception] %s' % str(e))
            traceback.print_exc()

    #
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True
        return False

    #
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

    def apply_rules(self, item):
        url_description, tag, status_to_match, content_type, content_type_no, root_only, lang, rewrite = item
        prefix = url_description['prefix']
        url = url_description['full_url']
        # print url
        url = url.replace('{sub}', self.domain_sub)
        if url.find('{hostname_or_folder}') >= 0:
            _url = url[:url.find('{hostname_or_folder}')]
            folders = _url.split('/')
            for _folder in reversed(folders):
                if _folder not in ['', '.', '..']:
                    url = url.replace('{hostname_or_folder}', _folder)
                    break
        url = url.replace('{hostname_or_folder}', self.domain_sub)
        url = url.replace('{hostname}', self.domain_sub)

        if not item or not url:
            return False, None, None, None
        # print '[%s]' % url.strip()
        try:
            status, headers, html_doc = self._http_request(url)
            cur_content_type = headers.get('content-type', '')

            if self.find_exclude_text(html_doc):  # excluded text found
                return False, status, headers, html_doc

            if ('html' in cur_content_type or 'text' in cur_content_type) and \
                    0 <= len(html_doc) <= 10:  # text too short
                return False, status, headers, html_doc

            if cur_content_type.find('image/') >= 0:  # exclude images
                return False, status, headers, html_doc

            valid_item = False
            if self.find_text(html_doc):
                valid_item = True
            else:
                if cur_content_type.find('application/json') >= 0 and not url.endswith('.json'):  # invalid JSON
                    return False, status, headers, html_doc
                if status != status_to_match and status != 206:
                    # status in [301, 302, 400, 404, 501, 502, 503, 505]
                    return False, status, headers, html_doc
                if tag:
                    if html_doc.find(tag) >= 0:
                        valid_item = True
                    else:
                        return False, status, headers, html_doc  # tag mismatch
                if (content_type and cur_content_type.find(content_type) < 0) \
                        or (content_type_no and cur_content_type.find(content_type_no) >= 0):
                    return False, status, headers, html_doc  # type mismatch
                if self.has_404 or status != self._404_status:
                    if status_to_match in (200, 206) and status == 206:
                        valid_item = True
                    elif status_to_match and status != status_to_match:  # status mismatch
                        return False, status, headers, html_doc
                    elif status_to_match != 403 and status == 403:
                        return False, status, headers, html_doc
                    else:
                        valid_item = True
                if not self.has_404 and status in (200, 206) and url != '/' and not tag:
                    _len = len(html_doc)
                    _min = min(_len, self.len_404_doc)
                    if _min == 0:
                        _min = 10.0
                    if float(_len - self.len_404_doc) / _min > 0.3:
                        valid_item = True
                if status == 206 and tag == '' and cur_content_type.find('text') < 0 \
                        and cur_content_type.find('html') < 0:
                    valid_item = True
            return valid_item, status, headers, html_doc
        except Exception as e:
            logging.error('[_scan_worker.Exception][3][%s] %s' % (url, str(e)))
            traceback.print_exc()
            return False, None, None, ''  # keep the 4-tuple contract on errors

    #
    def _scan_worker(self):
        while self.url_queue.qsize() > 0:
            if time.time() - self.START_TIME > self.TIME_OUT:
                self.url_queue.queue.clear()
                print_msg('[ERROR] Timed out task: %s' % self.host)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                print e
                return
            try:
                if len(item) == 2:  # user script
                    check_func = getattr(item[0], 'do_check')
                    check_func(self, item[1])
                    continue
            except Exception as e:
                logging.error('[_scan_worker Exception] [1] %s' % str(e))
                traceback.print_exc()
                continue
            url_description, tag, status_to_match, content_type, content_type_no, root_only, lang, rewrite = item
            prefix = url_description['prefix']
            url = url_description['full_url']
            valid_item, status, headers, html_doc = self.apply_rules(item)
            try:
                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    self.lock.acquire()
                    # print '[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host + url)
                    if prefix not in self.results:
                        self.results[prefix] = []
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, url), 'title': title}
                    if _ not in self.results[prefix]:
                        self.results[prefix].append(_)
                    self.lock.release()
                    if len(self.results) >= 10:
                        print '[Warning] Over 10 vulnerabilities found [%s], seems to be false positives.' % prefix
                        self.url_queue.queue.clear()
            except Exception as e:
                logging.error('[_scan_worker.Exception][2][%s] %s' % (url, str(e)))
                traceback.print_exc()

    #
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self._scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()
            '''
            for key in self.results.keys():
                if len(self.results[key]) > 5:  # over 5 URLs found under this folder, show the first only
                    self.results[key] = self.results[key][:1]
            '''
            return '%s:%s' % (self.host, self.port), self.results
        except Exception as e:
            print '[scan exception] %s' % str(e)
        self.conn_pool.close()
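
# ---------------------------------------------------------------------------
# Rule file format, as parsed by _load_rules() / _init_rules() above. This
# summary is inferred from the parsing code; the example paths below are
# illustrative, not entries from the project's actual rule files.
#
# Each line in rules/*.txt starting with '/' defines one check, e.g.:
#
#   /.svn/entries     {status=200} {root_only}
#   /{sub}.zip        {status=206} {type_no="html"}
#   /phpinfo.php      {status=200} {tag="<title>phpinfo()"} {lang="php"}
#
# Markers:
#   {tag="..."}      substring that must appear in the response body
#   {status=NNN}     exact HTTP status code to match (0 = any)
#   {type="..."}     substring required in the Content-Type header
#   {type_no="..."}  substring that must NOT appear in the Content-Type header
#   {root_only}      check the path against the web root only
#   {lang="..."}     only applied when the detected language matches
#   {rewrite}        still applied when URL rewriting is detected
#
# rules/white.list and rules/black.list hold {text="..."} and
# {regex_text="..."} lines consumed by find_text() / find_exclude_text().
# ---------------------------------------------------------------------------
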
class Scanner(object):
    def __init__(self, timeout=600, args=None):
        self.args = args
        self.start_time = time.time()
        self.time_out = timeout
        self.links_limit = 100  # max number of folders to scan
        self._init_rules()
        self._init_scripts()
        self.url_queue = Queue.Queue()  # all URLs to scan
        self.urls_processed = set()     # processed URLs
        self.urls_enqueued = set()      # URLs that entered the queue
        self.urls_crawled = set()
        self.lock = threading.Lock()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.url = ''
        self.schema, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = self.base_url = ''
        self.has_status_404 = True
        self.max_depth = 0
        self.len_404_doc = 0

    # reset scanner
    def reset_scanner(self):
        self.start_time = time.time()
        self.url_queue.queue.clear()
        self.urls_processed.clear()
        self.urls_enqueued.clear()
        self.urls_crawled.clear()
        self.results.clear()
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''

    # scan from a given URL
    def init_from_url(self, url):
        self.reset_scanner()
        self.url = 'http://' + url if url.find('://') < 0 else url
        self.schema, self.host, self.path = parse_url(url)
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()

    def init_from_log_file(self, log_file):
        self.reset_scanner()
        self.log_file = log_file
        self.schema, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            self.load_all_urls_from_log_file()
            self.init_final()
            return True
        else:
            host = os.path.basename(log_file).replace('.log', '')
            try:
                socket.gethostbyname(host)
                self.init_from_url(host)
                return True
            except Exception as e:
                print_msg('[ERROR] Invalid host from log name: %s' % host)
                return False

    #
    def init_final(self):
        try:
            if self.conn_pool:
                self.conn_pool.close()
        except Exception as e:
            pass
        default_port = 443 if self.schema.lower() == 'https' else 80
        self.host, self.port = self.host.split(':') if self.host.find(':') > 0 else (self.host, default_port)
        self.port = int(self.port)
        if self.schema == 'http' and self.port == 80 or self.schema == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.schema, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.schema, self.host, self.port)

        port_open = self.is_port_open()
        if port_open:
            if self.schema == 'https':
                self.conn_pool = HTTPSConnPool(self.host, port=self.port, maxsize=self.args.t * 2,
                                               headers=HEADERS)
            else:
                self.conn_pool = HTTPConnPool(self.host, port=self.port, maxsize=self.args.t * 2,
                                              headers=HEADERS)

        if self.args.scripts_only or (not port_open and not self.args.no_scripts):
            for _ in self.user_scripts:
                self.url_queue.put((_, '/'))
            print_msg('Scan with scripts: %s' % self.host)
            return
        if not port_open:
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
            self.has_status_404 = True
        else:
            self.check_404_existence()
        if self._404_status == -1:
            print_msg('[Warning] HTTP 404 check failed <%s:%s>' % (self.host, self.port))
        elif not self.has_status_404:
            print_msg('[Warning] %s has no HTTP 404.' % self.base_url)
        _path, _depth = cal_depth(self, self.path)
        self.enqueue('/')
        self.enqueue(_path)
        if not self.args.no_crawl and not self.log_file:
            self.crawl(_path)

    def is_port_open(self):
        s = None
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(5.0)
            if s.connect_ex((self.host, int(self.port))) == 0:
                print_msg('scan web: %s:%s' % (self.host, self.port))
                return True
            else:
                print_msg('[Warning] Fail to connect to %s' % self.base_url)
                return False
        except Exception as e:
            return False
        finally:
            if s:
                s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
                s.close()

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for _line in infile.xreadlines():
                _line = _line.strip()
                if _line and len(_line.split()) >= 3:
                    url = _line.split()[1]
                    break
        return parse_url(url)

    # load URLs from rules/*.txt
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()
        self.rules_set_root_only = set()

        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile(r'{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')

        for rule_file in glob.glob('rules/*.txt'):
            with open(rule_file, 'r') as infile:
                vul_type = os.path.basename(rule_file)[:-4]
                for url in infile.xreadlines():
                    url = url.strip()
                    if url.startswith('/'):
                        _ = p_tag.search(url)
                        tag = _.group(1) if _ else ''
                        _ = p_status.search(url)
                        status = int(_.group(1)) if _ else 0
                        _ = p_content_type.search(url)
                        content_type = _.group(1) if _ else ''
                        _ = p_content_type_no.search(url)
                        content_type_no = _.group(1) if _ else ''
                        root_only = True if url.find('{root_only}') >= 0 else False
                        rule = (url.split()[0], tag, status, content_type, content_type_no, root_only, vul_type)
                        if root_only:
                            if rule not in self.rules_set_root_only:
                                self.rules_set_root_only.add(rule)
                            else:
                                print_msg('Duplicated root only rule: %s' % str(rule))
                        else:
                            if rule not in self.rules_set:
                                self.rules_set.add(rule)
                            else:
                                print_msg('Duplicated rule: %s' % str(rule))

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        file_path = 'rules/white.list'
        if not os.path.exists(file_path):
            print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_find.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

        file_path = 'rules/black.list'
        if not os.path.exists(file_path):
            print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_exclude.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_exclude.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

    #
    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:  # user scripts scan disabled
            return
        for _script in glob.glob('scripts/*.py'):
            script_name = os.path.basename(_script).replace('.py', '')
            if script_name.startswith('_'):
                continue
            try:
                self.user_scripts.append(importlib.import_module('scripts.%s' % script_name))
            except Exception as e:
                print_msg('[ERROR] Fail to load script %s' % script_name)

    #
    def http_request(self, url, headers=HEADERS, timeout=30):
        try:
            if not url:
                url = '/'
            # print_msg('request %s' % self.base_url + url)
            resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                          headers=headers, redirect=False, timeout=timeout, retries=0)
            status = resp.status
            if resp.headers.get('content-type', '').find('text') >= 0 \
                    or resp.headers.get('content-type', '').find('html') >= 0 \
                    or int(resp.headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''
            return status, resp.headers, html_doc
        except Exception as e:
            return -1, {}, ''

    # check existence of HTTP 404
    def check_404_existence(self):
        try:
            try:
                self._404_status, _, html_doc = self.http_request('/BBScan-404-existence-check')
            except Exception as e:
                print_msg('[Warning] HTTP 404 check failed <%s:%s>' % (self.host, self.port))
                self._404_status, _, html_doc = -1, {}, ''
            if self._404_status == 404:
                self.has_status_404 = True
            else:
                self.has_status_404 = False
                self.len_404_doc = len(html_doc)
        except Exception as e:
            logging.error('[Check_404] Exception %s %s' % (self.base_url, str(e)))

    #
    def enqueue(self, url):
        try:
            url = str(url)
            url_pattern = re.sub(r'\d+', '{num}', url)
            if url_pattern in self.urls_processed or len(self.urls_processed) >= self.links_limit:
                return False
            else:
                self.urls_processed.add(url_pattern)
            # print_msg('Entered Queue: %s' % url)
            self.crawl(url)
            if self._404_status != -1:  # valid web service
                rule_set_to_process = [self.rules_set, self.rules_set_root_only] if url == '/' else [self.rules_set]
                for rule_set in rule_set_to_process:
                    for _ in rule_set:
                        if _[5] and url != '/':  # root only
                            continue
                        try:
                            full_url = url.rstrip('/') + _[0]
                        except Exception as e:
                            continue
                        if full_url in self.urls_enqueued:
                            continue
                        url_description = {'prefix': url.rstrip('/'), 'full_url': full_url}
                        item = (url_description, _[1], _[2], _[3], _[4], _[5], _[6])
                        self.url_queue.put(item)
                        self.urls_enqueued.add(full_url)
            if self.args.full_scan and url.count('/') >= 2:
                self.enqueue('/'.join(url.split('/')[:-2]) + '/')  # enqueue the parent folder
            for script in self.user_scripts:
                self.url_queue.put((script, url))
            return True
        except Exception as e:
            print '[_enqueue.exception] %s' % str(e)
            return False

    #
    def crawl(self, path):
        try:
            # allowed response body size increased to 200 KB
            request_headers = dict(HEADERS, Range='bytes=0-204800')
            status, headers, html_doc = self.http_request(path, headers=request_headers)
            if path == '/':
                self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            if self.index_html_doc:
                soup = BeautifulSoup(html_doc, "html.parser")
                for link in soup.find_all('a'):
                    url = link.get('href', '').strip()
                    if url.startswith('..'):
                        continue
                    if not url.startswith('/') and url.find('//') < 0:  # relative path
                        url = path + url
                    url, depth = cal_depth(self, url)
                    # print url, depth
                    if depth <= self.max_depth:
                        self.enqueue(url)
                ret = self.find_text(html_doc)
                if ret:
                    if '/' not in self.results:
                        self.results['/'] = []
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, path),
                         'title': title, 'vul_type': ret[1]}
                    if _ not in self.results['/']:
                        self.results['/'].append(_)
        except Exception as e:
            print_msg('[crawl Exception] %s %s' % (path, str(e)))
            traceback.print_exc()

    #
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as infile:
                for _line in infile.xreadlines():
                    _ = _line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0 or _[2].find('^^^403') > 0
                                        or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self.enqueue(url)
        except Exception as e:
            print_msg('[load_all_urls_from_log_file] %s' % str(e))

    #
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True, 'Found [%s]' % _text
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True, 'Found Regex [%s]' % _regex.pattern
        return False

    #
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

    #
    def scan_worker(self):
        while self.url_queue.qsize() > 0:
            if time.time() - self.start_time > self.time_out:
                self.url_queue.queue.clear()
                print_msg('[ERROR] Timed out task: %s' % self.base_url)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                return
            try:
                if len(item) == 2:  # script scan
                    check_func = getattr(item[0], 'do_check')
                    # print_msg('Begin %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    check_func(self, item[1])
                    # print_msg('End %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    continue
                else:
                    url_description, tag, status_to_match, content_type, content_type_no, root_only, vul_type = item
                    prefix = url_description['prefix']
                    url = url_description['full_url']
                    if url.find('{sub}') >= 0:
                        if not self.domain_sub:
                            continue
                        url = url.replace('{sub}', self.domain_sub)
            except Exception as e:
                print_msg('[scan_worker.1] %s' % str(e))
                traceback.print_exc()
                continue
            if not item or not url:
                break

            # print_msg('[%s]' % url.strip())
            try:
                status, headers, html_doc = self.http_request(url)
                cur_content_type = headers.get('content-type', '')
                cur_content_length = headers.get('content-length', len(html_doc))

                if self.find_exclude_text(html_doc):  # excluded text found
                    continue

                if 0 <= int(cur_content_length) <= 10:  # text too short
                    continue

                if cur_content_type.find('image/') >= 0:  # exclude images
                    continue

                if content_type != 'application/json' and cur_content_type.find('application/json') >= 0 \
                        and not url.endswith('.json'):  # invalid JSON
                    continue

                if content_type and cur_content_type.find(content_type) < 0 \
                        or content_type_no and cur_content_type.find(content_type_no) >= 0:
                    continue  # content type mismatch

                if tag and html_doc.find(tag) < 0:
                    continue  # tag mismatch

                if self.find_text(html_doc):
                    valid_item = True
                else:
                    # status code check
                    if status_to_match == 206 and status != 206:
                        continue
                    if status_to_match in (200, 206) and status in (200, 206):
                        valid_item = True
                    elif status_to_match and status != status_to_match:
                        continue
                    elif status in (403, 404) and status != status_to_match:
                        continue
                    else:
                        valid_item = True

                    if status == self._404_status and url != '/':
                        len_doc = len(html_doc)
                        len_sum = self.len_404_doc + len_doc
                        if len_sum == 0 or (0.4 <= float(len_doc) / len_sum <= 0.6):
                            continue

                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    self.lock.acquire()
                    # print '[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host + url)
                    if prefix not in self.results:
                        self.results[prefix] = []
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, url),
                         'title': title, 'vul_type': vul_type}
                    if _ not in self.results[prefix]:
                        self.results[prefix].append(_)
                    self.lock.release()
            except Exception as e:
                print_msg('[scan_worker.2][%s] %s' % (url, str(e)))
                traceback.print_exc()

    #
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self.scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()

            for key in self.results.keys():
                if len(self.results[key]) > 5:  # over 5 URLs found under this folder, keep the first one only
                    self.results[key] = self.results[key][:1]
            return self.host, self.results
        except Exception as e:
            print '[scan exception] %s' % str(e)
        self.conn_pool.close()
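
# ---------------------------------------------------------------------------
# Illustrative driver for this older Scanner variant -- a minimal sketch, not
# part of the original file. Unlike the queue-based Scanner above, this class
# takes a bare URL, reports through the global print_msg(), and returns
# (host, results). demo_scan_legacy and the argparse.Namespace defaults are
# assumptions; the namespace fields mirror the options the class reads.
# ---------------------------------------------------------------------------
def demo_scan_legacy(url):
    import argparse
    args = argparse.Namespace(
        t=10, no_crawl=False, no_check404=False, no_scripts=True,
        scripts_only=False, full_scan=False)
    scanner = Scanner(timeout=600, args=args)
    scanner.init_from_url(url)
    host, results = scanner.scan(threads=args.t)
    for prefix in results:
        for item in results[prefix]:
            print '[%s] %s %s' % (item['status'], item['url'], item.get('title', ''))
    return host, results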