import glob
import importlib
import logging
import os
import re
import socket
import struct
import threading
import time
import traceback
import Queue

import urllib3
from bs4 import BeautifulSoup

# Project-local helpers referenced below (config, HTTPConnPool, HTTPSConnPool,
# parse_url, cal_depth, get_domain_sub, is_port_open, decode_response_text,
# print_msg, HEADERS, headers, headers_without_range, check_server, check_lang,
# check_lang_url, check_rewrite) come from the scanner's own lib/ package;
# the exact import paths depend on the project layout.


class Scanner(object):
    def __init__(self, q_results, timeout=600, args=None):
        self.q_results = q_results
        self.args = args
        self.start_time = time.time()
        self.time_out = timeout
        self.links_limit = 100  # max number of folders to scan
        self._init_rules()
        self._init_scripts()
        self.url_queue = Queue.Queue()  # all URLs to scan
        self.urls_processed = set()     # processed URLs
        self.urls_enqueued = set()      # URLs that entered the queue
        self.urls_crawled = set()
        self.lock = threading.Lock()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.scheme, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = ''
        self.base_url = ''
        self.max_depth = 0
        self.len_404_doc = 0
        self.has_http = None
        self.ports_open = None
        self.ports_closed = None
        self.no_scripts = None
        self.status_502_count = 0

    def print_msg(self, msg):
        self.q_results.put(msg)

    def reset_scanner(self):
        self.start_time = time.time()
        self.url_queue.queue.clear()
        self.urls_processed.clear()
        self.urls_enqueued.clear()
        self.urls_crawled.clear()
        self.results.clear()
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.scheme, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = ''
        self.base_url = ''
        self.status_502_count = 0

    # scan from a given URL
    def init_from_url(self, target):
        self.reset_scanner()
        self.scheme = target['scheme']
        self.host = target['host']
        self.port = target['port']
        self.path = target['path']
        self.has_http = target['has_http']
        self.ports_open = target['ports_open']
        self.ports_closed = target['ports_closed']
        self.no_scripts = target['no_scripts'] if 'no_scripts' in target else 0
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()
        return True

    def init_from_log_file(self, log_file):
        self.reset_scanner()
        self.log_file = log_file
        self.scheme, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            if self.host.find(':') > 0:
                _ret = self.host.split(':')
                self.host = _ret[0]
                self.port = _ret[1]
            elif self.scheme == 'https':
                self.port = 443
            elif self.scheme == 'http':
                self.port = 80
            else:
                self.port = None
            if not is_port_open(self.host, self.port):
                self.print_msg('[Port Not Open] %s:%s' % (self.host, self.port))
                return False
            self.has_http = True
            self.no_scripts = 1
            self.init_final()
            self.load_all_urls_from_log_file()
            return True
        else:
            host = os.path.basename(log_file).replace('.log', '')
            try:
                socket.gethostbyname(host)
                self.init_from_url(host)  # FIXME: init_from_url expects a target dict, not a bare host string
                return True
            except Exception as e:
                self.print_msg('[ERROR] Invalid host from log name: %s' % host)
                return False

    def init_final(self):
        try:
            if self.conn_pool:
                self.conn_pool.close()
        except Exception as e:
            pass

        if self.scheme == 'http' and self.port == 80 or self.scheme == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.scheme, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.scheme, self.host, self.port)

        if self.has_http:
            self.print_msg('Scan %s' % self.base_url)
        else:
            self.print_msg('Scan %s:%s' % (self.host, self.port) if self.port else 'Scan %s' % self.host)

        if self.has_http:
            if self.scheme == 'https':
                self.conn_pool = HTTPSConnPool(self.host, port=self.port, maxsize=self.args.t,
                                               headers=config.default_headers)
            else:
                self.conn_pool = HTTPConnPool(self.host, port=self.port, maxsize=self.args.t,
                                              headers=config.default_headers)
            if self.args.require_index_doc:
                self.crawl('/', do_not_process_links=True)

        if self.no_scripts != 1:  # not a duplicated target produced by an 80/443 redirect; no need to scan twice
            # scripts-only mode requested, or scripts are enabled for this target
            if self.args.scripts_only or not self.no_scripts:
                for _ in self.user_scripts:
                    self.url_queue.put((_, '/'))

        if not self.has_http or self.args.scripts_only:  # no HTTP service found, or scripts-only scan requested
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
        else:
            self.check_404_existence()
        if self._404_status == -1:
            self.print_msg('[Warning] HTTP 404 check failed <%s:%s>' % (self.host, self.port))
        elif self._404_status != 404:
            self.print_msg('[Warning] %s has no HTTP 404.' % self.base_url)
        _path, _depth = cal_depth(self, self.path)
        self.enqueue('/')
        if _path != '/' and not self.log_file:
            self.enqueue(_path)

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for _line in infile.xreadlines():
                _line = _line.strip()
                if _line and len(_line.split()) >= 3:
                    url = _line.split()[1]
                    break
        return parse_url(url)

    # load URLs from rules/*.txt
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()
        self.rules_set_root_only = set()

        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile(r'{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')

        _files = self.args.rule_files if self.args.rule_files else glob.glob('rules/*.txt')

        for rule_file in _files:
            with open(rule_file, 'r') as infile:
                vul_type = os.path.basename(rule_file)[:-4]
                for url in infile.xreadlines():
                    url = url.strip()
                    if url.startswith('/'):
                        _ = p_tag.search(url)
                        tag = _.group(1) if _ else ''
                        _ = p_status.search(url)
                        status = int(_.group(1)) if _ else 0
                        _ = p_content_type.search(url)
                        content_type = _.group(1) if _ else ''
                        _ = p_content_type_no.search(url)
                        content_type_no = _.group(1) if _ else ''
                        root_only = True if url.find('{root_only}') >= 0 else False
                        rule = (url.split()[0], tag, status, content_type, content_type_no, root_only, vul_type)
                        if root_only:
                            if rule not in self.rules_set_root_only:
                                self.rules_set_root_only.add(rule)
                            else:
                                self.print_msg('Duplicated root only rule: %s' % str(rule))
                        else:
                            if rule not in self.rules_set:
                                self.rules_set.add(rule)
                            else:
                                self.print_msg('Duplicated rule: %s' % str(rule))

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        file_path = 'rules/white.list'
        if not os.path.exists(file_path):
            self.print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_find.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

        file_path = 'rules/black.list'
        if not os.path.exists(file_path):
            self.print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_exclude.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_exclude.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:  # scripts disabled globally, nothing to import
            return
        for _script in glob.glob('scripts/*.py'):
            script_name_origin = os.path.basename(_script)
            script_name = script_name_origin.replace('.py', '')
            if self.args.script:  # import only the specified scripts
                if script_name not in self.args.script and script_name_origin not in self.args.script:
                    continue
            if script_name.startswith('_'):
                continue
            try:
                self.user_scripts.append(importlib.import_module('scripts.%s' % script_name))
            except Exception as e:
                self.print_msg('[ERROR] Fail to load script %s' % script_name)

    def http_request(self, url, headers=config.default_headers, timeout=20):
        try:
            if not url:
                url = '/'
            if not self.conn_pool:
                return -1, {}, ''
            if self.args.debug:
                self.print_msg('--> %s' % self.base_url + url)
            resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                          headers=headers, assert_same_host=False,
                                          redirect=False, timeout=timeout, retries=0)
            if resp.headers.get('content-type', '').find('text') >= 0 \
                    or resp.headers.get('content-type', '').find('html') >= 0 \
                    or int(resp.headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''

            if resp.status == 502:  # more than 3 responses with status 502: drop this site
                self.status_502_count += 1
                if self.status_502_count > 3:
                    self.url_queue.queue.clear()
                    try:
                        if self.conn_pool:
                            self.conn_pool.close()
                    except Exception as e:
                        pass
                    self.conn_pool = None
                    # self.print_msg('Website 502: %s' % self.base_url)

            return resp.status, resp.headers, html_doc
        except urllib3.exceptions.MaxRetryError as e:
            return -1, {}, ''
        except TypeError as e:
            return -1, {}, ''
        except Exception as e:
            self.print_msg(str(e))
            return -1, {}, ''

    # check existence of status 404
    def check_404_existence(self):
        try:
            try:
                self._404_status, _, html_doc = self.http_request('/BBScan-404-existence-check')
            except Exception as e:
                self.print_msg('[Warning] HTTP 404 check failed: %s' % self.base_url)
                self._404_status, _, html_doc = -1, {}, ''
            if self._404_status != 404:
                self.len_404_doc = len(html_doc)
        except Exception as e:
            self.print_msg('[Check_404] Exception %s %s' % (self.base_url, str(e)))

    #
    def enqueue(self, url):
        try:
            url = str(url)
        except Exception as e:
            return False
        try:
            url_pattern = re.sub(r'\d+', '{num}', url)
            if url_pattern in self.urls_processed or len(self.urls_processed) >= self.links_limit:
                return False
            self.urls_processed.add(url_pattern)
            # self.print_msg('Entered Queue: %s' % url)
            if not self.args.no_crawl:  # crawl enabled
                self.crawl(url)
            if self._404_status != -1:  # valid web service
                rule_set_to_process = [self.rules_set, self.rules_set_root_only] if url == '/' else [self.rules_set]
                for rule_set in rule_set_to_process:
                    for _ in rule_set:
                        if _[5] and url != '/':  # root only
                            continue
                        try:
                            full_url = url.rstrip('/') + _[0]
                        except Exception as e:
                            continue
                        if full_url in self.urls_enqueued:
                            continue
                        url_description = {'prefix': url.rstrip('/'), 'full_url': full_url}
                        item = (url_description, _[1], _[2], _[3], _[4], _[5], _[6])
                        self.url_queue.put(item)
                        self.urls_enqueued.add(full_url)
            if self.args.full_scan and url.count('/') >= 2:
                self.enqueue('/'.join(url.split('/')[:-2]) + '/')  # enqueue the parent folder
            if url != '/' and not self.no_scripts:
                for script in self.user_scripts:
                    self.url_queue.put((script, url))
            return True
        except Exception as e:
            self.print_msg('[_enqueue.exception] %s' % str(e))
            return False

    #
    def crawl(self, path, do_not_process_links=False):
        try:
            # increase allowed response body size to 200 KB
            request_headers = dict(config.default_headers, Range='bytes=0-204800')
            status, headers, html_doc = self.http_request(path, headers=request_headers)
            if path == '/':
                self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            if not self.args.no_crawl and not do_not_process_links and html_doc:
                soup = BeautifulSoup(html_doc, "html.parser")
                for link in soup.find_all('a'):
                    url = link.get('href', '').strip()
                    if url.startswith('..'):
                        continue
                    if not url.startswith('/') and url.find('//') < 0:  # relative path
                        url = path + url
                    url, depth = cal_depth(self, url)
                    # print url, depth
                    if depth <= self.max_depth:
                        self.enqueue(url)
            #
            ret = self.find_text(html_doc)
            if ret:
                if '/' not in self.results:
                    self.results['/'] = []
                m = re.search('<title>(.*?)</title>', html_doc)
                title = m.group(1) if m else ''
                _ = {'status': status, 'url': '%s%s' % (self.base_url, path), 'title': title, 'vul_type': ret[1]}
                if _ not in self.results['/']:
                    self.results['/'].append(_)
        except Exception as e:
            self.print_msg('[crawl Exception] %s %s' % (path, str(e)))

    #
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as infile:
                for _line in infile.xreadlines():
                    _ = _line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0 or _[2].find('^^^403') > 0
                                        or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self.enqueue(url)
        except Exception as e:
            self.print_msg('[load_all_urls_from_log_file] %s' % str(e))

    #
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True, 'Found [%s]' % _text
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True, 'Found Regex [%s]' % _regex.pattern
        return False

    #
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

    #
    def scan_worker(self):
        while True:
            if time.time() - self.start_time > self.time_out:
                self.url_queue.queue.clear()
                self.print_msg('[ERROR] Timed out task: %s' % self.base_url)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                return
            try:
                if len(item) == 2:  # script scan
                    check_func = getattr(item[0], 'do_check')
                    # self.print_msg('Begin %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    check_func(self, item[1])
                    # self.print_msg('End %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    continue
                else:
                    url_description, tag, status_to_match, content_type, content_type_no, root_only, vul_type = item
                    prefix = url_description['prefix']
                    url = url_description['full_url']
                    if url.find('{sub}') >= 0:
                        if not self.domain_sub:
                            continue
                        url = url.replace('{sub}', self.domain_sub)
            except Exception as e:
                self.print_msg('[scan_worker.1] %s' % str(e))
                self.print_msg(traceback.format_exc())
                continue
            if not item or not url:
                break

            try:
                status, headers, html_doc = self.http_request(url)
                cur_content_type = headers.get('content-type', '')
                cur_content_length = headers.get('content-length', len(html_doc))

                if self.find_exclude_text(html_doc):  # excluded text found
                    continue

                if 0 <= int(cur_content_length) <= 10:  # text too short
                    continue

                if cur_content_type.find('image/') >= 0:  # exclude images
                    continue

                if content_type != 'application/json' and cur_content_type.find('application/json') >= 0 \
                        and not url.endswith('.json'):  # invalid JSON
                    continue

                if content_type and cur_content_type.find(content_type) < 0 \
                        or content_type_no and cur_content_type.find(content_type_no) >= 0:
                    continue  # content type mismatch

                if tag and html_doc.find(tag) < 0:
                    continue  # tag mismatch

                if self.find_text(html_doc):
                    valid_item = True
                else:
                    # status code check
                    if status_to_match == 206 and status != 206:
                        continue
                    if status_to_match in (200, 206) and status in (200, 206):
                        valid_item = True
                    elif status_to_match and status != status_to_match:
                        continue
                    elif status in (403, 404) and status != status_to_match:
                        continue
                    else:
                        valid_item = True

                    if status == self._404_status and url != '/':
                        len_doc = len(html_doc)
                        len_sum = self.len_404_doc + len_doc
                        if len_sum == 0 or (0.4 <= float(len_doc) / len_sum <= 0.6):
                            continue

                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    self.lock.acquire()
                    # self.print_msg('[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host + url))
                    if prefix not in self.results:
                        self.results[prefix] = []
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, url),
                         'title': title, 'vul_type': vul_type}
                    if _ not in self.results[prefix]:
                        self.results[prefix].append(_)
                    self.lock.release()
            except Exception as e:
                self.print_msg('[scan_worker.2][%s] %s' % (url, str(e)))
                traceback.print_exc()

    #
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self.scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()

            for key in self.results.keys():
                # over 5 URLs found under this folder: keep the first one only
                if len(self.results[key]) > 5:
                    self.results[key] = self.results[key][:1]
            # str.lstrip()/rstrip() treat multi-char arguments as character sets,
            # so strip the exact 'unknown://' prefix and ':None' suffix
            base_url = self.base_url
            if base_url.startswith('unknown://'):
                base_url = base_url[len('unknown://'):]
            if base_url.endswith(':None'):
                base_url = base_url[:-len(':None')]
            return base_url, self.results
        except Exception as e:
            self.print_msg('[scan exception] %s' % str(e))
        self.conn_pool.close()
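
# ---------------------------------------------------------------------------
# Illustrative driver for the Scanner above -- a minimal sketch, not part of
# the original scanner code. It assumes the rules/ and scripts/ directories
# exist next to this file, and fakes the command-line options the class reads
# (args.t, args.no_crawl, ...) with an argparse.Namespace; demo_scan and its
# defaults are assumptions. The target dict carries exactly the keys that
# init_from_url() reads.
# ---------------------------------------------------------------------------
def demo_scan(host, port=80):
    import argparse
    q_results = Queue.Queue()
    args = argparse.Namespace(
        t=10, rule_files=None, require_index_doc=False, debug=False,
        no_crawl=False, no_check404=False, no_scripts=True,
        scripts_only=False, script=None, full_scan=False)
    scanner = Scanner(q_results, timeout=600, args=args)
    target = {'scheme': 'http', 'host': host, 'port': port, 'path': '/',
              'has_http': True, 'ports_open': [port], 'ports_closed': [],
              'no_scripts': 0}
    if scanner.init_from_url(target):
        base_url, results = scanner.scan(threads=args.t)
        while not q_results.empty():  # drain progress / warning messages
            print q_results.get()
        return base_url, results
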
class InfoDisScanner(object):
    def __init__(self, timeout=600, args=None):
        self.args = args
        self.START_TIME = time.time()
        self.TIME_OUT = timeout
        self.LINKS_LIMIT = 100  # max number of folders to scan
        self.full_scan = args.full_scan
        self._init_rules()
        self._init_scripts()
        self.url_queue = Queue.Queue()  # all URLs to scan
        self.urls_processed = set()     # processed URLs
        self.urls_enqueued = set()      # URLs that entered the queue
        self.lock = threading.Lock()

    # reset scanner
    def init_reset(self):
        self.START_TIME = time.time()
        self.url_queue.queue.clear()
        self.urls_processed = set()
        self.urls_enqueued = set()
        self.index_a_urls = set()
        self.scripts_enqueued = set()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.rewrite = False
        self.server = ''
        self.lang = ''

    # scan from a given URL
    def init_from_url(self, url):
        self.init_reset()
        if not url.find('://') > 0:
            self.url = 'http://' + url
        else:
            self.url = url
        self.schema, self.host, self.path = parse_url(url)
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()

    def init_from_log_file(self, log_file):
        self.init_reset()
        self.log_file = log_file
        self.schema, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            self.load_all_urls_from_log_file()
            self.init_final()
        else:
            self.init_from_url(os.path.basename(log_file).replace('.log', ''))

    #
    def init_final(self):
        try:
            self.conn_pool.close()
        except:
            pass
        default_port = 443 if self.schema.lower() == 'https' else 80
        self.host, self.port = self.host.split(':') if self.host.find(':') > 0 else (self.host, default_port)
        self.port = int(self.port)
        if self.schema == 'http' and self.port == 80 or self.schema == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.schema, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.schema, self.host, self.port)

        port_open = self.is_port_open()
        if port_open:
            if self.schema == 'https':
                self.conn_pool = HTTPSConnPool(self.host, port=self.port, maxsize=self.args.t * 2,
                                               headers=headers)
            else:
                self.conn_pool = HTTPConnPool(self.host, port=self.port, maxsize=self.args.t * 2,
                                              headers=headers)
        if not port_open:
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
            self.has_404 = True
        else:
            self.check_404()  # check existence of HTTP 404
        if not self.has_404:
            print_msg('[Warning] %s has no HTTP 404.' % self.host)
        self.request_index(self.path)
        self.gather_info()
        _path, _depth = cal_depth(self, self.path)
        self._enqueue('/')
        self._enqueue(_path)
        if not self.args.no_crawl and not self.log_file:
            self.crawl_index()

    def is_port_open(self):
        s = None
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(5.0)
            if s.connect_ex((self.host, int(self.port))) == 0:
                self.lock.acquire()
                print_msg('Scan web: %s' % self.base_url)
                self.lock.release()
                return True
            else:
                print_msg('[Warning] Fail to connect to %s:%s' % (self.host, self.port))
                return False
        except Exception as e:
            return False
        finally:
            if s:
                s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
                s.close()

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for line in infile.xreadlines():
                line = line.strip()
                if line and len(line.split()) >= 3:
                    url = line.split()[1]
                    break
        return parse_url(url)

    def _load_rules(self, rule_file):
        rules = []
        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile(r'{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')
        p_lang = re.compile('{lang="(.*?)"}')

        with open(rule_file, 'r') as infile:
            for url in infile.xreadlines():
                url = url.strip()
                if url.startswith('/'):
                    _ = p_tag.search(url)
                    tag = _.group(1) if _ else ''
                    _ = p_status.search(url)
                    status = int(_.group(1)) if _ else 0
                    _ = p_content_type.search(url)
                    content_type = _.group(1) if _ else ''
                    _ = p_content_type_no.search(url)
                    content_type_no = _.group(1) if _ else ''
                    _ = p_lang.search(url)
                    lang = _.group(1) if _ else ''
                    root_only = True if url.find('{root_only}') >= 0 else False
                    rewrite = True if url.find('{rewrite}') >= 0 else False
                    rule = (url.split()[0], tag, status, content_type, content_type_no,
                            root_only, lang, rewrite)
                    rules.append(rule)
        return rules

    # load URLs from rules/*.txt
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()

        for rule_file in glob.glob('rules/*.txt'):
            rules = self._load_rules(rule_file)
            for rule in rules:
                if rule not in self.rules_set:
                    self.rules_set.add(rule)
                else:
                    print 'Duplicated rule:', rule

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        _file_path = 'rules/white.list'
        if not os.path.exists(_file_path):
            return
        for line in open(_file_path):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            _m = re_text.search(line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(line)
                if _m:
                    self.regex_to_find.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

        _file_path = 'rules/black.list'
        if not os.path.exists(_file_path):
            return
        for line in open(_file_path):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            _m = re_text.search(line)
            if _m:
                self.text_to_exclude.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(line)
                if _m:
                    self.regex_to_exclude.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

    #
    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:  # user scripts scan disabled
            return
        for _script in glob.glob('scripts/*.py'):
            script_name = os.path.basename(_script).replace('.py', '')
            if script_name.startswith('_'):
                continue
            try:
                _ = importlib.import_module('scripts.%s' % script_name)
                self.user_scripts.append(_)
            except Exception as e:
                print e

    #
    def _http_request(self, url, timeout=30):
        try:
            if not url:
                url = '/'
            # print 'request', self.base_url + url
            resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                          redirect=False, timeout=timeout, retries=0)
            resp_headers = resp.headers
            status = resp.status
            if resp_headers.get('content-type', '').find('text') >= 0 \
                    or resp_headers.get('content-type', '').find('html') >= 0 \
                    or int(resp_headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''
            return status, resp_headers, html_doc
        except Exception as e:
            return -1, {}, ''

    #
    def check_404(self):
        try:
            try:
                self._404_status, headers, html_doc = self._http_request('/BBScan-404-existence-check')
            except:
                self._404_status, headers, html_doc = -1, {}, ''
            self.has_404 = (self._404_status == 404)
            if not self.has_404:
                self.len_404_doc = len(html_doc)
            return self.has_404
        except Exception as e:
            logging.error('[Check_404] Exception %s' % str(e))

    def _enqueue_request(self, prefix, full_url, rule):
        if self.args.scripts_only:
            return
        if full_url in self.urls_enqueued:
            return
        url_description = {'prefix': prefix, 'full_url': full_url}
        item = (url_description, rule[1], rule[2], rule[3], rule[4], rule[5], rule[6], rule[7])
        self.url_queue.put(item)
        self.urls_enqueued.add(full_url)

    def _enqueue_script(self, module, prefix):
        if self.args.no_scripts:
            return
        if not prefix:
            prefix = '/'
        if (module.__name__, prefix) in self.scripts_enqueued:
            return
        self.url_queue.put((module, prefix))
        self.scripts_enqueued.add((module.__name__, prefix))

    #
    def _enqueue(self, url):
        try:
            url = str(url)
            url_pattern = re.sub(r'\d+', '{num}', url)
            if url_pattern in self.urls_processed or len(self.urls_processed) >= self.LINKS_LIMIT:
                return False
            else:
                self.urls_processed.add(url_pattern)
            # print 'Entered Queue:', url
            for _ in self.rules_set:
                # rewrite & lang check
                if self.rewrite and not _[7]:
                    continue
                elif self.lang and self.lang != 'unknown':
                    if _[6] and self.lang != _[6]:
                        continue
                # root_only
                if _[5] and url != '/':
                    continue
                full_url = url.rstrip('/') + _[0]
                self._enqueue_request(url.rstrip('/'), full_url, _)
            if self.full_scan and url.count('/') >= 2:
                self._enqueue('/'.join(url.split('/')[:-2]) + '/')  # enqueue the parent folder
            for _ in self.user_scripts:
                self._enqueue_script(_, url.rstrip('/'))
            return True
        except Exception as e:
            print '[_enqueue.exception] %s' % str(e)
            return False

    #
    def request_index(self, path):
        try:
            status, headers, html_doc = self._http_request(path)
            if status != 200:
                try:
                    html_doc = self.conn_pool.urlopen('GET', self.url,
                                                      headers=headers_without_range, retries=1).data
                    html_doc = decode_response_text(html_doc)
                except Exception as e:
                    pass
            # save index content
            self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            soup = BeautifulSoup(self.index_html_doc, "html.parser")
            for link in soup.find_all('a'):
                url = link.get('href', '').strip()
                self.index_a_urls.add(url)
        except Exception as e:
            logging.error('[request_index Exception] %s' % str(e))
            traceback.print_exc()

    def gather_info(self):
        if not self.server:
            self.server = check_server(self.index_headers.get('server', ''))
        if not self.lang:
            self.lang, self.framework = check_lang(self.base_url, self.index_headers)
        if self.lang == 'unknown':
            for url in self.index_a_urls:
                url, depth = cal_depth(self, url)
                lang = check_lang_url(url)
                if lang != 'unknown':
                    self.lang = lang
                    break
        self.rewrite = check_rewrite(self.server, self.lang)

    def crawl_index(self):
        for url in self.index_a_urls:
            url, depth = cal_depth(self, url)
            if depth <= self.max_depth:
                self._enqueue(url)
        if self.find_text(self.index_html_doc):
            self.results['/'] = []
            m = re.search('<title>(.*?)</title>', self.index_html_doc)
            title = m.group(1) if m else ''
            _ = {'status': self.index_status, 'url': '%s%s' % (self.base_url, self.path), 'title': title}
            if _ not in self.results['/']:
                self.results['/'].append(_)

    #
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as infile:
                for line in infile.xreadlines():
                    _ = line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0 or _[2].find('^^^403') > 0
                                        or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self._enqueue(url)
        except Exception as e:
            logging.error('[load_all_urls_from_log_file Exception] %s' % str(e))
            traceback.print_exc()

    #
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True
        return False

    #
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

    def apply_rules(self, item):
        url_description, tag, status_to_match, content_type, content_type_no, root_only, lang, rewrite = item
        prefix = url_description['prefix']
        url = url_description['full_url']
        # print url
        url = url.replace('{sub}', self.domain_sub)
        if url.find('{hostname_or_folder}') >= 0:
            _url = url[:url.find('{hostname_or_folder}')]
            folders = _url.split('/')
            for _folder in reversed(folders):
                if _folder not in ['', '.', '..']:
                    url = url.replace('{hostname_or_folder}', _folder)
                    break
        url = url.replace('{hostname_or_folder}', self.domain_sub)
        url = url.replace('{hostname}', self.domain_sub)

        if not item or not url:
            return False, None, None, None
        # print '[%s]' % url.strip()
        try:
            status, headers, html_doc = self._http_request(url)
            cur_content_type = headers.get('content-type', '')

            if self.find_exclude_text(html_doc):  # excluded text found
                return False, status, headers, html_doc

            if ('html' in cur_content_type or 'text' in cur_content_type) and \
                    0 <= len(html_doc) <= 10:  # text too short
                return False, status, headers, html_doc

            if cur_content_type.find('image/') >= 0:  # exclude images
                return False, status, headers, html_doc

            valid_item = False
            if self.find_text(html_doc):
                valid_item = True
            else:
                if cur_content_type.find('application/json') >= 0 and not url.endswith('.json'):  # invalid JSON
                    return False, status, headers, html_doc
                if status != status_to_match and status != 206:
                    # status in [301, 302, 400, 404, 501, 502, 503, 505]
                    return False, status, headers, html_doc
                if tag:
                    if html_doc.find(tag) >= 0:
                        valid_item = True
                    else:
                        return False, status, headers, html_doc  # tag mismatch
                if (content_type and cur_content_type.find(content_type) < 0) \
                        or (content_type_no and cur_content_type.find(content_type_no) >= 0):
                    return False, status, headers, html_doc  # type mismatch
                if self.has_404 or status != self._404_status:
                    if status_to_match in (200, 206) and status == 206:
                        valid_item = True
                    elif status_to_match and status != status_to_match:  # status mismatch
                        return False, status, headers, html_doc
                    elif status_to_match != 403 and status == 403:
                        return False, status, headers, html_doc
                    else:
                        valid_item = True
                if not self.has_404 and status in (200, 206) and url != '/' and not tag:
                    _len = len(html_doc)
                    _min = min(_len, self.len_404_doc)
                    if _min == 0:
                        _min = 10.0
                    if float(_len - self.len_404_doc) / _min > 0.3:
                        valid_item = True
                if status == 206 and tag == '' and cur_content_type.find('text') < 0 \
                        and cur_content_type.find('html') < 0:
                    valid_item = True
            return valid_item, status, headers, html_doc
        except Exception as e:
            logging.error('[_scan_worker.Exception][3][%s] %s' % (url, str(e)))
            traceback.print_exc()
            return False, None, None, ''  # keep the 4-tuple contract on errors

    #
    def _scan_worker(self):
        while self.url_queue.qsize() > 0:
            if time.time() - self.START_TIME > self.TIME_OUT:
                self.url_queue.queue.clear()
                print_msg('[ERROR] Timed out task: %s' % self.host)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                print e
                return
            try:
                if len(item) == 2:  # user script
                    check_func = getattr(item[0], 'do_check')
                    check_func(self, item[1])
                    continue
            except Exception as e:
                logging.error('[_scan_worker Exception] [1] %s' % str(e))
                traceback.print_exc()
                continue
            url_description, tag, status_to_match, content_type, content_type_no, root_only, lang, rewrite = item
            prefix = url_description['prefix']
            url = url_description['full_url']
            valid_item, status, headers, html_doc = self.apply_rules(item)
            try:
                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    self.lock.acquire()
                    # print '[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host + url)
                    if prefix not in self.results:
                        self.results[prefix] = []
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, url), 'title': title}
                    if _ not in self.results[prefix]:
                        self.results[prefix].append(_)
                    self.lock.release()
                    if len(self.results) >= 10:
                        print '[Warning] Over 10 vulnerabilities found [%s], seems to be false positives.' % prefix
                        self.url_queue.queue.clear()
            except Exception as e:
                logging.error('[_scan_worker.Exception][2][%s] %s' % (url, str(e)))
                traceback.print_exc()

    #
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self._scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()
            '''
            for key in self.results.keys():
                if len(self.results[key]) > 5:  # over 5 URLs found under this folder, show the first only
                    self.results[key] = self.results[key][:1]
            '''
            return '%s:%s' % (self.host, self.port), self.results
        except Exception as e:
            print '[scan exception] %s' % str(e)
        self.conn_pool.close()
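
# ---------------------------------------------------------------------------
# Rule file format, as parsed by _load_rules() / _init_rules() above. This
# summary is inferred from the parsing code; the example paths below are
# illustrative, not entries from the project's actual rule files.
#
# Each line in rules/*.txt starting with '/' defines one check, e.g.:
#
#   /.svn/entries     {status=200} {root_only}
#   /{sub}.zip        {status=206} {type_no="html"}
#   /phpinfo.php      {status=200} {tag="<title>phpinfo()"} {lang="php"}
#
# Markers:
#   {tag="..."}      substring that must appear in the response body
#   {status=NNN}     exact HTTP status code to match (0 = any)
#   {type="..."}     substring required in the Content-Type header
#   {type_no="..."}  substring that must NOT appear in the Content-Type header
#   {root_only}      check the path against the web root only
#   {lang="..."}     only applied when the detected language matches
#   {rewrite}        still applied when URL rewriting is detected
#
# rules/white.list and rules/black.list hold {text="..."} and
# {regex_text="..."} lines consumed by find_text() / find_exclude_text().
# ---------------------------------------------------------------------------
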
class Scanner(object):
    def __init__(self, timeout=600, args=None):
        self.args = args
        self.start_time = time.time()
        self.time_out = timeout
        self.links_limit = 100  # max number of folders to scan
        self._init_rules()
        self._init_scripts()
        self.url_queue = Queue.Queue()  # all URLs to scan
        self.urls_processed = set()     # processed URLs
        self.urls_enqueued = set()      # URLs that entered the queue
        self.urls_crawled = set()
        self.lock = threading.Lock()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.url = ''
        self.schema, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = self.base_url = ''
        self.has_status_404 = True
        self.max_depth = 0
        self.len_404_doc = 0

    # reset scanner
    def reset_scanner(self):
        self.start_time = time.time()
        self.url_queue.queue.clear()
        self.urls_processed.clear()
        self.urls_enqueued.clear()
        self.urls_crawled.clear()
        self.results.clear()
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''

    # scan from a given URL
    def init_from_url(self, url):
        self.reset_scanner()
        self.url = 'http://' + url if url.find('://') < 0 else url
        self.schema, self.host, self.path = parse_url(url)
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()

    def init_from_log_file(self, log_file):
        self.reset_scanner()
        self.log_file = log_file
        self.schema, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            self.load_all_urls_from_log_file()
            self.init_final()
            return True
        else:
            host = os.path.basename(log_file).replace('.log', '')
            try:
                socket.gethostbyname(host)
                self.init_from_url(host)
                return True
            except Exception as e:
                print_msg('[ERROR] Invalid host from log name: %s' % host)
                return False

    #
    def init_final(self):
        try:
            if self.conn_pool:
                self.conn_pool.close()
        except Exception as e:
            pass
        default_port = 443 if self.schema.lower() == 'https' else 80
        self.host, self.port = self.host.split(':') if self.host.find(':') > 0 else (self.host, default_port)
        self.port = int(self.port)
        if self.schema == 'http' and self.port == 80 or self.schema == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.schema, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.schema, self.host, self.port)

        port_open = self.is_port_open()
        if port_open:
            if self.schema == 'https':
                self.conn_pool = HTTPSConnPool(self.host, port=self.port, maxsize=self.args.t * 2,
                                               headers=HEADERS)
            else:
                self.conn_pool = HTTPConnPool(self.host, port=self.port, maxsize=self.args.t * 2,
                                              headers=HEADERS)

        if self.args.scripts_only or (not port_open and not self.args.no_scripts):
            for _ in self.user_scripts:
                self.url_queue.put((_, '/'))
            print_msg('Scan with scripts: %s' % self.host)
            return
        if not port_open:
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
            self.has_status_404 = True
        else:
            self.check_404_existence()
        if self._404_status == -1:
            print_msg('[Warning] HTTP 404 check failed <%s:%s>' % (self.host, self.port))
        elif not self.has_status_404:
            print_msg('[Warning] %s has no HTTP 404.' % self.base_url)
        _path, _depth = cal_depth(self, self.path)
        self.enqueue('/')
        self.enqueue(_path)
        if not self.args.no_crawl and not self.log_file:
            self.crawl(_path)

    def is_port_open(self):
        s = None
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(5.0)
            if s.connect_ex((self.host, int(self.port))) == 0:
                print_msg('scan web: %s:%s' % (self.host, self.port))
                return True
            else:
                print_msg('[Warning] Fail to connect to %s' % self.base_url)
                return False
        except Exception as e:
            return False
        finally:
            if s:
                s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
                s.close()

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for _line in infile.xreadlines():
                _line = _line.strip()
                if _line and len(_line.split()) >= 3:
                    url = _line.split()[1]
                    break
        return parse_url(url)

    # load URLs from rules/*.txt
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()
        self.rules_set_root_only = set()

        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile(r'{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')

        for rule_file in glob.glob('rules/*.txt'):
            with open(rule_file, 'r') as infile:
                vul_type = os.path.basename(rule_file)[:-4]
                for url in infile.xreadlines():
                    url = url.strip()
                    if url.startswith('/'):
                        _ = p_tag.search(url)
                        tag = _.group(1) if _ else ''
                        _ = p_status.search(url)
                        status = int(_.group(1)) if _ else 0
                        _ = p_content_type.search(url)
                        content_type = _.group(1) if _ else ''
                        _ = p_content_type_no.search(url)
                        content_type_no = _.group(1) if _ else ''
                        root_only = True if url.find('{root_only}') >= 0 else False
                        rule = (url.split()[0], tag, status, content_type, content_type_no, root_only, vul_type)
                        if root_only:
                            if rule not in self.rules_set_root_only:
                                self.rules_set_root_only.add(rule)
                            else:
                                print_msg('Duplicated root only rule: %s' % str(rule))
                        else:
                            if rule not in self.rules_set:
                                self.rules_set.add(rule)
                            else:
                                print_msg('Duplicated rule: %s' % str(rule))

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        file_path = 'rules/white.list'
        if not os.path.exists(file_path):
            print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_find.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

        file_path = 'rules/black.list'
        if not os.path.exists(file_path):
            print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_exclude.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_exclude.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

    #
    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:  # user scripts scan disabled
            return
        for _script in glob.glob('scripts/*.py'):
            script_name = os.path.basename(_script).replace('.py', '')
            if script_name.startswith('_'):
                continue
            try:
                self.user_scripts.append(importlib.import_module('scripts.%s' % script_name))
            except Exception as e:
                print_msg('[ERROR] Fail to load script %s' % script_name)

    #
    def http_request(self, url, headers=HEADERS, timeout=30):
        try:
            if not url:
                url = '/'
            # print_msg('request %s' % self.base_url + url)
            resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                          headers=headers, redirect=False, timeout=timeout, retries=0)
            status = resp.status
            if resp.headers.get('content-type', '').find('text') >= 0 \
                    or resp.headers.get('content-type', '').find('html') >= 0 \
                    or int(resp.headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''
            return status, resp.headers, html_doc
        except Exception as e:
            return -1, {}, ''

    # check existence of HTTP 404
    def check_404_existence(self):
        try:
            try:
                self._404_status, _, html_doc = self.http_request('/BBScan-404-existence-check')
            except Exception as e:
                print_msg('[Warning] HTTP 404 check failed <%s:%s>' % (self.host, self.port))
                self._404_status, _, html_doc = -1, {}, ''
            if self._404_status == 404:
                self.has_status_404 = True
            else:
                self.has_status_404 = False
                self.len_404_doc = len(html_doc)
        except Exception as e:
            logging.error('[Check_404] Exception %s %s' % (self.base_url, str(e)))

    #
    def enqueue(self, url):
        try:
            url = str(url)
            url_pattern = re.sub(r'\d+', '{num}', url)
            if url_pattern in self.urls_processed or len(self.urls_processed) >= self.links_limit:
                return False
            else:
                self.urls_processed.add(url_pattern)
            # print_msg('Entered Queue: %s' % url)
            self.crawl(url)
            if self._404_status != -1:  # valid web service
                rule_set_to_process = [self.rules_set, self.rules_set_root_only] if url == '/' else [self.rules_set]
                for rule_set in rule_set_to_process:
                    for _ in rule_set:
                        if _[5] and url != '/':  # root only
                            continue
                        try:
                            full_url = url.rstrip('/') + _[0]
                        except Exception as e:
                            continue
                        if full_url in self.urls_enqueued:
                            continue
                        url_description = {'prefix': url.rstrip('/'), 'full_url': full_url}
                        item = (url_description, _[1], _[2], _[3], _[4], _[5], _[6])
                        self.url_queue.put(item)
                        self.urls_enqueued.add(full_url)
            if self.args.full_scan and url.count('/') >= 2:
                self.enqueue('/'.join(url.split('/')[:-2]) + '/')  # enqueue the parent folder
            for script in self.user_scripts:
                self.url_queue.put((script, url))
            return True
        except Exception as e:
            print '[_enqueue.exception] %s' % str(e)
            return False

    #
    def crawl(self, path):
        try:
            # allowed response body size increased to 200 KB
            request_headers = dict(HEADERS, Range='bytes=0-204800')
            status, headers, html_doc = self.http_request(path, headers=request_headers)
            if path == '/':
                self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            if self.index_html_doc:
                soup = BeautifulSoup(html_doc, "html.parser")
                for link in soup.find_all('a'):
                    url = link.get('href', '').strip()
                    if url.startswith('..'):
                        continue
                    if not url.startswith('/') and url.find('//') < 0:  # relative path
                        url = path + url
                    url, depth = cal_depth(self, url)
                    # print url, depth
                    if depth <= self.max_depth:
                        self.enqueue(url)
                ret = self.find_text(html_doc)
                if ret:
                    if '/' not in self.results:
                        self.results['/'] = []
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, path),
                         'title': title, 'vul_type': ret[1]}
                    if _ not in self.results['/']:
                        self.results['/'].append(_)
        except Exception as e:
            print_msg('[crawl Exception] %s %s' % (path, str(e)))
            traceback.print_exc()

    #
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as infile:
                for _line in infile.xreadlines():
                    _ = _line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0 or _[2].find('^^^403') > 0
                                        or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self.enqueue(url)
        except Exception as e:
            print_msg('[load_all_urls_from_log_file] %s' % str(e))

    #
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True, 'Found [%s]' % _text
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True, 'Found Regex [%s]' % _regex.pattern
        return False

    #
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

    #
    def scan_worker(self):
        while self.url_queue.qsize() > 0:
            if time.time() - self.start_time > self.time_out:
                self.url_queue.queue.clear()
                print_msg('[ERROR] Timed out task: %s' % self.base_url)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                return
            try:
                if len(item) == 2:  # script scan
                    check_func = getattr(item[0], 'do_check')
                    # print_msg('Begin %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    check_func(self, item[1])
                    # print_msg('End %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    continue
                else:
                    url_description, tag, status_to_match, content_type, content_type_no, root_only, vul_type = item
                    prefix = url_description['prefix']
                    url = url_description['full_url']
                    if url.find('{sub}') >= 0:
                        if not self.domain_sub:
                            continue
                        url = url.replace('{sub}', self.domain_sub)
            except Exception as e:
                print_msg('[scan_worker.1] %s' % str(e))
                traceback.print_exc()
                continue
            if not item or not url:
                break

            # print_msg('[%s]' % url.strip())
            try:
                status, headers, html_doc = self.http_request(url)
                cur_content_type = headers.get('content-type', '')
                cur_content_length = headers.get('content-length', len(html_doc))

                if self.find_exclude_text(html_doc):  # excluded text found
                    continue

                if 0 <= int(cur_content_length) <= 10:  # text too short
                    continue

                if cur_content_type.find('image/') >= 0:  # exclude images
                    continue

                if content_type != 'application/json' and cur_content_type.find('application/json') >= 0 \
                        and not url.endswith('.json'):  # invalid JSON
                    continue

                if content_type and cur_content_type.find(content_type) < 0 \
                        or content_type_no and cur_content_type.find(content_type_no) >= 0:
                    continue  # content type mismatch

                if tag and html_doc.find(tag) < 0:
                    continue  # tag mismatch

                if self.find_text(html_doc):
                    valid_item = True
                else:
                    # status code check
                    if status_to_match == 206 and status != 206:
                        continue
                    if status_to_match in (200, 206) and status in (200, 206):
                        valid_item = True
                    elif status_to_match and status != status_to_match:
                        continue
                    elif status in (403, 404) and status != status_to_match:
                        continue
                    else:
                        valid_item = True

                    if status == self._404_status and url != '/':
                        len_doc = len(html_doc)
                        len_sum = self.len_404_doc + len_doc
                        if len_sum == 0 or (0.4 <= float(len_doc) / len_sum <= 0.6):
                            continue

                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    self.lock.acquire()
                    # print '[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host + url)
                    if prefix not in self.results:
                        self.results[prefix] = []
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, url),
                         'title': title, 'vul_type': vul_type}
                    if _ not in self.results[prefix]:
                        self.results[prefix].append(_)
                    self.lock.release()
            except Exception as e:
                print_msg('[scan_worker.2][%s] %s' % (url, str(e)))
                traceback.print_exc()

    #
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self.scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()

            for key in self.results.keys():
                if len(self.results[key]) > 5:  # over 5 URLs found under this folder, keep the first one only
                    self.results[key] = self.results[key][:1]
            return self.host, self.results
        except Exception as e:
            print '[scan exception] %s' % str(e)
        self.conn_pool.close()
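
# ---------------------------------------------------------------------------
# Illustrative driver for this older Scanner variant -- a minimal sketch, not
# part of the original file. Unlike the queue-based Scanner above, this class
# takes a bare URL, reports through the global print_msg(), and returns
# (host, results). demo_scan_legacy and the argparse.Namespace defaults are
# assumptions; the namespace fields mirror the options the class reads.
# ---------------------------------------------------------------------------
def demo_scan_legacy(url):
    import argparse
    args = argparse.Namespace(
        t=10, no_crawl=False, no_check404=False, no_scripts=True,
        scripts_only=False, full_scan=False)
    scanner = Scanner(timeout=600, args=args)
    scanner.init_from_url(url)
    host, results = scanner.scan(threads=args.t)
    for prefix in results:
        for item in results[prefix]:
            print '[%s] %s %s' % (item['status'], item['url'], item.get('title', ''))
    return host, results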