def masscan(target, ports):
    output = os.path.join(PATHS.OUTPUT_PATH, "output_" + str(time.time()) + ".log")
    cmd = "masscan -p {} --rate={} --randomize-hosts -iL \"{}\" -oL \"{}\"".format(
        ports, MASSCAN_RATE, target, output)
    os.system(cmd)
    logger.debug("masscan saved output:" + output)
    open_list = {}
    with open(output, "r") as f:
        result_json = f.readlines()
    if result_json:
        try:
            # drop masscan's banner line and the trailing "#end" line
            del result_json[0]
            del result_json[-1]
            for res in result_json:
                try:
                    p = res.split()
                    ip = p[3]
                    port = p[2]
                    if ip not in open_list:
                        open_list[ip] = set()
                    open_list[ip].add(port)
                except:
                    pass
        except Exception as e:
            logger.error("masscan read failed")
    if open_list:
        return open_list
    return None
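# Usage sketch (not from the original source; the targets file and port list are
# hypothetical). masscan() expects a file of targets for -iL and returns a dict of
# {ip: set(open_ports)} or None.
targets_file = "/tmp/targets.txt"   # one IP/CIDR per line
opened = masscan(targets_file, "80,443,8080")
if opened:
    for ip, ports in opened.items():
        print(ip, sorted(ports))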
def load_remote_poc():
    filename = os.path.join(PATHS.DATA_PATH, "api.json")
    api_lock = os.path.join(PATHS.DATA_PATH, "api.lock")
    # refresh the remote API index every 10 days
    if not os.path.exists(api_lock):
        with open(api_lock, "w") as f:
            f.write(str(time.time()))
    with open(api_lock) as f:
        last_time = float(f.read())
    logger.debug("api last time:{}".format(last_time))
    if time.time() - last_time > 60 * 60 * 24 * 10:
        with open(api_lock, "w") as f:
            f.write(str(time.time()))
        logger.info("update airbug api...")
        _middle = "/master"
        _suffix = "/API.json"
        _profix = WEB_REPOSITORY.replace("github.com", "raw.githubusercontent.com")
        _api = _profix + _middle + _suffix
        r = requests.get(_api)
        datas = json.loads(r.text)
        for data in datas:
            data["webfile"] = _profix + _middle + data["filepath"]
        with open(filename, "w") as f:
            json.dump(datas, f)
    with open(filename) as f:
        datas = json.load(f)
    return datas
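# Usage sketch (not from the original source). Iterates the cached POC index
# returned by load_remote_poc(); the "name"/"type"/"time" keys mirror how the
# domain handlers in this listing read each entry, "webfile" is filled in above.
for poc in load_remote_poc():
    logger.debug("poc {0} ({1}) -> {2}".format(poc.get("name"), poc.get("type"), poc["webfile"]))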
def update_conf(path, section, option, value):
    logger.debug("Update tentacle config: [%s][%s] => %s" % (section, option, value))
    cf = configparser.ConfigParser()
    # load the existing config first so other sections/options are preserved
    cf.read(path)
    cf.set(section, option, value)
    with open(path, 'w+') as configfile:
        cf.write(configfile)
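# Usage sketch (not from the original source; the config path is hypothetical):
# raise the basic thread count in an existing tentacle config file.
update_conf("conf/tentacle.conf", "basic", "thread_num", "200")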
def __del__(self):
    '''
    Tear down: commit pending changes and close the connection.
    '''
    self.conn.commit()
    self.conn.close()
    logger.debug('destroy database object')
async def go_request(req_list, source):
    async with ClientSession() as session:
        for req in req_list:
            url = req['url']
            method = req['method']
            headers = req['headers']
            logger.debug("Curling %s..." % (url))
            proxy = conf['config']['crawlergo']['http_proxy']
            username = conf['config']['crawlergo']['username']
            password = conf['config']['crawlergo']['password']
            if username.strip() != '' and password.strip() != '':
                proxy_auth = BasicAuth(username, password)
            else:
                proxy_auth = None
            try:
                logger.debug("Xray scan {}, from url {} ".format(url, source))
                async with session.request(method, url=url, headers=headers,
                                           proxy=proxy, proxy_auth=proxy_auth) as res:
                    pass
            except:
                pass
def nmapscan(host, ports):
    # takes the open ports reported by masscan as input
    # safe to call from multiple threads
    nm = nmap.PortScanner()
    argument = "-sV -sS -Pn --host-timeout 1m -p{}".format(','.join(ports))
    try:
        ret = nm.scan(host, arguments=argument)
    except nmap.PortScannerError:
        logger.debug("Nmap PortScannerError host:{}".format(host))
        return None
    except:
        return None
    # debug
    elapsed = ret["nmap"]["scanstats"]["elapsed"]
    command_line = ret["nmap"]["command_line"]
    logger.debug("[nmap] succeeded, elapsed:%s command_line:%s" % (elapsed, command_line))
    if host in ret["scan"]:
        try:
            result = ret["scan"][host]["tcp"]
        except KeyError:
            return None
        return result
    return None
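# Usage sketch (not from the original source; host and port list are hypothetical).
# nmapscan() returns python-nmap's per-port TCP dict (keys such as "name"/"product"
# come from nmap's service detection), or None on failure.
services = nmapscan("192.0.2.10", ["80", "443"])
if services:
    for port, info in services.items():
        print(port, info.get("name"), info.get("product"))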
async def run(self):
    async with aiohttp.ClientSession() as session:
        flag = await self.check_engine_available(session, self.engine)
        if not flag:
            logger.error("{engine_name} is not available, skipping!".format(
                engine_name=self.engine_name))
            return
        logger.debug("{engine_name} is available, starting!".format(
            engine_name=self.engine_name))
        data = {'inputurl': self.target}
        content = await self.get(session, self.base_url, method="POST", data=data,
                                 headers=self.headers, timeout=self.timeout,
                                 proxy=self.proxy)
        ret = self.check_response_errors(content)
        if not ret[0]:
            self.deal_with_errors(ret[1])
        self.extract(content)
        logger.sysinfo("{engine} Found {num} sites".format(
            engine=self.engine_name, num=len(self.results['subdomain'])))
        logger.debug(self.engine_name + " " + str(len(self.results['subdomain'])))
def init_conf(path):
    logger.debug("Init tentacle config...")
    configs = {
        "basic": {
            "thread_num": "100",
            "looptimer": str(12 * 60 * 60),
            "timeout": "5",
            "user_agent": '\n'.join([
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
                'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.0.16 (.NET CLR 3.5.30729)',
                'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7',
                'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; de-de) AppleWebKit/531.22.7 (KHTML, like Gecko) Version/4.0.5 Safari/531.22.7',
                'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
                'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.1b4) Gecko/20090423 Firefox/3.5b4 (.NET CLR 3.5.30729)',
                'Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
                'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.14) Gecko/2009082505 Red Hat/3.0.14-1.el5_4 Firefox/3.0.14',
                'Mozilla/5.0 (X11; U; Linux i686; tr-TR; rv:1.9.0.10) Gecko/2009042523 Ubuntu/9.04 (jaunty) Firefox/3.0.10',
                'Opera/9.80 (Macintosh; Intel Mac OS X; U; nl) Presto/2.6.30 Version/10.61',
            ])
        },
        "smtp": {
            "mail_host": "smtp.163.com",
            "mail_port": str(465),
            "mail_user": "******",
            "mail_pass": "******",
            "sender": "*****@*****.**",
            "receivers": "[email protected],[email protected]",
        },
        "proxy": {
            "proxy": False,
            "http_proxy": "http://127.0.0.1:1080",
            "https_proxy": "https://127.0.0.1:1080"
        },
        "google_api": {
            "developer_key": "developer_key",
            "search_enging": "search_enging"
        },
        # the API sections below are left for later
        # "zoomeye_api": {
        #     "username": "******",
        #     "password": "******"
        # },
        # "fofa_api": {
        #     "email": "*****@*****.**",
        #     "token": "*****@*****.**"
        # },
        # "shodan_api": {
        #     "token": "token@tentacle"
        # },
        # "github_api": {
        #     "token": "token@tentacle",
        # },
    }
    cf = configparser.ConfigParser()
    for section in configs.keys():
        cf[section] = configs[section]
    with open(path, 'w+') as configfile:
        cf.write(configfile)
    sys.exit(logger.error("Please set the tentacle config in submon.conf..."))
def extract(self, content):
    pattern = re.compile(
        '<a href="javascript:" onclick="window.open.*?" target="_blank">(.*?{domain})</a>'
        .format(domain=self.target))
    next_page = "下一页"  # the engine's "next page" link text
    try:
        links = pattern.findall(content)
        for link in links:
            if not link.startswith('http://') and not link.startswith('https://'):
                link = "http://" + link
            subdomain = parse.urlparse(link).netloc
            if subdomain != self.target and subdomain.endswith(self.target):
                if subdomain not in self.results['subdomain']:
                    logger.debug("{engine} Found {subdomain}".format(
                        engine=self.engine_name, subdomain=subdomain))
                    self.results['subdomain'].append(subdomain)
    except Exception:
        pass
    if next_page in content:
        # tell the engine there is still a next page
        return True
    else:
        return False
def hand_ip(self, serviceTypes, option='masscan'):
    ip_list = []
    for item in serviceTypes:
        ip_list.append(item["target"])
    ports = MASSCAN_DEFAULT_PORT
    result2 = {}
    if option == 'masscan':
        if MASSCAN_FULL_SCAN:
            ports = "1-65535"
        target = os.path.join(PATHS.OUTPUT_PATH, "target_{0}.log".format(time.time()))
        with open(target, "w+") as fp:
            fp.write('\n'.join(ip_list))
        logger.debug("ip:" + repr(ip_list))
        try:
            result = masscan(target, ports)
        except Exception as e:
            logger.error("masscan error msg:{}".format(repr(e)))
            result = None
        if result is None:
            return None
        # format: {'115.159.39.75': ['80'], '115.159.39.215': ['80', '3306'], ...}
        for host, ports in result.items():
            ports = list(ports)
            if host not in result2:
                result2[host] = []
            task_update("running", 1)
            try:
                result_nmap = nmapscan(host, ports)
            except:
                result_nmap = None
            task_update("running", -1)
            if result_nmap is None:
                for tmp_port in ports:
                    result2[host].append({"port": tmp_port})
                continue
            tmp_r = self.nmap_result_handle(result_nmap, host=host)
            result2.update(tmp_r)
    elif option == "nmap":
        logger.debug("ip:" + repr(ip_list))
        for host in ip_list:
            result_nmap = nmapscan(host, ports.split(","))
            tmp_r = self.nmap_result_handle(result_nmap, host=host)
            if tmp_r:
                result2.update(tmp_r)
    data = {}
    for ip in result2.keys():
        if ip not in data:
            data[ip] = {}
        d = ip_location.poc(ip)
        if d:
            data[ip]["location"] = d
        data[ip]["infos"] = result2[ip]
    collector.add_ips(data)
    for ip in result2.keys():
        collector.send_ok_ip(ip)
async def run(self):
    async with aiohttp.ClientSession() as session:
        flag = await self.check_engine_available(session, self.engine)
        if not flag:
            logger.error("{engine_name} is not available, skipping!".format(
                engine_name=self.engine_name))
            return
        logger.debug("{engine_name} is available, starting!".format(
            engine_name=self.engine_name))
        data = {'inputurl': self.target}
        async with session.post(self.base_url, proxy=self.proxy, data=data) as res:
            if res != None:
                try:
                    content = await res.text()
                except:
                    content = ""
                ret = self.check_response_errors(content)
                if not ret[0]:
                    self.deal_with_errors(ret[1])
                self.extract(content)
        logger.sysinfo("{engine} Found {num} sites".format(
            engine=self.engine_name, num=len(self.results['subdomain'])))
        logger.debug(self.engine_name + " " + str(len(self.results['subdomain'])))
def load_conf(path):
    logger.debug("Load tentacle config...")
    cf = configparser.ConfigParser()
    cf.read(path)
    sections = cf.sections()
    configs = {}
    for section in sections:
        logger.debug("Load config: %s" % (section))
        config = {}
        for option in cf.options(section):
            config[option] = cf.get(section, option)
        configs[section] = config
    conf['config'] = configs
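# Usage sketch (not from the original source; the path is hypothetical). load_conf()
# publishes the parsed options on the shared conf dict, keyed by section and option.
load_conf("conf/tentacle.conf")
timeout = int(conf['config']['basic']['timeout'])
logger.debug("configured timeout: {}s".format(timeout))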
def insert(self, html, url):
    '''
    Required parameters
    :param html: the fetched page content
    :param url: the url of the fetched page
    :param deep: the depth of the page
    :return:
    '''
    in_sql = "INSERT INTO spider VALUES (null, ?, ?, ?, ?)"
    # skip the page when a keyword is configured but does not appear in it
    if conf['key'] and conf['key'] not in html:
        logger.debug("the address: " + url + " does not contain the key: " + conf['key'])
        return None
    keyword = (conf['key'] if conf['key'] else '')
    self.cur.execute(in_sql, (html, url, self.deep, keyword))
    logger.debug("INSERT suc and id is: " + str(self.cur.lastrowid))
    self.conn.commit()
async def get_title(req_list):
    ret = []
    async with ClientSession() as session:
        for subdomain in req_list:
            try:
                logger.debug("Curling %s..." % (subdomain))
                flag = False
                for pro in ['http://', "https://"]:
                    url = pro + subdomain + '/'
                    async with session.get(url=url) as response:
                        if response != None:
                            try:
                                res = await response.read()
                            except:
                                res = ""
                            status = response.status
                            try:
                                res = str(res, 'utf-8')
                            except UnicodeDecodeError:
                                res = str(res, 'gbk')
                            except:
                                res = "网页编码错误"  # "page encoding error"
                            m = re.search('<title>(.*)</title>', res.lower())
                            if m != None and m.group(1):
                                title = m.group(1)
                            else:
                                title = '网页没有标题'  # "page has no title"
                            try:
                                length = int(response.headers['content-length'])
                            except:
                                length = len(str(response.headers)) + len(res)
                            ret.append([subdomain, url, title, status, length])
                            flag = True
                            break
                if not flag:
                    ret.append([subdomain, "", "", 0, 0])
            except Exception as e:
                logger.error(str(e))
    return ret
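# Usage sketch (not from the original source; the subdomain list is hypothetical).
# get_title() probes each host over http/https and returns rows of
# [subdomain, url, title, status, length].
import asyncio

rows = asyncio.run(get_title(["www.example.com", "mail.example.com"]))
for subdomain, url, title, status, length in rows:
    print(subdomain, url, title, status, length)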
def execute(self, request: Request, response: Response):
    self.target = ''
    self.requests = request
    self.response = response
    output = None
    try:
        output = self.audit()
    except NotImplementedError:
        logger.error('Plugin: {0} not defined "{1}" mode'.format(self.name, 'audit'))
    except ConnectTimeout:
        retry = RETRY
        while retry > 0:
            logger.debug('Plugin: {0} timeout, start it over.'.format(self.name))
            try:
                output = self.audit()
                break
            except ConnectTimeout:
                logger.debug('POC: {0} time-out retry failed!'.format(self.name))
                retry -= 1
        else:
            # the while/else branch runs only when every retry was exhausted
            msg = "connect target '{0}' failed!".format(self.target)
            logger.error(msg)
    except HTTPError as e:
        logger.warning('Plugin: {0} HTTPError occurs, start it over.'.format(self.name))
    except ConnectionError as e:
        msg = "connect target '{0}' failed!".format(self.target)
        logger.error(msg)
    except TooManyRedirects as e:
        logger.error(str(e))
    except Exception as e:
        logger.error(str(e))
    return output
def __init__(self, dbfile):
    '''
    Initialise the database handle.
    :param dbfile: sqlite3 database file
    '''
    logger.debug('init database')
    self.deep = 0
    dbfile = "db/" + dbfile
    self.conn = sqlite3.connect(dbfile, check_same_thread=False)
    self.cur = self.conn.cursor()
    check_table_sql = "select count(*) from sqlite_master where type='table' and name='spider'"
    self.cur.execute(check_table_sql)
    if self.cur.fetchone()[0] == 0:
        logger.debug('create a spider table')
        self.creab_table()
    else:
        logger.debug('spider table already exist')
def oparser():
    '''
    Set up and parse the command line options.
    '''
    parser = OptionParser()
    parser.version = "B0.8"
    parser.add_option("--version", "-v", dest="showVersion", action="store_true",
                      help="Show program's version number and exit")
    # mandatory options
    target = OptionGroup(parser, "Target", "At least one of these "
                         "options has to be provided to define the target(s)")
    target.add_option("-u", dest="url", help="Target URL")
    target.add_option("-d", dest="deep", type="int", help="spider the depth")
    target.add_option("--testself", dest="test", action="store_true", help="auto test")
    # optional options
    opt = OptionGroup(parser, "Options", "Optional parameters")
    opt.add_option("-f", dest="logfile", help="The custom log file path")
    opt.add_option("--key", dest="key", help="Page keywords")
    opt.add_option("-l", dest="loglevel", type="int",
                   help="log level(1-5) "
                        "1, CRITICAL; "
                        "2, ERROR(default); "
                        "3, WARN; "
                        "4, INFO; "
                        "5, DEBUG;")
    opt.add_option("--thread", dest="thread", type="int", help="thread number(default 10)")
    opt.add_option("--dbfile", dest="dbfile", help="set sqlite database file")
    parser.add_option_group(target)
    parser.add_option_group(opt)
    (args, _) = parser.parse_args(sys.argv)

    if args.showVersion:
        print parser.version
        print "-- By Hcamael"
        exit(0)
    if not (args.url or args.test):
        errMsg = "missing a mandatory option (-u) or (--testself), "
        errMsg += "use -h for basic or -hh for advanced help"
        parser.error(errMsg)

    # initialise the runtime configuration from the parsed options
    conf['url'] = (args.url if args.url else "http://sina.com.cn")
    name = re.findall("[\w\.-]+", conf['url'])
    try:
        conf['name'] = (name[1] if len(name) == 2 else name[0])
    except IndexError:
        errMsg = "url input error!"
        logger.error("url matching fail!")
        parser.error(errMsg)

    conf['deep'] = (args.deep if args.deep else 2)
    if conf['deep'] > 50:
        # the upper bound is 50
        errMsg = "The deep is too large(0 <= deep <= 50)"
        parser.error(errMsg)
    if conf['deep'] < 0:
        # negative depth is not allowed
        errMsg = "The deep input error(0 < deep <= 50)"
        parser.error(errMsg)

    conf['test'] = args.test
    conf['key'] = args.key

    conf['loglevel'] = (args.loglevel if args.loglevel else 2)
    if conf['loglevel'] < 1 or conf['loglevel'] > 5:
        # loglevel: 1-5
        errMsg = "loglevel value error(input 1-5)"
        parser.error(errMsg)
    if conf['loglevel'] == 1:
        loglevel = logging.CRITICAL
    elif conf['loglevel'] == 2:
        loglevel = logging.ERROR
    elif conf['loglevel'] == 3:
        loglevel = logging.WARN
    elif conf['loglevel'] == 4:
        loglevel = logging.INFO
    elif conf['loglevel'] == 5:
        loglevel = logging.DEBUG
    else:
        loglevel = logging.ERROR

    conf['logfile'] = (os.path.basename(args.logfile) if args.logfile
                       else 'spider.log' if args.test
                       else conf['name'] + ".log")
    f = open("log/" + conf['logfile'], 'a+')
    Log_Handle = logging.StreamHandler(f)
    # Log_Handle = logging.StreamHandler(sys.stdout)
    FORMATTER = logging.Formatter("\r[%(asctime)s] [%(levelname)s] [%(thread)d] %(message)s", "%H:%M:%S")
    Log_Handle.setFormatter(FORMATTER)
    logger.addHandler(Log_Handle)
    logger.setLevel(loglevel)

    conf['dbfile'] = (os.path.basename(args.dbfile) if args.dbfile else conf['name'] + ".db")

    conf['thread'] = (args.thread if args.thread else 10)
    if conf['thread'] < 0 or conf['thread'] > 50:
        # thread = 0 disables multithreading; the maximum is 50
        errMsg = "thread value error (0-50, 0 means not use thread)"
        parser.error(errMsg)

    logger.debug('parsing command line suc')
def hand_domain(self, serviceType):
    target = serviceType["target"]
    logger.info(target)
    # record this domain
    collector.add_domain(target)
    # send the initial request
    try:
        r = requests.get(target, timeout=30, verify=False, allow_redirects=False)
        collector.add_domain_info(target, {
            "headers": r.headers,
            "body": r.text,
            "status_code": r.status_code
        })
    except Exception as e:
        logger.error("request url error:" + str(e))
        collector.del_domain(target)
        return
    logger.debug("target:{} over,start to scan".format(target))
    # Get hostname
    hostname = urlparse(target).netloc.split(":")[0]
    if not is_ip_address_format(hostname):
        try:
            # resolve the host via DNS
            _ip = socket.gethostbyname(hostname)
            collector.add_domain_info(target, {"ip": _ip})
        except:
            pass
    else:
        collector.add_domain_info(target, {"ip": hostname})
    # POCs to run against the target for information gathering
    work_list = [webeye.poc, webtitle.poc, wappalyzer.poc]
    # password_found.poc
    if IS_START_PLUGINS:
        pass
        work_list.append(crossdomain.poc)
        # work_list.append(directory_browse.poc)
        work_list.append(gitleak.poc)
        work_list.append(iis_parse.poc)
        work_list.append(phpinfo.poc)
        work_list.append(svnleak.poc)
        work_list.append(tomcat_leak.poc)
        # work_list.append(whatcms.poc)
    # each POC stores its findings through the collector
    for func in work_list:
        try:
            func(target)
        except Exception as e:
            logger.error("domain plugin threading error {}:{}".format(
                repr(Exception), str(e)))
    logger.debug("target:{} End of scan".format(target))
    collector.print_domains()
    infos = collector.get_domain(target)
    _pocs = []
    temp = {}
    if IS_START_PLUGINS and "CMS" in infos:
        if infos.get("app"):
            temp["app"] = []
            temp["app"].append(infos["CMS"])
        else:
            temp["app"] = [infos["CMS"]]
        # update domain app
        collector.add_domain_info(target, temp)
    if temp.get("app"):
        keywords = temp["app"]
        # fetch the remote plugin index
        pocs = load_remote_poc()
        for poc in pocs:
            for keyword in keywords:
                webfile = poc["webfile"]
                logger.debug("load {0} poc:{1} poc_time:{2}".format(
                    poc["type"], webfile, poc["time"]))
                # download the remote plugin source and load it as a module object
                code = requests.get(webfile).text
                obj = load_string_to_moudle(code, webfile)
                # add the remote module to the list of modules to run
                _pocs.append(obj)
    # run the plugins concurrently
    if _pocs:
        executor = futures.ThreadPoolExecutor(len(_pocs))
        fs = []
        for f in _pocs:
            # submit() returns a Future handle for each plugin run
            taks = executor.submit(f.poc, target)
            fs.append(taks)
        for f in futures.as_completed(fs):
            try:
                res = f.result()
            except Exception as e:
                res = None
                logger.error("load poc error:{} error:{}".format(
                    target, str(e)))
            if res:
                name = res.get("name") or "scan_" + str(time.time())
                collector.add_domain_bug(target, {name: res})
    # the plugin results were reported back through the collector
    collector.send_ok(target)
    print("print collector")
    print(collector.collect_domains)
def _work(self):
    while True:
        self.load_lock.acquire()
        if len(self.targets) > 0 and self.is_continue:
            subdomain = self.targets.popleft()
            self.load_lock.release()
            try:
                logger.debug("Curling %s..." % (subdomain))
                flag = False
                codes = ['utf-8', 'gbk']
                for pro in ['http://', "https://"]:
                    url = pro + subdomain + '/'
                    headers = {
                        'Connection': 'keep-alive',
                        'Pragma': 'no-cache',
                        'Cache-Control': 'no-cache',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'DNT': '1',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
                    }
                    res = self._curl(url, headers=headers)
                    if res != None:
                        try:
                            length = int(res.headers['content-length'])
                        except:
                            length = len(str(res.headers)) + len(res.text)
                        soup = BeautifulSoup(res.text, "html5lib")
                        if soup != None:
                            title = soup.title
                            if title == None or title.string == None or title.string == '':
                                title = "网页没有标题".encode('utf-8')
                            else:
                                if res.encoding != None:
                                    title = title.string.encode(res.encoding)
                                    codes.append(res.encoding)
                                else:
                                    title = title.string
                                    codes.append(self.type_code)
                            # try the candidate encodings until the title decodes cleanly
                            for j in range(0, len(codes)):
                                try:
                                    title = title.decode(codes[j]).strip().replace("\r", "").replace("\n", "")
                                    break
                                except:
                                    continue
                                finally:
                                    if j + 1 == len(codes):
                                        title = '网页标题编码错误'
                            self.ret.append([subdomain, url, title, res.status_code, length])
                            flag = True
                            break
                        else:
                            title = '网页没有标题'
                            self.ret.append([subdomain, url, title, res.status_code, length])
                if not flag:
                    self.ret.append([subdomain, "", "", 0, 0])
            except Exception:
                self.errmsg = traceback.format_exc()
                self.is_continue = False
                logger.error(self.errmsg)
        else:
            self.load_lock.release()
            break
class SpiderControl:
    '''
    Spider controller class
    '''

    def __init__(self):
        '''
        Initialise:
        self.url      root url
        self.deep     crawl depth
        self.db       database handler
        self._thread  thread pool
        '''
        logger.info('init control class')
        self.url = conf['url']
        self.deep = conf['deep']
        self.db = operate['db']
        self._thread = ThreadPool(conf['thread'], self.get_html)

    def run(self):
        '''
        Main entry point.
        :return: None
        '''
        logger.info("start spider, and the spider deep is " + str(self.deep))
        self.url_group = []
        self.r_group = []
        self.recursion_deep()
        logger.info("The spider page total number is : " + str(len(self.url_group)))
        self._thread._del()
        logger.info("Spider OVER!!")

    def recursion_deep(self):
        '''
        Crawl level by level according to the depth value.
        operate['db'].deep   current depth
        self.deep            depth to crawl to
        :return:
        '''
        if operate['db'].deep == 0:
            logger.info("spidering deep == 0 page")
            r = self.get_html(self.url)
            try:
                html = r['html']
            except:
                print "url input error!"
                logger.error("url error(%s)" % (self.url))
                return
            operate['db'].insert(html, self.url)
            self.r_group.append(r)
            operate['db'].deep += 1
            self.recursion_deep()
        elif operate['db'].deep > self.deep:
            logger.info('spider deep over!')
            return
        else:
            logger.info("spidering deep = %s" % operate['db'].deep)
            tmp = []
            url_group = []
            # extract urls from the pages fetched at the previous depth
            for x in self.r_group:
                html = x['html']
                url_group.extend(self.find_url(html))
                logger.debug("from %s page find %s url" % (x['url'], len(url_group)))
            # stop when no url was matched on any page
            if url_group == []:
                return
            # hand the extracted urls to the thread pool
            result_list = self._thread.my_map(url_group)
            for y in xrange(len(result_list)):
                if result_list[y]['type'] == 'html':
                    tmp.append(result_list[y])
                else:
                    logger.debug("delete the not html page (%s)" % url_group[y])
            self.r_group = tmp
            operate['db'].deep += 1
            self.recursion_deep()

    def find_url(self, html):
        '''
        Use BeautifulSoup to extract the urls in a page.
        :param html: html page
        :return: a list of the urls found in the html
        PS: only urls in the href attribute of <a> tags are considered for now
        '''
        url_group = []
        logger.debug("start find url in a html")
        try:
            bs = BeautifulSoup(html, 'lxml')
        except Exception, e:
            logger.error("bs4(html) fail!\nthe error info is : " + str(e))
            return
        comp = re.compile("^https?://[/\w\.-]*/?[\w&\+%=-]*")
        for x in bs.findAll('a'):
            try:
                if comp.match(x['href']):
                    logger.debug("%s match suc" % x['href'])
                    if x['href'] not in self.url_group:
                        url_group.append(x['href'])
            except KeyError:
                logger.debug(str(x) + " | <match href fail>")
                continue
        logger.debug("find %s url" % (len(url_group)))
        self.url_group.extend(url_group)
        return url_group
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0" } result = {"type": None} logger.info("request a url: %s" % url) try: req = requests.get(url, headers=header, timeout=4) except Exception, e: try: logger.error("%s @@ requests fail and the info is %s" % (url.encode('utf-8'), e)) except: print url print isinstance(url, unicode) return result if 'text/html' in req.headers['Content-Type']: logger.debug("get a html page: " + url) result['type'] = 'html' result['html'] = req.text result['url'] = url elif 'text/javascript' in req.headers['Content-Type']: logger.debug("get a js page: " + url) result['type'] = 'js' result['html'] = req.text result['url'] = url else: logger.warn("the page is not a html or a js(" + url + ")") return result
def get_html(self, url):
    header = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"
    }
    result = {"type": None}
    logger.info("request a url: %s" % url)
    try:
        req = requests.get(url, headers=header, timeout=4)
    except Exception, e:
        try:
            logger.error("%s @@ requests fail and the info is %s" % (url.encode('utf-8'), e))
        except:
            print url
            print isinstance(url, unicode)
        return result
    if 'text/html' in req.headers['Content-Type']:
        logger.debug("get a html page: " + url)
        result['type'] = 'html'
        result['html'] = req.text
        result['url'] = url
    elif 'text/javascript' in req.headers['Content-Type']:
        logger.debug("get a js page: " + url)
        result['type'] = 'js'
        result['html'] = req.text
        result['url'] = url
    else:
        logger.warn("the page is not a html or a js(" + url + ")")
    return result
def replace_subdomain_status(self, subdomain, url, title, status, len, update_time, mon_domain):
    logger.debug("Replace subdomain: %s" % (subdomain))
    self.execute(
        "REPLACE into subdomain (subdomain, url, title, status, len, update_time, mon_domain) VALUES (?, ?, ?, ?, ?, ?, ?)",
        (subdomain, url, title, status, len, update_time, mon_domain))
def update_subdomain_status(self, subdomain, url, title, status, len, update_time):
    logger.debug("Update subdomain: %s" % (subdomain))
    self.execute(
        "UPDATE subdomain set url = ?, title = ?, status = ?, len = ?, update_time = ? WHERE subdomain = ? and ( title != ? or status != ?)",
        (url, title, status, len, update_time, subdomain, title, status))
def insert_subdomain(self, subdomain, url, title, status, len, update_time, mon_domain):
    logger.debug("Insert subdomain: %s" % (subdomain))
    self.execute(
        "INSERT OR IGNORE INTO subdomain (subdomain,url,title,status,len,update_time,mon_domain) VALUES (?, ?, ?, ?, ?, ?, ?)",
        (subdomain, url, title, status, len, update_time, mon_domain))
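# Usage sketch (not from the original source; the db object and the values are
# hypothetical). A subdomain is inserted once; later probes update its status in place.
now = time.strftime("%Y-%m-%d %H:%M:%S")
db.insert_subdomain("www.example.com", "http://www.example.com/", "Example", 200, 1256, now, "example.com")
db.update_subdomain_status("www.example.com", "http://www.example.com/", "Example Domain", 200, 1270, now)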
def hand_domain(self, serviceType):
    target = serviceType["target"]
    logger.info(target)
    # record this domain
    collector.add_domain(target)
    # send the initial request
    try:
        r = requests.get(target, timeout=30, verify=False, allow_redirects=False)
        collector.add_domain_info(target, {
            "headers": r.headers,
            "body": r.text,
            "status_code": r.status_code
        })
    except Exception as e:
        logger.error("request url error:" + str(e))
        collector.del_domain(target)
        return
    logger.debug("target:{} over,start to scan".format(target))
    # Get hostname
    hostname = urlparse(target).netloc.split(":")[0]
    if not is_ip_address_format(hostname):
        try:
            _ip = socket.gethostbyname(hostname)
            collector.add_domain_info(target, {"ip": _ip})
        except:
            pass
    else:
        collector.add_domain_info(target, {"ip": hostname})
    work_list = [
        webeye.poc, webtitle.poc, wappalyzer.poc, password_found.poc
    ]
    if IS_START_PLUGINS:
        work_list.append(crossdomain.poc)
        work_list.append(directory_browse.poc)
        work_list.append(gitleak.poc)
        work_list.append(iis_parse.poc)
        work_list.append(phpinfo.poc)
        work_list.append(svnleak.poc)
        work_list.append(tomcat_leak.poc)
        work_list.append(whatcms.poc)
        # WorkList.append(bakfile.poc)  # backup-file scanning removed: too time-consuming
    # th = []
    # try:
    #     for func in work_list:
    #         i = threading.Thread(target=func, args=(target,))
    #         i.start()
    #         th.append(i)
    #     for thi in th:
    #         thi.join()
    # except Exception as e:
    #     logger.error("domain plugin threading error {}:{}".format(repr(Exception), str(e)))
    for func in work_list:
        try:
            func(target)
        except Exception as e:
            logger.error("domain plugin threading error {}:{}".format(
                repr(Exception), str(e)))
    logger.debug("target:{} End of scan".format(target))
    infos = collector.get_domain(target)
    _pocs = []
    temp = {}
    if IS_START_PLUGINS and "CMS" in infos:
        if infos.get("app"):
            temp["app"] = []
            temp["app"].append(infos["CMS"])
        else:
            temp["app"] = [infos["CMS"]]
        # update domain app
        collector.add_domain_info(target, temp)
    if temp.get("app"):
        keywords = temp["app"]
        # fetch the remote plugin index
        pocs = load_remote_poc()
        for poc in pocs:
            for keyword in keywords:
                if poc["name"] == keyword:
                    webfile = poc["webfile"]
                    logger.debug("load {0} poc:{1} poc_time:{2}".format(
                        poc["type"], webfile, poc["time"]))
                    # download the plugin source and load it as a module
                    code = requests.get(webfile).text
                    obj = load_string_to_module(code, webfile)
                    _pocs.append(obj)
    # run the plugins concurrently
    if _pocs:
        executor = futures.ThreadPoolExecutor(len(_pocs))
        fs = []
        for f in _pocs:
            taks = executor.submit(f.poc, target)
            fs.append(taks)
        for f in futures.as_completed(fs):
            try:
                res = f.result()
            except Exception as e:
                res = None
                logger.error("load poc error:{} error:{}".format(
                    target, str(e)))
            if res:
                name = res.get("name") or "scan_" + str(time.time())
                collector.add_domain_bug(target, {name: res})
    collector.send_ok(target)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lib.data import conf
from lib import control
from lib.data import logger
from lib.data import operate
from lib.options import oparser
from lib.database import SpiderDb

__author__ = "Hcamael"

if __name__ == '__main__':
    logger.debug("Begin Spider")
    oparser()
    operate['db'] = SpiderDb(conf['dbfile'])
    c = control.SpiderControl()
    c.run()