def judgeUrlCount(self):
    if self.max_url_num == 0:
        return
    if self.urlcount < self.max_url_num:
        self.urlcount += 1
    else:
        self.stop = True
        DEBUG("now urlcount:%s,kill pool start" % self.urlcount)
        self.pool.kill()
        DEBUG("kill pool end")
def run(self):
    DEBUG("ScanEngine start")
    joinall([
        #spawn(self.scheduleDomain),
        spawn(self.scheduleUrl),
        spawn(self.scheduleDomain)
    ])
    self.pool.join()
    self.update_progress('END')
    DEBUG("ScanEngine end")
@classmethod
def start(cls):
    url = conf.url
    basepath = conf.base
    concurrency = 10
    depth = conf.depth
    urlcount = conf.count
    duplicates = 1
    assert url
    urls = list(url) if isinstance(url, (list, tuple)) else [url]
    schedule = Schedule(urls, concurrency, depth, urlcount, duplicates, basepath)
    DEBUG('CrawlEngine start')
    schedule.doSchedule()
    DEBUG('CrawlEngine end')
def doSchedule(self):
    DEBUG("Schedule start")
    self.task.update_spider_flag('start')
    # Keep dispatching pending requests until the crawl is stopped or both the
    # greenlet pool and the pending queue have drained.
    while not self.stop and (len(self.pool) > 0 or not self.pendings.empty()):
        try:
            request = self.pendings.get(block=False)
        except queue.Empty:
            gevent.sleep(0)
        else:
            self.pool.spawn(Spider.start, request, self)
    self.task.update_spider_flag('finish')
    code = (self.stop, self.urlcount, len(self.pool), self.pendings.qsize())
    DEBUG("Schedule end,stop:%s,now urlcount:%s,pool size:%s,pendings size:%s" % code)
def run_url(req, rule):
    def _contains(content, chars):
        # Drop escaped occurrences first so only unescaped characters count.
        content = re.sub(r"\\[%s]" % "".join(chars), "", content, flags=re.S) if chars else content
        return all(char in content for char in chars)

    details = []
    response = None
    params = req.params
    for match in PARAMS_PATTERN.finditer(params):
        found = False
        # Random prefix/suffix make the reflected probe easy to locate in the response.
        prefix, suffix = ["".join(random.sample(string.ascii_lowercase, PREFIX_SUFFIX_LENGTH))
                          for _ in xrange(2)]
        for pool in (LARGER_CHAR_POOL, SMALLER_CHAR_POOL):
            if not found:
                # Append the probe (prefix + shuffled char pool + suffix) to the parameter value.
                tampered = params.replace(
                    match.group('value'),
                    "%s%s%s%s" % (match.group('value'), prefix,
                                  "".join(random.sample(pool, len(pool))), suffix))
                res = requestUrl(req, tampered)
                if not res:
                    continue
                content = res.text
                for sample in re.finditer("%s(.+?)%s" % (prefix, suffix), content, re.I | re.S):
                    for regex, condition, info in XSS_PATTERNS:
                        context = re.search(
                            regex % dict((("chars", reduce(
                                lambda filtered, char: filtered.replace(char, "\\%s" % char),
                                REGEX_SPECIAL_CHARS, sample.group(0))),)),
                            content, re.I | re.S)
                        if context and not found and sample.group(1).strip():
                            # The reflected sample still contains the characters this
                            # output context requires, so the parameter looks injectable.
                            if _contains(sample.group(1), condition):
                                msg = info % dict((("filtering", "no" if all(
                                    char in sample.group(1)
                                    for char in LARGER_CHAR_POOL) else "some"),))
                                DEBUG(msg)
                                found = True
                                if response is None:
                                    response = res
                                details.append(u"vulnerable parameter: %s" % match.group('key'))
                                break
    if response is not None:
        return Result(response, details)
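# --- Illustrative sketch, not part of the scanner ---
# The core idea behind run_url above: append a pool of XSS-relevant characters,
# wrapped in a random prefix/suffix, to a parameter value, resend the request,
# and check which characters come back unfiltered between the two markers.
# The names below (build_probe, surviving_chars, PROBE_POOL) are hypothetical.
import random
import re
import string

PROBE_POOL = "<>'\"();"  # assumed character pool for this sketch only

def build_probe(value, pool=PROBE_POOL, length=5):
    """Return (tampered_value, prefix, suffix) for a reflection probe."""
    prefix = "".join(random.sample(string.ascii_lowercase, length))
    suffix = "".join(random.sample(string.ascii_lowercase, length))
    tampered = "%s%s%s%s" % (value, prefix, "".join(random.sample(pool, len(pool))), suffix)
    return tampered, prefix, suffix

def surviving_chars(body, prefix, suffix, pool=PROBE_POOL):
    """Return the probe characters reflected unmodified between prefix and suffix."""
    sample = re.search("%s(.+?)%s" % (prefix, suffix), body, re.I | re.S)
    return set(c for c in pool if c in sample.group(1)) if sample else set()
# --- end sketch ---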
def scheduleUrl(self):
    """
    For rules with run_type == 1, the rule script must define a run_url function.
    """
    DEBUG("scheduleUrl start")
    sql = "SELECT `rule_id`,`risk`,`file_name` FROM `%s` WHERE `run_type` = 1 ORDER BY `priority`" % RULE_TABLE
    # Skip rules whose progress has already been recorded as finished.
    rules = [(str(rule.rule_id), rule.file_name, rule.risk)
             for rule in db.iter(sql)
             if str(rule.rule_id) not in self.finished_progress]
    if not conf.spider_finish:
        # Spider not finished yet, so run the crawler first.
        CrawlEngine.start()
    sql = "SELECT `url`,`method`,`params`,`referer` FROM %s WHERE `task_id`=%s" % (
        URL_TABLE, self.task_id)
    reqs = [Url(url.url, url.method, url.params, url.referer) for url in db.iter(sql)]
    for rule_id, filename, risk in rules:
        run_url = attr_from_script(filename, RUN_URL_DEFAULT_FUN)
        if run_url:
            DEBUG("rule_id:%s filename:%s run_url start" % (rule_id, filename))
            for req in reqs:
                self.pool.spawn(self.runUrl, rule_id, run_url, req, filename, risk)
                gevent.sleep(0)
            DEBUG("rule_id:%s filename:%s run_url end" % (rule_id, filename))
    DEBUG("scheduleUrl end")
def scheduleDomain(self):
    """
    For rules with run_type == 2, the rule script must define a run_domain function.
    """
    DEBUG("scheduleDomain start")
    sql = "SELECT `rule_id`,`risk`,`file_name` FROM `%s` WHERE `run_type` = 2 ORDER BY `priority`" % RULE_TABLE
    # Skip rules whose progress has already been recorded as finished.
    domainRule = [(str(rule.rule_id), rule.file_name, rule.risk)
                  for rule in db.iter(sql)
                  if str(rule.rule_id) not in self.finished_progress]
    for rule_id, filename, risk in domainRule:
        run_domain = attr_from_script(filename, RUN_DOMAIN_DEFAULT_FUN)
        if run_domain:
            DEBUG("rule_id:%s filename:%s run_domain start" % (rule_id, filename))
            self.pool.spawn(self.runDomain, rule_id, run_domain, filename, risk)
            gevent.sleep(0)
            DEBUG("rule_id:%s filename:%s run_domain end" % (rule_id, filename))
    DEBUG("scheduleDomain end")
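# Illustrative sketch (an assumption, not the project's actual implementation):
# attr_from_script presumably loads a rule script by file name and returns the
# function named RUN_URL_DEFAULT_FUN / RUN_DOMAIN_DEFAULT_FUN if the script
# defines it. The script_dir default below is hypothetical.
import imp
import os

def attr_from_script_sketch(filename, funcname, script_dir="rules"):
    """Load `filename` from script_dir and return its `funcname` attribute, or None."""
    path = os.path.join(script_dir, filename)
    try:
        module = imp.load_source(os.path.splitext(filename)[0], path)
    except (IOError, SyntaxError):
        return None
    func = getattr(module, funcname, None)
    return func if callable(func) else None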
def init_request(self):
    urls = self.task.get_exist_url()
    self.urlcount += len(urls)
    for url in urls:
        request = Request(url.url, url.method, url.params, url.referer)
        if self.visited[request] < self.duplicates:
            # Only re-queue URLs that were not discarded and never finished crawling.
            if not discard(request.url) and not url.end_time:
                request.id = url.id
                self.pendings.put(request)
                DEBUG("-----request:%s not crawled,add to queue" % request)
            self.visited[request] += 1
        else:
            #DEBUG("duplicates url:%s" % request)
            pass
    return self.urlcount
def __call__(self, parser, namespace, values, option_string=None):
    # When an argument uses this Action, argparse invokes __call__ automatically,
    # which in turn calls the DEBUG function.
    setattr(namespace, self.dest, True)
    DEBUG()
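# Illustrative usage sketch (the flag name and parser wiring are assumptions):
# a custom argparse.Action like the __call__ above is bound to an option with
# nargs=0, so merely passing the flag sets the destination and enables debug output.
import argparse

class DebugAction(argparse.Action):
    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        super(DebugAction, self).__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, True)

parser = argparse.ArgumentParser()
parser.add_argument("--debug", action=DebugAction, default=False,
                    help="enable debug output")
print(parser.parse_args(["--debug"]).debug)  # True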