def __init__(self, db_file, num_threads, request_types, process_timeout, scanner_exe, display_progress, scanner_argv):
    self.scan_start_time = int(time.time())
    self.threads = []
    self._th_lock = threading.Lock()
    self._th_lock_db = threading.Lock()
    self.performed_requests = 0
    self._urlpatterns = []
    self._exitcode = 0
    self.scanner_name = self.__class__.__name__.lower()
    self._running = False
    self.settings = self.get_settings()

    # override default settings
    if num_threads:
        self.settings['num_threads'] = num_threads
    if request_types:
        self.settings['request_types'] = request_types
    if process_timeout:
        self.settings['process_timeout'] = process_timeout
    if scanner_exe:
        self.settings['scanner_exe'] = scanner_exe

    self.settings['scanner_exe'] = self.settings['scanner_exe'].split(" ")

    self.db = Database(db_file)
    self.id_assessment = self.db.create_assessment(self.scanner_name, int(time.time()))
    self.pending_requests = self.db.get_requests(self.settings['request_types'])
    self.tot_requests = len(self.pending_requests)

    # record the db_id of every request whose URL pattern has already been seen
    self._duplicated_requests = []
    urlpatterns = []
    for req in self.pending_requests:
        patt = RequestPattern(req).pattern
        if patt in urlpatterns:
            self._duplicated_requests.append(req.db_id)
        else:
            urlpatterns.append(patt)

    init = self.init(scanner_argv if scanner_argv else [])

    self._running = True

    print("Scanner %s started with %d threads" % (self.scanner_name, self.settings['num_threads']))

    for n in range(0, self.settings['num_threads']):
        thread = self.Executor(self)
        self.threads.append(thread)
        thread.start()

    try:
        self.wait_executor(self.threads, display_progress)
    except KeyboardInterrupt:
        print("\nTerminated by user")
        self.kill_threads()

    self.save_assessment()
    sys.exit(self._exitcode)
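# A minimal, self-contained sketch of the URL-pattern deduplication loop above,
# assuming a "pattern" is roughly "METHOD host path sorted-query-keys". The real
# RequestPattern class is not shown here; pattern_of() and the inline request
# tuples are hypothetical stand-ins used only for illustration.
from urllib.parse import urlsplit, parse_qsl

def pattern_of(method, url):
    parts = urlsplit(url)
    keys = sorted(k for k, _ in parse_qsl(parts.query))
    return " ".join([method.upper(), parts.netloc, parts.path] + keys)

seen, duplicated = [], []
for db_id, (method, url) in enumerate([
        ("GET", "http://example.com/a?x=1&y=2"),
        ("GET", "http://example.com/a?y=9&x=8"),   # same keys, same pattern -> duplicate
        ("POST", "http://example.com/a?x=1")]):
    patt = pattern_of(method, url)
    if patt in seen:
        duplicated.append(db_id)
    else:
        seen.append(patt)

print(duplicated)  # [1]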
def __init__(self, db_file, num_threads, request_types, display_progress, scanner_argv, proxy, cookies, user_agent, extra_headers):
    self.scan_start_time = int(time.time())
    self.threads = []
    self.lock = threading.Lock()
    self._th_lock = threading.Lock()
    self._th_lock_db = threading.Lock()
    self._th_lock_stdout = threading.Lock()
    self.performed_requests = 0
    self._urlpatterns = []
    self._exitcode = 0
    self._commands = []
    self.scanner_name = self.__class__.__name__.lower()
    self._running = False
    self.settings = self.get_settings()
    # self._type = self.settings['scanner_type'] if 'scanner_type' in self.settings else "external"
    self.exit_requested = False
    self.pause_requested = False
    self._print_queue = {}
    self.display_progress = display_progress

    # override default settings
    if num_threads:
        self.settings['num_threads'] = num_threads
    if request_types:
        self.settings['request_types'] = request_types
    # if process_timeout: self.settings['process_timeout'] = process_timeout
    # if scanner_exe: self.settings['scanner_exe'] = scanner_exe
    # if self._type == "external":
    #     self.settings['scanner_exe'] = self.settings['scanner_exe'].split(" ")

    self._db = Database(db_file)
    self.id_assessment = self._db.create_assessment(self.scanner_name, int(time.time()))
    self.pending_requests = self._db.get_requests(self.settings['request_types'])
    self.tot_requests = len(self.pending_requests)
    self._duplicated_requests = []
    self.proxy = proxy
    self.cookies = cookies
    self.user_agent = user_agent
    self.extra_headers = extra_headers
    self.utils = ScannerUtils(self)

    # record the db_id of every request whose URL pattern has already been seen
    urlpatterns = []
    for req in self.pending_requests:
        patt = RequestPattern(req).pattern
        if patt in urlpatterns:
            self._duplicated_requests.append(req.db_id)
        else:
            urlpatterns.append(patt)

    init = self.init(scanner_argv if scanner_argv else [])
    # if self._type == "external" and not os.path.isfile(self.settings['scanner_exe'][0]):
    #     raise Exception("scanner_exe not found")

    self._running = True

    print("Scanner %s started with %d threads (^C to pause or change verbosity)"
          % (self.scanner_name, self.settings['num_threads']))

    for n in range(0, self.settings['num_threads']):
        thread = self.Executor(self)
        self.threads.append(thread)
        thread.start()

    self.wait_executor(self.threads)

    if not self.wait_threads_exit():
        # threads did not exit cleanly: kill any running scanner commands and hard-exit
        self._th_lock.acquire()
        for cmd in self._commands:
            if cmd:
                cmd.kill()
        self._th_lock.release()
        os._exit(1)

    self.end()
    self.save_assessment()
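# A minimal sketch (not htcap's actual Executor) of the worker-pool pattern used
# above: N threads pull pending items from a shared, lock-protected list until it
# is empty or an exit flag is set. All names here are illustrative assumptions.
import threading

class Pool:
    def __init__(self, items, num_threads=3):
        self.items = list(items)
        self.lock = threading.Lock()
        self.exit_requested = False
        self.threads = [threading.Thread(target=self.worker) for _ in range(num_threads)]

    def worker(self):
        while True:
            with self.lock:
                if self.exit_requested or not self.items:
                    return
                item = self.items.pop(0)
            self.process(item)

    def process(self, item):
        pass  # a real scanner would run its check against `item` here

    def run(self):
        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()

Pool(range(10)).run()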
def main_loop(self, threads, start_requests, database):
    pending = len(start_requests)
    crawled = 0
    pb = Progressbar(self.crawl_start_time, "pages processed")
    req_to_crawl = start_requests
    while True:
        try:
            if self.display_progress and not self.verbose:
                tot = (crawled + pending)
                pb.out(tot, crawled)

            if pending == 0:
                # is the check of running threads really needed?
                running_threads = [t for t in threads if t.status == THSTAT_RUNNING]
                if len(running_threads) == 0:
                    if self.display_progress or self.verbose:
                        print("")
                    break

            if len(req_to_crawl) > 0:
                Shared.th_condition.acquire()
                Shared.requests.extend(req_to_crawl)
                Shared.th_condition.notify_all()
                Shared.th_condition.release()

            req_to_crawl = []
            Shared.main_condition.acquire()
            Shared.main_condition.wait(1)
            if len(Shared.crawl_results) > 0:
                database.connect()
                database.begin()
                for result in Shared.crawl_results:
                    crawled += 1
                    pending -= 1
                    if self.verbose:
                        print("crawl result for: %s " % result.request)
                        if len(result.request.user_output) > 0:
                            print("  user: %s" % json.dumps(result.request.user_output))
                        if result.errors:
                            print("* crawler errors: %s" % ", ".join(result.errors))

                    database.save_crawl_result(result, True)

                    if Shared.options['deduplicate_pages']:
                        if self.request_is_duplicated(result.page_hash):
                            # page already seen: keep only requests with unseen patterns
                            filtered_requests = []
                            for r in result.found_requests:
                                if RequestPattern(r).pattern not in self.request_patterns:
                                    filtered_requests.append(r)
                            result.found_requests = filtered_requests
                            if self.verbose:
                                print("  * marked as duplicated ... requests filtered")

                        self.page_hashes.append(result.page_hash)
                        for r in result.found_requests:
                            self.request_patterns.append(RequestPattern(r).pattern)

                    for req in result.found_requests:
                        database.save_request(req)

                        if self.verbose and req not in Shared.requests and req not in req_to_crawl:
                            print("  new request found %s" % req)

                        if request_is_crawlable(req) and req not in Shared.requests and req not in req_to_crawl:
                            if request_depth(req) > Shared.options['max_depth'] or request_post_depth(req) > Shared.options['max_post_depth']:
                                if self.verbose:
                                    print("  * cannot crawl: %s : crawl depth limit reached" % req)
                                result = CrawlResult(req, errors=[ERROR_CRAWLDEPTH])
                                database.save_crawl_result(result, False)
                                continue

                            if req.redirects > Shared.options['max_redirects']:
                                if self.verbose:
                                    print("  * cannot crawl: %s : too many redirects" % req)
                                result = CrawlResult(req, errors=[ERROR_MAXREDIRECTS])
                                database.save_crawl_result(result, False)
                                continue

                            pending += 1
                            req_to_crawl.append(req)

                Shared.crawl_results = []
                database.commit()
                database.close()
            Shared.main_condition.release()

        except KeyboardInterrupt:
            try:
                Shared.main_condition.release()
                Shared.th_condition.release()
            except:
                pass
            self.pause_threads(threads, True)
            if not self.get_runtime_command():
                print("Exiting . . .")
                return
            print("Crawler is running")
            self.pause_threads(threads, False)
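# A minimal sketch of the two-Condition handshake used in main_loop above: the
# main thread feeds requests under one condition variable and collects results
# under another, waiting with a short timeout so it never blocks forever. The
# toy worker just doubles its input; all names are illustrative, not htcap's API.
import threading

requests, results = [], []
th_cond = threading.Condition()
main_cond = threading.Condition()

def worker():
    while True:
        with th_cond:
            while not requests:
                th_cond.wait()
            req = requests.pop(0)
        if req is None:          # poison pill -> exit
            return
        with main_cond:
            results.append(req * 2)
            main_cond.notify()

t = threading.Thread(target=worker)
t.start()

with th_cond:
    requests.extend([1, 2, 3, None])
    th_cond.notify_all()

done = []
while len(done) < 3:
    with main_cond:
        main_cond.wait(1)    # bounded wait, like Shared.main_condition.wait(1)
        done.extend(results)
        results.clear()
t.join()
print(done)  # [2, 4, 6]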