import hashlib
import json
import logging
import os
import pprint
import sys
import tempfile
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

from toposort import toposort_flatten

# Result, ScanMeta, RetryScan, RescheduleLater, NumericLock, QueueEntry,
# slugify, kill_everything and the log/file handlers are project-internal
# and come from the surrounding package.


def run(self):
    while self._max_executions > 0:
        # Stop if our master died.
        if self._ppid != os.getppid():
            break
        # Our master asked us to stop. We must obey.
        if self._stop_event.is_set():
            break
        job = self._job_queue.get_job_nowait()
        if job is None:
            time.sleep(1)
            continue
        start_info = (job.scan_id, job.scan_module.name, datetime.today(),
                      job.num_tries)
        self._notify_master('job_started', start_info)
        result = Result(job.current_result, NoOpFileHandler())
        logger = logging.Logger(job.scan_module.name)
        logger.addHandler(
            WorkerWritePipeHandler(self._pid, self._write_pipe))
        logger.addHandler(ScanStreamHandler())
        scan_meta = ScanMeta(worker_id=self._id, num_tries=job.num_tries)
        # Run each job in a fresh temporary working directory so scan
        # modules cannot leak files into each other's runs.
        with tempfile.TemporaryDirectory() as temp_dir:
            old_cwd = os.getcwd()
            os.chdir(temp_dir)
            try:
                job.scan_module.scan_site(result, logger, job.options,
                                          scan_meta)
            except RetryScan:
                self._job_queue.report_failure()
                self._notify_master('job_failed', (datetime.today(),))
            except RescheduleLater as e:
                self._job_queue.reschedule(e.not_before)
                self._job_queue.report_result(result.get_updates())
                self._notify_master('job_finished', (datetime.today(),))
            except Exception:
                logger.exception('Scan module `{}` failed.'.format(
                    job.scan_module.name))
                self._job_queue.report_failure()
                self._notify_master('job_failed', (datetime.today(),))
                if self._raven_client:
                    self._raven_client.captureException(
                        tags={
                            'scan_id': job.scan_id,
                            'scan_module_name': job.scan_module.name
                        },
                        extra={'result': result.get_results()})
            else:
                self._job_queue.report_result(result.get_updates())
                self._notify_master('job_finished', (datetime.today(),))
            finally:
                os.chdir(old_cwd)
                # Reap any child processes the scan module left behind.
                kill_everything(self._pid, only_children=True)
        self._max_executions -= 1
    kill_everything(self._pid)
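For orientation, the worker loop above drives scan modules through a small
implicit interface: each module has a name, a scan_site(result, logger,
options, meta) entry point, and signals control flow by raising RetryScan or
RescheduleLater. A minimal sketch of such a module, assuming only what the
loop above passes in; the class name, option key, and result key are
hypothetical, and the sketch is not the project's actual module API:

class ExampleScanModule:
    name = 'example'
    dependencies = []

    def scan_site(self, result, logger, options, meta):
        # RetryScan and RescheduleLater come from the surrounding package
        # (see the except clauses in run() above).
        logger.info('Scan try %d on worker %d', meta.num_tries,
                    meta.worker_id)
        if options.get('flaky') and meta.num_tries == 1:
            raise RetryScan  # ask the queue to schedule another try
        # Assumes Result supports item assignment for recording findings.
        result['example_ok'] = True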
def update_dependencies(args):
    config = load_config(args.config)
    scan_modules = load_modules(config['SCAN_MODULES'],
                                config['SCAN_MODULE_OPTIONS'])
    updated = []
    stream_handler = ScanStreamHandler()
    for scan_module in scan_modules.values():
        logger = logging.Logger(scan_module.name)
        logger.addHandler(stream_handler)
        if hasattr(scan_module, 'update_dependencies'):
            logger.info('Updating dependencies')
            scan_module.update_dependencies()
            updated.append(scan_module.name)
    if updated:
        print('\nUpdated dependencies of: ' + ' '.join(updated))
    else:
        print('\nNothing to update.')
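Note that the command discovers the hook by duck typing: any scan module that
defines an update_dependencies attribute takes part, as checked via hasattr()
above, and there is no required base class. A minimal sketch of a
participating module; the name and the work it performs are made up:

class ExampleModule:
    name = 'example'

    def update_dependencies(self):
        # Hypothetical: refresh third-party data, e.g. re-download a
        # database file the module needs at scan time.
        pass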
def scan_site(args):
    config = load_config(args.config)
    _require_dependencies(config)
    site_parsed = urlparse(args.site)
    if site_parsed.scheme not in ('http', 'https'):
        raise CommandError('Invalid site: {}'.format(args.site))

    results_dir = args.results
    if results_dir is None:
        # Derive a stable directory name from the site URL.
        results_dir = slugify(site_parsed.netloc) + '_'
        results_dir += hashlib.sha512(args.site.encode()).hexdigest()[:10]
    results_dir = Path(results_dir).resolve()
    try:
        results_dir.mkdir(exist_ok=True)
    except IOError as e:
        raise CommandError(
            'Could not create results directory: {}'.format(e)) from e

    result_file = results_dir / 'results.json'
    result_json = {'site_url': args.site}
    if args.import_results:
        try:
            with open(args.import_results) as f:
                import_json = json.load(f)
        except IOError as e:
            raise CommandError(
                'Could not open result JSON: {}'.format(e)) from e
        except ValueError as e:
            raise CommandError(
                'Could not parse result JSON: {}'.format(e)) from e
        else:
            result_json.update(import_json)
    try:
        with result_file.open('w') as f:
            json.dump(result_json, f, indent=2)
            f.write('\n')
    except IOError as e:
        raise CommandError(
            'Could not write result JSON: {}'.format(e)) from e

    scan_modules = load_modules(config['SCAN_MODULES'],
                                config['SCAN_MODULE_OPTIONS'])
    scan_module_names = args.scan_modules
    if scan_module_names is None:
        scan_module_names = scan_modules.keys()

    # Order scan_module_names topologically by their dependencies.
    dependencies = {}
    for scan_module_name in scan_module_names:
        mod = scan_modules[scan_module_name]
        dependencies[mod.name] = set(mod.dependencies)
    scan_module_names = toposort_flatten(dependencies)
    if args.skip_dependencies:
        scan_module_names = [
            scan_module_name for scan_module_name in scan_module_names
            if scan_module_name in args.scan_modules
        ]

    has_error = False
    result = Result(result_json, DirectoryFileHandler(results_dir))
    stream_handler = ScanStreamHandler()
    logs_dir = results_dir / 'logs'
    logs_dir.mkdir(exist_ok=True)
    lock_dir = config['STORAGE_PATH'] / 'locks'
    lock_dir.mkdir(exist_ok=True)
    scan_queue = [QueueEntry(mod_name, 0, None)
                  for mod_name in scan_module_names]
    scan_queue.reverse()
    while scan_queue:
        scan_module_name, num_try, not_before = scan_queue.pop()
        if not_before is not None:
            # Wait until the rescheduled time has been reached.
            # noinspection PyTypeChecker
            while datetime.utcnow() < not_before:
                time.sleep(0.5)
        mod = scan_modules[scan_module_name]
        num_try += 1
        log_filename = logs_dir / (mod.name + '.log')
        file_handler = ScanFileHandler(str(log_filename))
        logger = logging.Logger(mod.name)
        logger.addHandler(stream_handler)
        logger.addHandler(file_handler)
        with tempfile.TemporaryDirectory() as temp_dir:
            old_cwd = os.getcwd()
            os.chdir(temp_dir)
            logger.info('Starting %s', mod.name)
            try:
                with NumericLock(lock_dir) as worker_id:
                    scan_meta = ScanMeta(worker_id=worker_id,
                                         num_tries=num_try)
                    mod.logger = logger
                    mod.scan_site(result, scan_meta)
            except RetryScan:
                if num_try <= config['MAX_TRIES']:
                    scan_queue.append(
                        QueueEntry(scan_module_name, num_try, not_before))
                    logger.info('Scan module `%s` will be retried', mod.name)
                else:
                    has_error = True
            except RescheduleLater as e:
                scan_queue.append(
                    QueueEntry(scan_module_name, num_try, e.not_before))
            except Exception:
                if num_try <= config['MAX_TRIES']:
                    scan_queue.append(
                        QueueEntry(scan_module_name, num_try, not_before))
                has_error = True
                logger.exception('Scan module `%s` failed.', mod.name)
            finally:
                os.chdir(old_cwd)
                # Persist intermediate results after every module run.
                with result_file.open('w') as f:
                    json.dump(result.get_results(), f, indent=2,
                              sort_keys=True)
                    f.write('\n')
        logger.info('Finished %s', mod.name)
    pprint.pprint(result.get_results())
    if has_error:
        sys.exit(1)
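The dependency ordering relies on the toposort package: toposort_flatten()
takes a mapping from each module name to the set of names it depends on and
returns one flat list in which every module appears after all of its
dependencies. A small worked example with made-up module names:

from toposort import toposort_flatten

dependencies = {
    'cookies': {'chromedevtools'},
    'tls': set(),
    'chromedevtools': set(),
}
# With the default sort=True this prints
# ['chromedevtools', 'tls', 'cookies']: 'cookies' comes after its
# dependency 'chromedevtools'.
print(toposort_flatten(dependencies))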