class Robot():
    """Interactive controller tying two speed monitors and a drive unit together.

    Commands typed on the console are pushed onto ``movequeue``; the run loop
    continues until ``robotrun`` is cleared by ``stop()`` or the user presses
    Ctrl-C.
    """

    def __init__(self):
        """Reset GPIO state and create the sensor, drive and queue objects."""
        GPIO.cleanup()
        # The numeric arguments are presumably board pin numbers -- confirm
        # against the speed/move modules.
        self.speed_left = speed.Speed(40, "speed-left")
        self.speed_right = speed.Speed(38, "speed-right")
        self.move = move.Car(31, 33, 35, 37)
        self.robotrun = True
        self.movequeue = Queue()

    def run(self):
        """Start all components and queue console commands until stopped.

        Returns None after the components have been stopped.
        """
        self.speed_left.start()
        self.speed_right.start()
        self.move.start()
        # The flag is an instance attribute; the previous bare ``robotrun``
        # never referred to it.
        while self.robotrun:
            try:
                command = raw_input("Command: ")
                self.movequeue.put(command)
            except KeyboardInterrupt:
                # Leave the loop and let the single stop() below do the
                # shutdown, so components are not stopped twice.
                break
        self.stop()
        return

    def stop(self):
        """Stop all components, release GPIO and clear the run flag."""
        self.speed_left.stop()
        self.speed_right.stop()
        self.move.stop()
        GPIO.cleanup()
        # Must be set on the instance: assigning a local name would leave the
        # loop condition in run() unchanged.
        self.robotrun = False
        return
def __init__(self, manager_params, browser_params):
    """Keep the configuration, connect a logging client and create the queues.

    ``manager_params['logger_address']`` is unpacked as the positional
    arguments of ``loggingclient``.
    """
    self.manager_params, self.browser_params = manager_params, browser_params
    logger_address = manager_params['logger_address']
    self.logger = loggingclient(*logger_address)

    # Listener details are not known yet.
    self.listener_address = self.listener_process = None

    self.status_queue = Queue()
    self.shutdown_queue = Queue()

    # No status has been received so far.
    self._last_status = self._last_status_received = None
def test(): NUMBER_OF_PROCESSES = 4 TASKS1 = [(mul, (i, 7)) for i in range(20)] TASKS2 = [(plus, (i, 8)) for i in range(10)] # Create queues task_queue = Queue() done_queue = Queue() # Submit tasks map(task_queue.put, TASKS1) # Start worker processes for i in range(NUMBER_OF_PROCESSES): Process(target=worker, args=(task_queue, done_queue)).start() # Get and print results print "Unordered results:" for i in range(len(TASKS1)): print "\t", done_queue.get() # Add more tasks using `put()` instead of `putMany()` for task in TASKS2: task_queue.put(task) # Get and print some more results for i in range(len(TASKS2)): print "\t", done_queue.get() # Tell child processes to stop for i in range(NUMBER_OF_PROCESSES): task_queue.put("STOP")
def __init__(self):
    """Reset GPIO state, then build the speed monitors, drive unit and queue."""
    GPIO.cleanup()

    # One speed object per side, each with its own label.
    left = speed.Speed(40, "speed-left")
    right = speed.Speed(38, "speed-right")
    self.speed_left, self.speed_right = left, right

    self.move = move.Car(31, 33, 35, 37)

    # Run flag starts set; commands are collected on movequeue.
    self.robotrun = True
    self.movequeue = Queue()
def _launch_loggingserver(self):
    """Start ``MPLogger.loggingserver`` in a daemon process and return the process.

    A new queue is stored on ``self.logging_status_queue`` and handed to the
    server together with the configured log file path.
    """
    status_queue = Queue()
    self.logging_status_queue = status_queue

    server_args = (self.manager_params['log_file'], status_queue)
    server = Process(target=MPLogger.loggingserver, args=server_args)
    server.daemon = True
    server.start()
    return server
def main():
    """Seed a task queue with three URLs and run two crawlers over it.

    Both crawlers share the task queue, the page queue, one event and a file
    opened in append mode. The function returns once both crawlers have been
    joined.
    """
    seed_urls = (
        "http://www.laurentluce.com/posts/python-threads-synchronization-locks-rlocks-semaphores-conditions-events-and-queues/",
        "http://stackoverflow.com/questions/15651128/in-this-semaphore-example-is-it-necessary-to-lock-for-refill-and-buy",
        "http://bbs.byr.cn/",
    )
    seeds = [task(url) for url in seed_urls]

    event = Event()
    tasks = Queue()
    pages = Queue()
    for seed in seeds:
        tasks.put(seed)

    # The context manager guarantees the file is closed even when building,
    # starting or joining a crawler raises.
    with open("test.txt", 'a') as f:
        crawlers = [Crawler(tasks, pages, event, f) for _ in range(2)]
        for crawler in crawlers:
            crawler.start()
        for crawler in crawlers:
            crawler.join()
def _launch_aggregators(self):
    """
    Start the data aggregator processes and record their socket addresses.

    * DataAggregator - sqlite database for crawl data
    * LevelDBAggregator - leveldb database for javascript files (only when
      ``self.ldb_enabled`` is set)

    Each aggregator reports its socket location, (address, port), as the first
    item on its status queue; this call blocks until that item arrives.
    """
    params = self.manager_params

    # DataAggregator
    self.aggregator_status_queue = Queue()
    data_args = (params, self.aggregator_status_queue)
    self.data_aggregator = Process(target=DataAggregator.DataAggregator,
                                   args=data_args)
    self.data_aggregator.daemon = True
    self.data_aggregator.start()
    params['aggregator_address'] = self.aggregator_status_queue.get()

    if not self.ldb_enabled:
        return

    # LevelDB Aggregator
    self.ldb_status_queue = Queue()
    ldb_args = (params, self.ldb_status_queue)
    self.ldb_aggregator = Process(target=LevelDBAggregator.LevelDBAggregator,
                                  args=ldb_args)
    self.ldb_aggregator.daemon = True
    self.ldb_aggregator.start()
    params['ldb_address'] = self.ldb_status_queue.get()
def mp_process(self, nprocs, func, *args):
    """Split ``args[0]`` into ``nprocs`` slices and process them in parallel.

    One ``worker`` process is started per slice. Consecutive slices overlap by
    one element; the last slice runs to the end of the sequence. Each worker
    is expected to put exactly one dict on the shared queue, and the merged
    dict is expected to be keyed 0..len-1.

    Returns the merged values as a list ordered by key.
    """
    images = args[0]
    out_q = Queue()
    chunksize = int(math.ceil((len(images) - 1) / float(nprocs)))
    print("Chunks of size:", chunksize)

    procs = []
    last = nprocs - 1
    for i in range(nprocs):
        begin = chunksize * i
        # Final worker takes everything that is left; the others take their
        # chunk plus one extra element.
        end = len(images) if i == last else chunksize * (i + 1) + 1
        p = Process(target=worker,
                    args=(images[begin:end], i, chunksize, out_q, func, *args))
        procs.append(p)
        p.start()
        # Advance the progress widget after each launch.
        self.loading.progress2['value'] += chunksize
        self.update()

    # Exactly one result dict is read per worker, before joining.
    resultdict = {}
    for _ in range(nprocs):
        resultdict.update(out_q.get())

    # Wait for all worker processes to finish
    for p in procs:
        p.join()

    return [resultdict[j] for j in range(len(resultdict))]
def _launch_aggregators(self):
    """Launch the data aggregators and connect a client socket to the main one.

    The SqliteAggregator is always launched; the LevelDBAggregator is started
    only when ``self.ldb_enabled`` is set, and this call then blocks until it
    reports its socket location.
    """
    params = self.manager_params

    aggregator = SqliteAggregator.SqliteAggregator(params, self.browser_params)
    self.data_aggregator = aggregator
    aggregator.launch()
    params['aggregator_address'] = aggregator.listener_address

    # open connection to aggregator for saving crawl details
    self.sock = clientsocket(serialization='dill')
    self.sock.connect(*params['aggregator_address'])

    # TODO refactor ldb aggregator to use new base classes
    if not self.ldb_enabled:
        return

    self.ldb_status_queue = Queue()
    self.ldb_aggregator = Process(
        target=LevelDBAggregator.LevelDBAggregator,
        args=(params, self.ldb_status_queue))
    self.ldb_aggregator.daemon = True
    self.ldb_aggregator.start()
    # socket location: (address, port)
    params['ldb_address'] = self.ldb_status_queue.get()
def test_s3_callbacks(self):
    """Check that the visited site is in the http_requests table when the callback runs."""
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager_params, browser_params = self.get_config()
    bucket = manager_params["s3_bucket"]
    directory = manager_params["s3_directory"]
    dataset = LocalS3Dataset(bucket, directory)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    queue = Queue()

    def ensure_site_in_s3(success: bool):
        # Ensure http table is created
        requests = dataset.load_table("http_requests")
        seen_urls = requests.top_level_url.unique()
        queue.put(TEST_SITE in seen_urls)

    sequence = CommandSequence(
        TEST_SITE,
        reset=True,
        blocking=True,
        callback=ensure_site_in_s3,
    )
    sequence.get()
    manager.execute_command_sequence(sequence)
    manager.close()

    # The callback's verdict is handed back through the queue.
    assert queue.get()
def _work(cls, input_queue: mp.Queue, transformer: Callable, output_queue: mp.Queue):
    """Pipe ``transformer``'s outputs to ``output_queue``, then send ``None``.

    The transformer is fed from ``iterate_until_none(input_queue.get)``. Any
    exception raised while producing or forwarding outputs (including
    ``BaseException`` subclasses) is put on the output queue instead of
    propagating. A final ``None`` is always queued afterwards.
    """
    emit = output_queue.put
    try:
        source = iterate_until_none(input_queue.get)
        for item in transformer(source):
            emit(item)
    except BaseException as error:
        emit(error)
    emit(None)
def run_parallely_with_progress_bar(
        items, func, msgfunc, accumulator=NoOpAcc(), title=''):
    """Apply ``func`` to every item in worker processes while a progress bar updates.

    Five worker processes consume ``items`` from a task queue and report
    ``(msgfunc(item), func(item))`` pairs. A separate updater process feeds
    each result to ``accumulator``, advances the progress bar, and finally
    queues one 'STOP' marker per worker. The function returns after queueing
    the items; it does not join the started processes.
    """
    PROC_COUNT = 5
    total = len(items)
    task_queue = Queue()
    done_queue = Queue()

    def pb_updater(inq, results_q):
        pb = ProgressBar(title)
        for done in range(total):
            msg, result = results_q.get()
            accumulator.add(result)
            pb.update(percent=((done+1)*100)/total, message=msg)
        pb.finish()
        accumulator.finish()
        # tell the workers to stop
        for _ in range(PROC_COUNT):
            inq.put('STOP')

    def worker(inq, outq):
        # Consume until the 'STOP' sentinel is read.
        for item in iter(inq.get, 'STOP'):
            result = func(item)
            outq.put((msgfunc(item), result))

    queues = (task_queue, done_queue)
    for _ in range(PROC_COUNT):
        Process(target=worker, args=queues).start()

    updater = Process(target=pb_updater, args=queues)
    updater.start()

    for item in items:
        task_queue.put(item)
def test_worker(self):
    """``_worker`` should emit one match dict per repetition, then 'STOP'."""
    tournament = axelrod.Tournament(
        name=self.test_name,
        players=self.players,
        game=self.game,
        turns=200,
        repetitions=self.test_repetitions)

    # Queue every repetition followed by the stop marker.
    work_queue = Queue()
    for repetition in range(self.test_repetitions):
        work_queue.put(repetition)
    work_queue.put('STOP')

    done_queue = Queue()
    tournament._worker(work_queue, done_queue)

    for _ in range(self.test_repetitions):
        new_matches = done_queue.get()
        self.assertEqual(len(new_matches), 15)
        for index_pair, match in new_matches.items():
            self.assertIsInstance(index_pair, tuple)
            self.assertIsInstance(match, list)

    # The worker signals completion last.
    self.assertEqual(done_queue.get(), 'STOP')
def __init__(self, name, data_sampler_method, cache_path=None, save_every=50):
    """ Initializes a DataStreamRecorder

    Parameters
    ----------
        name : string
            User-friendly identifier for this data stream
        data_sampler_method : function
            Method to call to retrieve data
        cache_path : string, optional
            When given, caching is enabled and the directory
            ``<cache_path>/<self.name>`` is created if it does not exist
        save_every : int, optional
            Stored as ``_save_every`` (presumably the number of samples
            between cache saves -- confirm against the recording loop)
    """
    Process.__init__(self)

    self._name = name
    self._data_sampler_method = data_sampler_method
    self._has_set_sampler_params = False
    self._recording = False

    # Command queue plus the first data queue; the remaining queues are
    # created elsewhere.
    self._cmds_q = Queue()
    self._data_qs = [Queue()]
    self._ok_q = None
    self._tokens_q = None

    self._save_every = save_every
    self._cache_path = cache_path
    self._saving_cache = cache_path is not None
    if self._saving_cache:
        save_path = os.path.join(cache_path, self.name)
        self._save_path = save_path
        if not os.path.exists(save_path):
            os.makedirs(save_path)

    self._start_data_segment = 0
    self._cur_data_segment = 0
    self._saving_ps = []
def __init__(self, manager_params, browser_params):
    """Store the configuration and create the status, completion and shutdown queues."""
    self.manager_params, self.browser_params = manager_params, browser_params

    # Listener details are not known yet.
    self.listener_address = self.listener_process = None

    self.status_queue = Queue()
    self.completion_queue = Queue()
    self.shutdown_queue = Queue()

    # No status has been received so far.
    self._last_status = self._last_status_received = None

    self.logger = logging.getLogger("openwpm")
def run(self):
    """Run every suite through Consumer processes and report overall failure.

    All suites are queued first, followed by one ``None`` per worker. Results
    are read back one per suite and compiled. On a keyboard interrupt the
    whole process group is killed.

    Returns True when there were errors, failures, or no tests were run.
    """
    cl_args = self.cl_args
    verbose, failfast = cl_args.verbose, cl_args.failfast
    # One worker unless parallel mode is on, then the configured count.
    workers = int(not cl_args.parallel) or cl_args.workers

    to_worker = Queue()
    from_worker = Queue()
    for suite in self.suites:
        to_worker.put(suite)
    for _ in range(workers):
        to_worker.put(None)

    results = []
    worker_list = []
    start = time.time()

    # A second try catch is needed here because queues can cause locking
    # when they go out of scope, especially when termination signals used
    try:
        for _ in range(workers):
            consumer = Consumer(to_worker, from_worker, verbose, failfast)
            worker_list.append(consumer)
            consumer.start()

        for _ in self.suites:
            outcome = from_worker.get()
            results.append(self.log_result(outcome))

        end = time.time()
        tests_run, errors, failures = self.compile_results(
            run_time=end - start,
            datagen_time=start - self.datagen_start,
            results=results)
    except KeyboardInterrupt:
        print_exception("Runner", "run", "Keyboard Interrupt, exiting...")
        os.killpg(0, 9)

    return bool(sum([errors, failures, not tests_run]))
def test_process_done_queue(self):
    """``_process_done_queue`` should collect one entry per repetition."""
    workers = 2
    done_queue = Queue()
    matches = []
    tournament = axelrod.Tournament(
        name=self.test_name,
        players=self.players,
        game=self.game,
        turns=200,
        repetitions=self.test_repetitions)

    # One empty result per repetition, then one stop marker per worker.
    for _ in range(self.test_repetitions):
        done_queue.put({})
    for _ in range(workers):
        done_queue.put('STOP')

    tournament._process_done_queue(workers, done_queue, matches)
    self.assertEqual(len(matches), self.test_repetitions)
def test_start_workers(self):
    """``_start_workers`` should yield one 'STOP' on the done queue per worker."""
    workers = 2
    work_queue = Queue()
    done_queue = Queue()
    for repetition in range(self.test_repetitions):
        work_queue.put(repetition)

    tournament = axelrod.Tournament(
        name=self.test_name,
        players=self.players,
        game=self.game,
        turns=200,
        repetitions=self.test_repetitions)
    tournament._start_workers(workers, work_queue, done_queue)

    # Drain the done queue until every worker has signalled completion.
    stops = 0
    while stops < workers:
        payoffs = done_queue.get()
        if payoffs == 'STOP':
            stops += 1
    self.assertEqual(stops, workers)
def _run(self, idx: int, mapper_queue: mp.Queue,
         collector_queue: typing.Optional[mp.Queue]):
    """
    Process's activity. It handles queue IO and invokes user's mapper handler.
    (subprocess, blocked, only two queues can be used to communicate with main process)

    Messages read from ``mapper_queue`` are tuples whose first element is a
    command:

    * ``CMD_STOP`` - report final progress, forward a stop message to the
      collector queue (only when a collector is configured and a queue was
      given) and return.
    * ``CMD_DATA`` - ``data[1]`` is a batch of ``(args, kwargs)`` pairs; each
      is passed to ``mapper.process`` and the results are forwarded as one
      batch to the collector queue.

    Messages with any other command are ignored.
    """
    with self.mapper(idx) as mapper:
        while True:
            # Blocks until the next message arrives.
            data = mapper_queue.get()
            if data[0] == ParallelProcessor.CMD_STOP:
                # print(idx, 'stop')
                self._update_progress(mapper, finish=True)
                if self.collector and collector_queue is not None:
                    collector_queue.put((ParallelProcessor.CMD_STOP, ))
                return
            elif data[0] == ParallelProcessor.CMD_DATA:
                batch_result = []
                for d in data[1]:
                    args, kwargs = d[0], d[1]
                    # print(idx, 'data')
                    self._update_progress(mapper,
                                          type_=ProgressThread.P_LOADED)
                    result = mapper.process(*args, **kwargs)
                    self._update_progress(mapper,
                                          type_=ProgressThread.P_PROCESSED)
                    # Results are only kept when there is somewhere to send
                    # them.
                    if collector_queue is not None:
                        if self.collector:
                            if not isinstance(
                                    result, tuple
                            ):  # collector must represent as tuple
                                result = (result, )
                        batch_result.append(result)
                # Forward the whole batch in one message; empty batches are
                # not sent.
                if collector_queue is not None and len(batch_result) > 0:
                    collector_queue.put(
                        (ParallelProcessor.CMD_DATA, batch_result))
                    batch_result = []  # reset buffer
def __init__(
    self,
    structured_storage: StructuredStorageProvider,
    unstructured_storage: Optional[UnstructuredStorageProvider],
) -> None:
    """Create the control queues and the StorageController that shares them.

    The listener address and process are left unset here.
    """
    self.listener_address: Optional[Tuple[str, int]] = None
    self.listener_process: Optional[Process] = None

    status_queue = Queue()
    completion_queue = Queue()
    shutdown_queue = Queue()
    self.status_queue = status_queue
    self.completion_queue = completion_queue
    self.shutdown_queue = shutdown_queue

    # No status has been received so far.
    self._last_status = None
    self._last_status_received: Optional[float] = None

    self.logger = logging.getLogger("openwpm")

    # The controller is given the same three queues kept on this object.
    self.storage_controller = StorageController(
        structured_storage,
        unstructured_storage,
        status_queue=status_queue,
        completion_queue=completion_queue,
        shutdown_queue=shutdown_queue,
    )
def launch_browser_manager(self):
    """
    sets up the BrowserManager and gets the process id, browser pid and,
    if applicable, screen pid. loads associated user profile if necessary

    Spawning is retried until it succeeds or ``_UNSUCCESSFUL_SPAWN_LIMIT``
    attempts have failed. Each attempt creates fresh command/status queues,
    starts a daemon BrowserManager process and reads six status messages
    from it, each with a ``_SPAWN_TIMEOUT`` timeout.

    Returns True if a browser was spawned successfully, False otherwise.
    Exceptions unpickled from a 'CRITICAL' status message are raised to the
    caller.
    """
    # Unsupported. See https://github.com/mozilla/OpenWPM/projects/2
    # if this is restarting from a crash, update the tar location
    # to be a tar of the crashed browser's history
    # NOTE: the crash-recovery code below is disabled by being wrapped in a
    # bare string literal; it is not executed.
    """
    if self.current_profile_path is not None:
        # tar contents of crashed profile to a temp dir
        tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/"
        profile_commands.dump_profile(
            self.current_profile_path,
            self.manager_params,
            self.browser_params,
            tempdir,
            close_webdriver=False,
            browser_settings=self.browser_settings
        )
        # make sure browser loads crashed profile
        self.browser_params['profile_tar'] = tempdir
        # don't re-randomize attributes
        self.browser_params['random_attributes'] = False
        crash_recovery = True
    else:
    """
    self.logger.info("BROWSER %i: Launching browser..." % self.crawl_id)
    tempdir = None
    crash_recovery = False
    self.is_fresh = not crash_recovery

    # Try to spawn the browser within the timelimit
    unsuccessful_spawns = 0
    success = False

    def check_queue(launch_status):
        # Read one message from the status queue. A 'STATUS' message marks
        # its stage as reached in launch_status and yields its payload;
        # 'CRITICAL' and 'FAILED' messages raise. Any other message type
        # falls through and returns None. A timeout raises the queue's
        # empty exception to the caller.
        result = self.status_queue.get(True, self._SPAWN_TIMEOUT)
        if result[0] == 'STATUS':
            launch_status[result[1]] = True
            return result[2]
        elif result[0] == 'CRITICAL':
            # NOTE(review): unpickles data received from the child process;
            # only safe while that process is trusted.
            raise pickle.loads(result[1])
        elif result[0] == 'FAILED':
            raise BrowserCrashError(
                'Browser spawn returned failure status')

    while not success and \
            unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT:
        self.logger.debug("BROWSER %i: Spawn attempt %i " %
                          (self.crawl_id, unsuccessful_spawns))
        # Resets the command/status queues
        (self.command_queue, self.status_queue) = (Queue(), Queue())

        # builds and launches the browser_manager
        args = (self.command_queue, self.status_queue,
                self.browser_params, self.manager_params, crash_recovery)
        self.browser_manager = Process(target=BrowserManager, args=args)
        self.browser_manager.daemon = True
        self.browser_manager.start()

        # Read success status of browser manager
        launch_status = dict()
        try:
            # 1. Selenium profile created
            spawned_profile_path = check_queue(launch_status)
            # 2. Profile tar loaded (if necessary)
            check_queue(launch_status)
            # 3. Display launched (if necessary)
            self.display_pid, self.display_port = check_queue(
                launch_status)
            # 4. Browser launch attempted
            check_queue(launch_status)
            # 5. Browser launched
            (self.browser_pid, self.browser_settings) = check_queue(launch_status)

            # 6. Final message carries the driver's profile path and a
            # readiness marker.
            (driver_profile_path, ready) = check_queue(launch_status)
            if ready != 'READY':
                self.logger.error(
                    "BROWSER %i: Mismatch of status queue return values, "
                    "trying again..." % self.crawl_id)
                unsuccessful_spawns += 1
                # NOTE(review): this retry path does not call
                # close_browser_manager() on the process just started --
                # confirm that is intended.
                continue
            success = True
        except (EmptyQueue, BrowserCrashError):
            unsuccessful_spawns += 1
            # Build a summary of which launch stages were reached.
            error_string = ''
            status_strings = [
                'Proxy Ready', 'Profile Created', 'Profile Tar', 'Display',
                'Launch Attempted', 'Browser Launched', 'Browser Ready'
            ]
            for string in status_strings:
                error_string += " | %s: %s " % (
                    string, launch_status.get(string, False))
            self.logger.error("BROWSER %i: Spawn unsuccessful %s" %
                              (self.crawl_id, error_string))
            self.close_browser_manager()
            # spawned_profile_path is only bound once the first status
            # message was received, hence the guard.
            if 'Profile Created' in launch_status:
                shutil.rmtree(spawned_profile_path, ignore_errors=True)

    # If the browser spawned successfully, we should update the
    # current profile path class variable and clean up the tempdir
    # and previous profile path.
    if success:
        self.logger.debug("BROWSER %i: Browser spawn sucessful!" %
                          self.crawl_id)
        previous_profile_path = self.current_profile_path
        self.current_profile_path = driver_profile_path
        if driver_profile_path != spawned_profile_path:
            shutil.rmtree(spawned_profile_path, ignore_errors=True)
        if previous_profile_path is not None:
            shutil.rmtree(previous_profile_path, ignore_errors=True)
        if tempdir is not None:
            shutil.rmtree(tempdir, ignore_errors=True)

    return success
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM

    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process
    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing any not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes by name, indiscriminately.
    """

    def __init__(self, manager_params, browser_params, process_watchdog=False):
        """
        Prepare paths and the database, start the logging server, the data
        aggregators and one browser per entry of <browser_params>, then start
        the watchdog thread.

        Raises an Exception when the number of <browser_params> entries does
        not equal manager_params['num_browsers'].
        NOTE: <manager_params> is modified in place (paths are expanded and
        joined, addresses are added).
        """

        # Make paths absolute in manager_params
        for path in ['data_directory','log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(manager_params['data_directory'],manager_params['database_name'])
        manager_params['log_file'] = os.path.join(manager_params['log_directory'],manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same as manager_params['num_browsers']")

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        # Default limit scales with the number of browsers.
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        # schema.sql is read from the directory containing this module.
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(*self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

    def _save_configuration(self, browser_params):
        """ Saves crawl configuration details to db and logfile"""
        cur = self.db.cursor()

        # Get git version and commit information
        openwpm_v, browser_v = get_version()

        # Record task details
        cur.execute(("INSERT INTO task "
                     "(manager_params, openwpm_version, browser_version) "
                     "VALUES (?,?,?)"),
                    (json.dumps(self.manager_params), openwpm_v, browser_v))
        self.db.commit()
        self.task_id = cur.lastrowid

        # Record browser details for each brower
        for i in xrange(self.num_browsers):
            cur.execute("INSERT INTO crawl (task_id, browser_params) VALUES (?,?)",
                        (self.task_id, json.dumps(browser_params[i])))
            self.db.commit()
            # The new row id becomes this browser's crawl_id.
            browser_params[i]['crawl_id'] = cur.lastrowid

        # Print the configuration details
        self.logger.info(get_configuration_string(self.manager_params,
                                                  browser_params,
                                                  (openwpm_v, browser_v)))

    def _initialize_browsers(self, browser_params):
        """ initialize the browser classes, each its unique set of parameters """
        browsers = list()
        for i in xrange(self.num_browsers):
            browsers.append(Browser(self.manager_params, browser_params[i]))

        return browsers

    def _launch_browsers(self):
        """ launch each browser manager process / browser """
        for browser in self.browsers:
            try:
                success = browser.launch_browser_manager()
            except:
                # Bare except: clean up on any exception (including
                # KeyboardInterrupt), then re-raise it.
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical("Browser spawn failure during TaskManager initialization, exiting...")
                self.close()
                break

            # Update our DB with the random browser settings
            # These are found within the scope of each instance of Browser in the browsers list
            screen_res = str(browser.browser_settings['screen_res'])
            ua_string = str(browser.browser_settings['ua_string'])
            self.sock.send(("UPDATE crawl SET screen_res = ?, ua_string = ? \
                             WHERE crawl_id = ?", (screen_res, ua_string, browser.crawl_id)))

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use
        """
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                try:
                    process = psutil.Process(browser.browser_pid)
                    # First memory_info() field divided by 2**20, i.e. MB.
                    mem = process.memory_info()[0] / float(2 ** 20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info("BROWSER %i: Memory usage: %iMB, exceeding limit of %iMB"
                            % (browser.crawl_id, int(mem), BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    pass

            # Check for browsers or displays that were not closed correctly
            # Provide a 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum browser spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    if (process.create_time() + 300 < check_time and
                            ((process.name() == 'firefox' and process.pid not in browser_pids) or
                            (process.name() == 'Xvfb' and process.pid not in display_pids))):
                        self.logger.debug("Process: %s (pid: %i) with start time %s found running but not in browser process list. Killing."
                                % (process.name(), process.pid, process.create_time()))
                        process.kill()

    def _launch_aggregators(self):
        """
        Launches the various data aggregators, which serialize data from all processes.
        * DataAggregator - sqlite database for crawl data
        * LevelDBAggregator - leveldb database for javascript files
        """
        # DataAggregator
        self.aggregator_status_queue = Queue()
        self.data_aggregator = Process(target=DataAggregator.DataAggregator,
                                       args=(self.manager_params, self.aggregator_status_queue))
        self.data_aggregator.daemon = True
        self.data_aggregator.start()
        self.manager_params['aggregator_address'] = self.aggregator_status_queue.get()  # socket location: (address, port)

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.ldb_status_queue = Queue()
            self.ldb_aggregator = Process(target=LevelDBAggregator.LevelDBAggregator,
                                          args=(self.manager_params, self.ldb_status_queue))
            self.ldb_aggregator.daemon = True
            self.ldb_aggregator.start()
            self.manager_params['ldb_address'] = self.ldb_status_queue.get()  # socket location: (address, port)

    def _kill_aggregators(self):
        """ Terminates the aggregators gracefully """
        # DataAggregator
        self.logger.debug("Telling the DataAggregator to shut down...")
        self.aggregator_status_queue.put("DIE")
        start_time = time.time()
        # Waits at most 300 seconds for the process to exit.
        self.data_aggregator.join(300)
        self.logger.debug("DataAggregator took " + str(time.time() - start_time) + " seconds to close")

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.logger.debug("Telling the LevelDBAggregator to shut down...")
            self.ldb_status_queue.put("DIE")
            start_time = time.time()
            self.ldb_aggregator.join(300)
            self.logger.debug("LevelDBAggregator took " + str(time.time() - start_time) + " seconds to close")

    def _launch_loggingserver(self):
        """ sets up logging server """
        self.logging_status_queue = Queue()
        loggingserver = Process(target=MPLogger.loggingserver,
                             args=(self.manager_params['log_file'], self.logging_status_queue, ))
        loggingserver.daemon = True
        loggingserver.start()
        return loggingserver

    def _kill_loggingserver(self):
        """ terminates logging server gracefully """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)

    def _shutdown_manager(self, failure=False, during_init=False):
        """
        Wait for current commands to finish, close all child processes and threads
        <failure> flag to indicate manager failure (True) or end of crawl (False)
        <during_init> flag to indicator if this shutdown is occuring during the TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)
            # Record the outcome per crawl: -1 on failure, 1 on normal end.
            if failure:
                self.sock.send(("UPDATE crawl SET finished = -1 WHERE crawl_id = ?",
                                (browser.crawl_id,)))
            else:
                self.sock.send(("UPDATE crawl SET finished = 1 WHERE crawl_id = ?",
                                (browser.crawl_id,)))

        self.db.close()  # close db connection
        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.
        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self._shutdown_manager(failure=True, during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug("TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.", self.failure_status['CommandSequence']
                )
            elif self.failure_status['ErrorType'] == 'ExceedLaunchFailureLimit':
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence']
                )
            if self.failure_status['ErrorType'] == 'CriticalChildException':
                # NOTE(review): unpickles exception data that originated in a
                # child process; only safe while that process is trusted.
                reraise(*cPickle.loads(self.failure_status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_sequence, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        =  #    -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)

        Each branch polls browser.ready() and sleeps SLEEP_CONS between
        polls. When the command sequence is blocking, only the most recently
        started thread is joined.
        """
        if index is None:
            #send to first browser available
            command_executed = False
            while True:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(browser, command_sequence)
                        command_executed = True
                        break
                if command_executed:
                    break
                time.sleep(SLEEP_CONS)

        elif 0 <= index < len(self.browsers):
            #send the command to this specific browser
            while True:
                if self.browsers[index].ready():
                    self.browsers[index].current_timeout = command_sequence.total_timeout
                    thread = self._start_thread(self.browsers[index], command_sequence)
                    break
                time.sleep(SLEEP_CONS)
        elif index == '*':
            #send the command to all browsers
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i], command_sequence)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
        elif index == '**':
            #send the command to all browsers and sync it
            condition = threading.Condition()  # Used to block threads until ready
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i], command_sequence, condition)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            with condition:
                condition.notifyAll()  # All browsers loaded, tell them to start
        else:
            self.logger.info("Command index type is not supported or out of range")
            return

        # NOTE(review): _start_thread returns None when the manager is
        # closing, in which case thread.join() would fail -- confirm.
        if command_sequence.blocking:
            thread.join()
            self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """ starts the command execution thread

        Returns the started thread, or None when the TaskManager is closing.
        """

        # Check status flags before starting thread
        if self.closing:
            self.logger.error("Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        # Register the visit under the next free visit id.
        browser.set_visit_id(self.next_visit_id)
        self.sock.send(("INSERT INTO site_visits (visit_id, crawl_id, site_url) VALUES (?,?,?)",
                        (self.next_visit_id, browser.crawl_id, command_sequence.url)))
        self.next_visit_id += 1

        # Start command execution thread
        args = (browser, command_sequence, condition)
        thread = threading.Thread(target=self._issue_command, args=args)
        browser.command_thread = thread
        thread.daemon = True
        thread.start()
        return thread

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        sends command tuple to the BrowserManager

        Runs in a command execution thread. Commands are sent one at a time;
        the loop stops early once the browser needs a restart. Failures are
        reported through self.failure_status rather than raised.
        """
        browser.is_fresh = False  # since we are issuing a command, the BrowserManager is no longer a fresh instance

        # if this is a synced call, block on condition
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        start_time = None  # tracks when a site visit started, so that flash/profile
                           # cookies can be properly tracked.
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            # Append visit bookkeeping to the command tuple.
            if command[0] in ['GET', 'BROWSE']:
                start_time = time.time()
                command += (browser.curr_visit_id,)
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                command += (start_time, browser.curr_visit_id,)
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0 #1 success, 0 failure from error, -1 timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success signal or failure notice
            try:
                status = browser.status_queue.get(True, browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    self.logger.critical("BROWSER %i: Received critical error "
                                         "from browser process while executing "
                                         "command %s. Setting failure status." % (
                                             browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info("BROWSER %i: Received failure status while"
                                     " executing command: %s" % (browser.crawl_id, command[0]))
            except EmptyQueue:
                # No reply within the timeout.
                command_succeeded = -1
                self.logger.info("BROWSER %i: Timeout while executing command, "
                                 "%s, killing browser manager" % (browser.crawl_id, command[0]))

            self.sock.send(("INSERT INTO CrawlHistory (crawl_id, command, arguments, bool_success)"
                            " VALUES (?,?,?,?)",
                            (browser.crawl_id, command[0], command_arguments, command_succeeded)))

            if command_succeeded != 1:
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical("BROWSER %i: Command execution failure"
                                         " pushes failure count above the allowable limit."
                                         " Setting failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
            else:
                # A success resets the consecutive-failure counter.
                with self.threadlock:
                    self.failurecount = 0

            if browser.restart_required:
                break

        # Sleep after executing CommandSequence to provide extra time for
        # internal buffers to drain. Stopgap in support of #135
        time.sleep(2)

        if self.closing:
            return

        if browser.restart_required or reset:
            success = browser.restart_browser_manager(clear_profile = reset)
            if not success:
                self.logger.critical("BROWSER %i: Exceeded the maximum allowable "
                                     "consecutive browser launch failures. "
                                     "Setting failure_status." % browser.crawl_id)
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        """ passes <command_sequence> to _distribute_command; see there for <index> """
        self._distribute_command(command_sequence, index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """ goes to a url """
        command_sequence = CommandSequence.CommandSequence(url)
        command_sequence.get(timeout=timeout, sleep=sleep)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)

    def browse(self, url, num_links=2, sleep=0, index=None, timeout=60, reset=False):
        """ browse a website and visit <num_links> links on the page """
        command_sequence = CommandSequence.CommandSequence(url)
        command_sequence.browse(num_links=num_links, sleep=sleep, timeout=timeout)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)

    def close(self):
        """
        Execute shutdown procedure for TaskManager
        """
        if self.closing:
            self.logger.error("TaskManager already closed")
            return
        self._shutdown_manager()
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM

    The TaskManager spawns several child processes to run the automation
    tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process

    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify
                     preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing any
                       not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes
              by name, indiscriminately.
    """

    def __init__(self, manager_params, browser_params,
                 process_watchdog=False):
        """ Start the logging server, aggregator, browsers and watchdog.

        Raises Exception when the number of <browser_params> entries differs
        from manager_params['num_browsers'].
        """
        # Expand '~' in the configured directories and derive the file and
        # directory paths that live underneath them.
        for path in ['data_directory', 'log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(
            manager_params['data_directory'], manager_params['database_name'])
        manager_params['log_file'] = os.path.join(
            manager_params['log_directory'], manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(
            manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(
            manager_params['data_directory'], 'sources')
        self.manager_params = manager_params
        self.browser_params = browser_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same "
                            "as manager_params['num_browsers']")

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(
            *self.manager_params['logger_address'])

        # Initialize the data aggregators
        self._launch_aggregators()

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

        # Save crawl config information to database
        openwpm_v, browser_v = get_version()
        self.data_aggregator.save_configuration(openwpm_v, browser_v)
        self.logger.info(
            get_configuration_string(
                self.manager_params, browser_params, (openwpm_v, browser_v)
            )
        )

    def _initialize_browsers(self, browser_params):
        """ initialize the browser classes, each its unique set of params """
        browsers = list()
        for i in range(self.num_browsers):
            # Each browser gets its own crawl_id from the data aggregator.
            browser_params[i][
                'crawl_id'] = self.data_aggregator.get_next_crawl_id()
            browsers.append(Browser(self.manager_params, browser_params[i]))
        return browsers

    def _launch_browsers(self):
        """ launch each browser manager process / browser """
        for browser in self.browsers:
            try:
                success = browser.launch_browser_manager()
            except Exception:
                # Tear down everything already started, then propagate.
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical("Browser spawn failure during "
                                     "TaskManager initialization, exiting...")
                self.close()
                break

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use

        TODO: process watchdog needs to be updated since `psutil` won't
              kill browser processes started by Selenium 3 (with `subprocess`)
        """
        if self.process_watchdog:
            # The TaskManager has no crawl_id of its own, so the message must
            # not try to format one (doing so raised AttributeError and
            # killed the watchdog thread).
            self.logger.error(
                "Process watchdog is not currently supported.")
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                # psutil.Process(None) inspects the *current* process, so a
                # browser without a pid must be skipped rather than measured.
                if browser.browser_pid is None:
                    continue
                try:
                    process = psutil.Process(browser.browser_pid)
                    mem = process.memory_info()[0] / float(2 ** 20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info("BROWSER %i: Memory usage: %iMB"
                                         ", exceeding limit of %iMB" % (
                                             browser.crawl_id, int(mem),
                                             BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    pass

            # Check for browsers or displays that were not closed correctly
            # 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    if (process.create_time() + 300 < check_time and (
                            (process.name() == 'firefox' and
                             process.pid not in browser_pids) or
                            (process.name() == 'Xvfb' and
                             process.pid not in display_pids))):
                        self.logger.debug("Process: %s (pid: %i) with start "
                                          "time %s found running but not in "
                                          "browser process list. Killing." % (
                                              process.name(), process.pid,
                                              process.create_time()))
                        process.kill()

    def _launch_aggregators(self):
        """Launch the necessary data aggregators"""
        if self.manager_params["output_format"] == "local":
            self.data_aggregator = LocalAggregator.LocalAggregator(
                self.manager_params, self.browser_params)
        elif self.manager_params["output_format"] == "s3":
            self.data_aggregator = S3Aggregator.S3Aggregator(
                self.manager_params, self.browser_params)
        else:
            raise Exception("Unrecognized output format: %s" %
                            self.manager_params["output_format"])
        self.data_aggregator.launch()
        self.manager_params[
            'aggregator_address'] = self.data_aggregator.listener_address

        # open connection to aggregator for saving crawl details
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

    def _kill_aggregators(self):
        """Shutdown any currently running data aggregators"""
        self.data_aggregator.shutdown()

    def _launch_loggingserver(self):
        """ sets up logging server """
        self.logging_status_queue = Queue()
        loggingserver = Process(target=MPLogger.loggingserver,
                                args=(self.manager_params['log_file'],
                                      self.logging_status_queue, ))
        loggingserver.daemon = True
        loggingserver.start()
        return loggingserver

    def _kill_loggingserver(self):
        """ terminates logging server gracefully """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)

    def _shutdown_manager(self, during_init=False):
        """
        Wait for current commands to finish, close all child processes and
        threads

        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)

        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.

        <during_init> flag to indicator if this shutdown is occuring during
                      the TaskManager initialization
        """
        self._shutdown_manager(during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug(
                "TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.",
                    self.failure_status['CommandSequence']
                )
            elif (self.failure_status['ErrorType'] == ("ExceedLaunch"
                                                       "FailureLimit")):
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence']
                )
            if self.failure_status['ErrorType'] == 'CriticalChildException':
                # The payload is the status[1] value stored by
                # _issue_command when a "CRITICAL" status was received.
                reraise(*pickle.loads(self.failure_status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_seq, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        =  #    -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)
        """

        # Block if the aggregator queue is too large
        agg_queue_size = self.data_aggregator.get_most_recent_status()
        while agg_queue_size >= AGGREGATOR_QUEUE_LIMIT:
            self.logger.info(
                "Blocking command submission until the DataAggregator "
                "is below the max queue size of %d. Current queue "
                "length %d. " % (AGGREGATOR_QUEUE_LIMIT, agg_queue_size)
            )
            agg_queue_size = self.data_aggregator.get_status()

        # Last command thread started; stays None when nothing was started
        # (e.g. _start_thread refused because the manager is closing).
        thread = None

        # Distribute command
        if index is None:
            # send to first browser available
            command_executed = False
            while True:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_seq.total_timeout
                        thread = self._start_thread(browser, command_seq)
                        command_executed = True
                        break
                if command_executed:
                    break
                time.sleep(SLEEP_CONS)

        elif index == '*':
            # send the command to all browsers
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in range(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[
                            i].current_timeout = command_seq.total_timeout
                        thread = self._start_thread(
                            self.browsers[i], command_seq)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
        elif index == '**':
            # send the command to all browsers and sync it
            condition = threading.Condition()  # block threads until ready
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in range(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[
                            i].current_timeout = command_seq.total_timeout
                        thread = self._start_thread(
                            self.browsers[i], command_seq, condition)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            with condition:
                condition.notify_all()  # All browsers loaded, start
        elif 0 <= index < len(self.browsers):
            # send the command to this specific browser
            while True:
                if self.browsers[index].ready():
                    self.browsers[
                        index].current_timeout = command_seq.total_timeout
                    thread = self._start_thread(
                        self.browsers[index], command_seq)
                    break
                time.sleep(SLEEP_CONS)
        else:
            self.logger.info(
                "Command index type is not supported or out of range")
            return

        # Only wait when a thread was actually started; joining None (or an
        # unbound name) used to raise instead of returning cleanly.
        if command_seq.blocking and thread is not None:
            thread.join()
            self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """ starts the command execution thread

        Returns the started thread, or None when the TaskManager is closing.
        """

        # Check status flags before starting thread
        if self.closing:
            self.logger.error(
                "Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        browser.set_visit_id(self.data_aggregator.get_next_visit_id())
        self.sock.send(("site_visits", {
            "visit_id": browser.curr_visit_id,
            "crawl_id": browser.crawl_id,
            "site_url": command_sequence.url
        }))

        # Start command execution thread
        args = (browser, command_sequence, condition)
        thread = threading.Thread(target=self._issue_command, args=args)
        browser.command_thread = thread
        thread.daemon = True
        thread.start()
        return thread

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        sends command tuple to the BrowserManager
        """
        browser.is_fresh = False

        # if this is a synced call, block on condition
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        start_time = None
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            if command[0] in ['GET', 'BROWSE',
                              'SAVE_SCREENSHOT',
                              'SCREENSHOT_FULL_PAGE',
                              'DUMP_PAGE_SOURCE',
                              'RECURSIVE_DUMP_PAGE_SOURCE']:
                start_time = time.time()
                command += (browser.curr_visit_id,)
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                # start_time is the one recorded by the most recent command
                # of the group above (None if there was none).
                command += (start_time, browser.curr_visit_id,)
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0  # 1 success, 0 error, -1 timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success or failure
            try:
                status = browser.status_queue.get(
                    True, browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    self.logger.critical(
                        "BROWSER %i: Received critical error from browser "
                        "process while executing command %s. Setting failure "
                        "status." % (browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info(
                        "BROWSER %i: Received failure status while executing "
                        "command: %s" % (browser.crawl_id, command[0]))
            except EmptyQueue:
                command_succeeded = -1
                self.logger.info(
                    "BROWSER %i: Timeout while executing command, %s, killing "
                    "browser manager" % (browser.crawl_id, command[0]))

            self.sock.send(("crawl_history", {
                "crawl_id": browser.crawl_id,
                "visit_id": browser.curr_visit_id,
                "command": command[0],
                "arguments": command_arguments,
                "bool_success": command_succeeded
            }))

            if command_succeeded != 1:
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical(
                        "BROWSER %i: Command execution failure pushes failure "
                        "count above the allowable limit. Setting "
                        "failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
                self.logger.debug("BROWSER %i: Browser restart required" % (
                    browser.crawl_id))
            else:
                with self.threadlock:
                    self.failurecount = 0

            # Abandon the rest of the sequence once a restart is pending.
            if browser.restart_required:
                break

        # Sleep after executing CommandSequence to provide extra time for
        # internal buffers to drain. Stopgap in support of #135
        time.sleep(2)

        if self.closing:
            return

        if browser.restart_required or reset:
            success = browser.restart_browser_manager(clear_profile=reset)
            if not success:
                self.logger.critical(
                    "BROWSER %i: Exceeded the maximum allowable consecutive "
                    "browser launch failures. Setting failure_status." % (
                        browser.crawl_id))
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        """ Hand <command_sequence> to `_distribute_command` for <index>. """
        self._distribute_command(command_sequence, index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """ goes to a url """
        command_sequence = CommandSequence.CommandSequence(url)
        command_sequence.get(timeout=timeout, sleep=sleep)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)

    def browse(self, url, num_links=2, sleep=0, index=None,
               timeout=60, reset=False):
        """ browse a website and visit <num_links> links on the page """
        command_sequence = CommandSequence.CommandSequence(url)
        command_sequence.browse(
            num_links=num_links, sleep=sleep, timeout=timeout)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)

    def close(self):
        """
        Execute shutdown procedure for TaskManager
        """
        if self.closing:
            self.logger.error("TaskManager already closed")
            return
        self._shutdown_manager()
def deploy_firefox(
    status_queue: Queue,
    browser_params: BrowserParamsInternal,
    manager_params: ManagerParamsInternal,
    crash_recovery: bool,
) -> Tuple[webdriver.Firefox, str, Optional[Display]]:
    """
    launches a firefox instance with parameters set by the input dictionary

    Progress is reported by putting ("STATUS", <stage>, <payload>) tuples on
    <status_queue>, in this order: "Profile Created", "Profile Tar",
    "Display", "Launch Attempted", "Browser Launched".

    Returns the webdriver, the value of its "moz:profile" capability and the
    Xvfb Display (None unless display_mode is "xvfb").

    Raises RuntimeError when Xvfb cannot be started or when the Firefox
    process id cannot be determined.
    """
    firefox_binary_path = get_firefox_binary_path()

    root_dir = os.path.dirname(__file__)  # directory of this file

    fp = FirefoxProfile()
    browser_profile_path = Path(fp.path)
    status_queue.put(("STATUS", "Profile Created", browser_profile_path))

    # Use Options instead of FirefoxProfile to set preferences since the
    # Options method has no "frozen"/restricted options.
    # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
    fo = Options()
    assert browser_params.browser_id is not None
    if browser_params.seed_tar and not crash_recovery:
        logger.info("BROWSER %i: Loading initial browser profile from: %s" %
                    (browser_params.browser_id, browser_params.seed_tar))
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.seed_tar,
        )
    elif browser_params.recovery_tar:
        logger.debug("BROWSER %i: Loading recovered browser profile from: %s" %
                     (browser_params.browser_id, browser_params.recovery_tar))
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.recovery_tar,
        )
    status_queue.put(("STATUS", "Profile Tar", None))

    display_mode = browser_params.display_mode
    display_pid = None
    display_port = None
    display = None
    if display_mode == "headless":
        fo.headless = True
        fo.add_argument("--width={}".format(DEFAULT_SCREEN_RES[0]))
        fo.add_argument("--height={}".format(DEFAULT_SCREEN_RES[1]))
    if display_mode == "xvfb":
        try:
            display = Display(visible=0, size=DEFAULT_SCREEN_RES)
            display.start()
            display_pid, display_port = display.pid, display.display
        except EasyProcessError as err:
            # Adjacent literals keep the message free of the indentation
            # whitespace that backslash continuations inside a string embed;
            # the original error is chained as the cause.
            raise RuntimeError(
                "Xvfb could not be started. "
                "Please ensure it's on your path. "
                "See www.X.org for full details. "
                "Commonly solved on ubuntu with `sudo apt install xvfb`"
            ) from err
    # Must do this for all display modes,
    # because status_queue is read off no matter what.
    status_queue.put(("STATUS", "Display", (display_pid, display_port)))

    if browser_params.extension_enabled:
        # Write config file
        extension_config: Dict[str, Any] = dict()
        extension_config.update(browser_params.to_dict())
        extension_config["logger_address"] = manager_params.logger_address
        extension_config[
            "storage_controller_address"
        ] = manager_params.storage_controller_address
        extension_config["testing"] = manager_params.testing
        ext_config_file = browser_profile_path / "browser_params.json"
        with open(ext_config_file, "w") as f:
            json.dump(extension_config, f, cls=ConfigEncoder)
        logger.debug("BROWSER %i: Saved extension config file to: %s" %
                     (browser_params.browser_id, ext_config_file))

        # TODO restore detailed logging
        # fo.set_preference("*****@*****.**", "all")

    # Configure privacy settings
    configure_firefox.privacy(browser_params, fp, fo, root_dir,
                              browser_profile_path)

    # Set various prefs to improve speed and eliminate traffic to Mozilla
    configure_firefox.optimize_prefs(fo)

    # Intercept logging at the Selenium level and redirect it to the
    # main logger.  This will also inform us where the real profile
    # directory is hiding.
    interceptor = FirefoxLogInterceptor(browser_params.browser_id,
                                        browser_profile_path)
    interceptor.start()

    # Set custom prefs. These are set after all of the default prefs to allow
    # our defaults to be overwritten.
    for name, value in browser_params.prefs.items():
        logger.info("BROWSER %i: Setting custom preference: %s = %s" %
                    (browser_params.browser_id, name, value))
        fo.set_preference(name, value)

    # Launch the webdriver
    status_queue.put(("STATUS", "Launch Attempted", None))
    fb = FirefoxBinary(firefox_path=firefox_binary_path)
    driver = webdriver.Firefox(
        firefox_profile=fp,
        firefox_binary=fb,
        firefox_options=fo,
        log_path=interceptor.fifo,
    )

    # Add extension
    if browser_params.extension_enabled:
        # Install extension
        ext_loc = os.path.join(root_dir, "../Extension/firefox/openwpm.xpi")
        ext_loc = os.path.normpath(ext_loc)
        driver.install_addon(ext_loc, temporary=True)
        logger.debug("BROWSER %i: OpenWPM Firefox extension loaded" %
                     browser_params.browser_id)

    # set window size
    driver.set_window_size(*DEFAULT_SCREEN_RES)

    # Get browser process pid
    if hasattr(driver, "service") and hasattr(driver.service, "process"):
        pid = driver.service.process.pid
    elif hasattr(driver, "binary") and hasattr(driver.binary, "process"):
        pid = driver.binary.process.pid
    else:
        raise RuntimeError("Unable to identify Firefox process ID.")

    status_queue.put(("STATUS", "Browser Launched", int(pid)))

    return driver, driver.capabilities["moz:profile"], display
class DataStreamRecorder(Process):
    """Process that repeatedly samples a data source into per-segment queues.

    Commands are sent to the running process as tuples on an internal
    command queue; samples are stored as (timestamp, data) pairs.
    """

    def __init__(self,
                 name,
                 data_sampler_method,
                 cache_path=None,
                 save_every=50):
        """ Initializes a DataStreamRecorder

        Parameters
        ----------
            name : string
                    User-friendly identifier for this data stream
            data_sampler_method : function
                    Method to call to retrieve data
            cache_path : string or None
                    When given, caching is enabled and segments are saved
                    under <cache_path>/<name> (created if missing)
            save_every : int
                    Queue size at which the current segment is written to
                    the cache (only used when caching is enabled)
        """
        Process.__init__(self)
        self._data_sampler_method = data_sampler_method

        self._has_set_sampler_params = False
        self._recording = False

        self._name = name

        self._cmds_q = Queue()
        self._data_qs = [Queue()]
        self._ok_q = None
        self._tokens_q = None

        self._save_every = save_every
        self._cache_path = cache_path
        self._saving_cache = cache_path is not None
        if self._saving_cache:
            self._save_path = os.path.join(cache_path, self.name)
            if not os.path.exists(self._save_path):
                os.makedirs(self._save_path)

        self._start_data_segment = 0
        self._cur_data_segment = 0
        self._saving_ps = []

    def run(self):
        """Process main loop: handle queued commands and record samples.

        Runs until a ('stop',) command is received. A KeyboardInterrupt
        exits the process with status 0.
        """
        setproctitle('python.DataStreamRecorder.{0}'.format(self._name))
        try:
            logging.debug("Starting data recording on {0}".format(self.name))
            self._tokens_q.put(("return", self.name))
            while True:
                if not self._cmds_q.empty():
                    cmd = self._cmds_q.get()
                    if cmd[0] == 'stop':
                        break
                    elif cmd[0] == 'pause':
                        self._recording = False
                        if self._saving_cache:
                            # Persist the current segment and open a new one.
                            self._save_cache(self._cur_data_segment)
                            self._cur_data_segment += 1
                            self._data_qs.append(Queue())
                    elif cmd[0] == 'reset_data_segment':
                        self._start_data_segment = self._cur_data_segment
                    elif cmd[0] == 'resume':
                        self._recording = True
                    elif cmd[0] == 'save':
                        self._save_data(cmd[1], cmd[2], cmd[3])
                    elif cmd[0] == 'params':
                        self._args = cmd[1]
                        self._kwargs = cmd[2]

                if self._recording and not self._ok_q.empty():
                    timestamp = self._ok_q.get()
                    self._tokens_q.put(("take", self.name))

                    data = self._data_sampler_method(*self._args,
                                                     **self._kwargs)

                    cur_data_q = self._data_qs[self._cur_data_segment]
                    if (self._saving_cache and
                            cur_data_q.qsize() == self._save_every):
                        # Segment is full: flush it and start a fresh one.
                        self._save_cache(self._cur_data_segment)
                        cur_data_q = Queue()
                        self._data_qs.append(cur_data_q)
                        self._cur_data_segment += 1
                    cur_data_q.put((timestamp, data))

                    self._tokens_q.put(("return", self.name))

        except KeyboardInterrupt:
            logging.debug("Shutting down data streamer on {0}".format(
                self.name))
            sys.exit(0)

    def _extract_q(self, i):
        """Drain data queue <i> into a list and drop the queue.

        The slot in the queue list is set to None afterwards.
        """
        q = self._data_qs[i]
        vals = []
        while q.qsize() > 0:
            vals.append(q.get())
        self._data_qs[i] = None
        del q
        return vals

    def _save_data(self, path, cb, concat):
        """Write recorded data to <path>/<name>.jb in a child process.

        With caching enabled, waits for pending cache writers and then
        merges cached segments; otherwise dumps the contents of segment 0.
        """
        if not os.path.exists(path):
            os.makedirs(path)
        target_filename = os.path.join(path, "{0}.jb".format(self.name))
        if self._saving_cache:
            # Wait until every cache-writing process has finished.
            while any(p.is_alive() for p in self._saving_ps):
                sleep(1e-3)

            p = Process(target=_caches_to_file,
                        args=(self._save_path, self._start_data_segment,
                              self._cur_data_segment, target_filename, cb,
                              concat))
            p.start()
            self._start_data_segment = self._cur_data_segment
        else:
            data = self._extract_q(0)
            p = Process(target=_dump, args=(data, target_filename, cb))
            p.start()

    def _save_cache(self, i):
        """Write data segment <i> to the cache directory in a child process.

        Raises Exception when caching is not enabled.
        """
        # Check the caching flag; `self._save_cache` is this bound method and
        # is always truthy, so testing it could never trigger the guard.
        if not self._saving_cache:
            raise Exception(
                "Cannot save cache if no cache path was specified.")
        logging.debug("Saving cache for {0} block {1}".format(
            self.name, self._cur_data_segment))
        data = self._extract_q(i)
        p = Process(target=_dump_cache,
                    args=(data,
                          os.path.join(self._save_path, "{0}.jb".format(
                              self._cur_data_segment)), self.name,
                          self._cur_data_segment))
        p.start()
        self._saving_ps.append(p)

    def _start_recording(self, *args, **kwargs):
        """ Starts recording

        Parameters
        ----------
            *args : any
                    Ordinary args used for calling the specified data sampler method

            **kwargs : any
                    Keyword args used for calling the specified data sampler method

        """
        # Discard any stale commands and samples before starting the process.
        while not self._cmds_q.empty():
            self._cmds_q.get_nowait()
        while not self._data_qs[self._cur_data_segment].empty():
            self._data_qs[self._cur_data_segment].get_nowait()

        self._args = args
        self._kwargs = kwargs

        self._recording = True
        self.start()

    @property
    def name(self):
        """User-friendly identifier given at construction."""
        return self._name

    def _set_qs(self, ok_q, tokens_q):
        """Attach the queues used by `run` for timestamps and tokens."""
        self._ok_q = ok_q
        self._tokens_q = tokens_q

    def _flush(self):
        """ Returns a list of all current data

        When caching is enabled nothing is returned (None); a
        'reset_data_segment' command is queued instead.
        Raises Exception when called while recording.
        """
        if self._recording:
            raise Exception("Cannot flush data queue while recording!")
        if self._saving_cache:
            logging.warning(
                "Flush when using cache means unsaved data will be lost and not returned!"
            )
            self._cmds_q.put(("reset_data_segment", ))
        else:
            data = self._extract_q(0)
            return data

    def save_data(self, path, cb=_NULL, concat=True):
        """Queue a 'save' command for <path>.

        <cb> and <concat> are forwarded unchanged to the save routine.
        Raises Exception when called while recording.
        """
        if self._recording:
            raise Exception("Cannot save data while recording!")
        self._cmds_q.put(("save", path, cb, concat))

    def _stop(self):
        """ Stops recording. Returns all recorded data and their timestamps. Destroys recorder process."""
        self._pause()
        self._cmds_q.put(("stop", ))
        try:
            # NOTE(review): no `_recorder` attribute is assigned anywhere in
            # this class, so this call presumably always raises and is
            # swallowed; the queued 'stop' command is what ends `run`.
            self._recorder.terminate()
        except Exception:
            pass
        self._recording = False

    def _pause(self):
        """ Pauses recording """
        self._cmds_q.put(("pause", ))
        self._recording = False

    def _resume(self):
        """ Resumes recording """
        self._cmds_q.put(("resume", ))
        self._recording = True

    def change_data_sampler_params(self, *args, **kwargs):
        """ Changes args and kwargs for data sampler method

        Parameters
        ----------
            *args : any
                    Ordinary args used for calling the specified data sampler method

            **kwargs : any
                    Keyword args used for calling the specified data sampler method

        """
        self._cmds_q.put(('params', args, kwargs))
def execute_parallel(farg_pairs, num_procs=None, verbose=False):
    """Run each job in its own process, at most `num_procs` at a time.

    Parameters
    ----------
    farg_pairs : sequence
        One entry per job: `(func,)` or `(func, args)`, where `args` is the
        positional-argument tuple passed to `func`.
    num_procs : int or None
        Maximum number of concurrently running processes. When None, 75% of
        the CPU count (rounded up) is used.
    verbose : bool
        Print progress information.

    Returns
    -------
    tuple
        The jobs' return values, ordered like `farg_pairs` (empty tuple for
        an empty input).
    """
    # see https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/
    from multiprocess import Process, Queue, cpu_count

    if num_procs is None:
        # leave 25%
        num_procs = int(math.ceil(cpu_count() * .75))
        print("using %d procs in execute parallel" % num_procs)

    q = Queue()
    processes = []
    results = []
    farg_pairs = list(farg_pairs)
    num_jobs = len(farg_pairs)
    if verbose:
        print("execute_parallel num_procs=%d, num_jobs=%d" %
              (num_procs, num_jobs))

    def make_target(job_index, func):
        """Bind the job index and callable for one worker process."""
        def target_func(*args, **kwargs):
            q.put((job_index, func(*args, **kwargs)))
        return target_func

    def reap(procs):
        """Join every finished process and return those still running."""
        running = []
        for proc in procs:
            if proc.exitcode is None:
                running.append(proc)
            else:
                proc.join()
        return running

    for i, farg_pair in enumerate(farg_pairs):
        if verbose:
            print("running job %d" % i)

        target = make_target(i, farg_pair[0])
        if len(farg_pair) > 1:
            p = Process(target=target, args=farg_pair[1])
        else:
            p = Process(target=target)
        p.start()
        processes.append(p)

        # Wait until we drop below num_procs. Keep draining the queue while
        # waiting (a worker cannot exit until its result has been consumed)
        # and only drop processes that have actually exited, so a worker
        # that is still running is never removed from the list unjoined.
        while len(processes) >= num_procs:
            if not q.empty():
                results.append(q.get())
            else:
                time.sleep(0.01)
            processes = reap(processes)

    while len(results) < num_jobs:
        results.append(q.get())

    assert len(results) == num_jobs

    # join remaining processes before exiting
    for p in processes:
        p.join()

    # Restore submission order; sort on the job index only so the returned
    # values themselves never need to be comparable.
    ordered = sorted(results, key=lambda pair: pair[0])
    return tuple(value for _, value in ordered)
class TaskManager:
    """
    User-facing Class for interfacing with OpenWPM

    The TaskManager spawns several child processes to run the automation tasks.
        - DataAggregator to aggregate data in a SQLite database
        - MPLogger to aggregate logs across processes
        - BrowserManager processes to isolate Browsers in a separate process

    <manager_params> dict of TaskManager configuration parameters
    <browser_params> is a list of (or a single) dictionaries that specify
        preferences for browsers to instantiate
    <process_watchdog> will monitor firefox and Xvfb processes, killing any
        not indexed in TaskManager's browser list.
        NOTE: Only run this in isolated environments. It kills processes
        by name, indiscriminately.
    """

    def __init__(self, manager_params, browser_params,
                 process_watchdog=False):
        """ Prepare directories, database, logging, aggregators and browsers

        Raises an Exception when the number of <browser_params> entries does
        not match manager_params['num_browsers'].
        """
        # Make paths absolute in manager_params
        for path in ['data_directory', 'log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(
            manager_params['data_directory'], manager_params['database_name'])
        manager_params['log_file'] = os.path.join(
            manager_params['log_directory'], manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(
            manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(
            manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception(
                "Number of <browser_params> dicts is not the same as manager_params['num_browsers']"
            )

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'),
                  'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(
            *self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(
            browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()

    def _save_configuration(self, browser_params):
        """ Saves crawl configuration details to db and logfile

        Also stores the generated row id of each crawl entry back into the
        matching <browser_params> dict under 'crawl_id'.
        """
        cur = self.db.cursor()

        # Get git version and commit information
        openwpm_v, browser_v = get_version()

        # Record task details
        cur.execute(("INSERT INTO task "
                     "(manager_params, openwpm_version, browser_version) "
                     "VALUES (?,?,?)"),
                    (json.dumps(self.manager_params), openwpm_v, browser_v))
        self.db.commit()
        self.task_id = cur.lastrowid

        # Record browser details for each browser
        for i in xrange(self.num_browsers):
            cur.execute(
                "INSERT INTO crawl (task_id, browser_params) VALUES (?,?)",
                (self.task_id, json.dumps(browser_params[i])))
            self.db.commit()
            browser_params[i]['crawl_id'] = cur.lastrowid

        # Print the configuration details
        self.logger.info(
            get_configuration_string(self.manager_params, browser_params,
                                     (openwpm_v, browser_v)))

    def _initialize_browsers(self, browser_params):
        """ initialize the browser classes, each its unique set of parameters """
        browsers = list()
        for i in xrange(self.num_browsers):
            browsers.append(Browser(self.manager_params, browser_params[i]))
        return browsers

    def _launch_browsers(self):
        """ launch each browser manager process / browser """
        for browser in self.browsers:
            try:
                success = browser.launch_browser_manager()
            except BaseException:
                # Whatever went wrong (including interrupts), tear down the
                # child processes first and then let the error propagate.
                self._cleanup_before_fail(during_init=True)
                raise

            if not success:
                self.logger.critical(
                    "Browser spawn failure during TaskManager initialization, exiting..."
                )
                self.close()
                break

            # Update our DB with the random browser settings
            # These are found within the scope of each instance of Browser in the browsers list
            screen_res = str(browser.browser_settings['screen_res'])
            ua_string = str(browser.browser_settings['ua_string'])
            self.sock.send(("UPDATE crawl SET screen_res = ?, ua_string = ? "
                            "WHERE crawl_id = ?",
                            (screen_res, ua_string, browser.crawl_id)))

    def _manager_watchdog(self):
        """
        Periodically checks the following:
        - memory consumption of all browsers every 10 seconds
        - presence of processes that are no longer in use
        """
        while not self.closing:
            time.sleep(10)

            # Check browser memory usage
            for browser in self.browsers:
                try:
                    process = psutil.Process(browser.browser_pid)
                    mem = process.memory_info()[0] / float(2**20)
                    if mem > BROWSER_MEMORY_LIMIT:
                        self.logger.info(
                            "BROWSER %i: Memory usage: %iMB, exceeding limit of %iMB"
                            % (browser.crawl_id, int(mem),
                               BROWSER_MEMORY_LIMIT))
                        browser.restart_required = True
                except psutil.NoSuchProcess:
                    pass

            # Check for browsers or displays that were not closed correctly
            # Provide a 300 second buffer to avoid killing freshly launched browsers
            # TODO This buffer should correspond to the maximum browser spawn timeout
            if self.process_watchdog:
                browser_pids = set()
                display_pids = set()
                check_time = time.time()
                for browser in self.browsers:
                    if browser.browser_pid is not None:
                        browser_pids.add(browser.browser_pid)
                    if browser.display_pid is not None:
                        display_pids.add(browser.display_pid)
                for process in psutil.process_iter():
                    if (process.create_time() + 300 < check_time and
                        ((process.name() == 'firefox' and
                          process.pid not in browser_pids) or
                         (process.name() == 'Xvfb' and
                          process.pid not in display_pids))):
                        self.logger.debug(
                            "Process: %s (pid: %i) with start time %s found running but not in browser process list. Killing."
                            % (process.name(), process.pid,
                               process.create_time()))
                        process.kill()

    def _launch_aggregators(self):
        """
        Launches the various data aggregators, which serialize data from all processes.
        * DataAggregator - sqlite database for crawl data
        * LevelDBAggregator - leveldb database for javascript files
        """
        # DataAggregator
        self.aggregator_status_queue = Queue()
        self.data_aggregator = Process(
            target=DataAggregator.DataAggregator,
            args=(self.manager_params, self.aggregator_status_queue))
        self.data_aggregator.daemon = True
        self.data_aggregator.start()
        # socket location: (address, port)
        self.manager_params[
            'aggregator_address'] = self.aggregator_status_queue.get()

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.ldb_status_queue = Queue()
            self.ldb_aggregator = Process(
                target=LevelDBAggregator.LevelDBAggregator,
                args=(self.manager_params, self.ldb_status_queue))
            self.ldb_aggregator.daemon = True
            self.ldb_aggregator.start()
            # socket location: (address, port)
            self.manager_params['ldb_address'] = self.ldb_status_queue.get()

    def _kill_aggregators(self):
        """ Terminates the aggregators gracefully """
        # DataAggregator
        self.logger.debug("Telling the DataAggregator to shut down...")
        self.aggregator_status_queue.put("DIE")
        start_time = time.time()
        self.data_aggregator.join(300)
        self.logger.debug("DataAggregator took " +
                          str(time.time() - start_time) +
                          " seconds to close")

        # LevelDB Aggregator
        if self.ldb_enabled:
            self.logger.debug("Telling the LevelDBAggregator to shut down...")
            self.ldb_status_queue.put("DIE")
            start_time = time.time()
            self.ldb_aggregator.join(300)
            self.logger.debug("LevelDBAggregator took " +
                              str(time.time() - start_time) +
                              " seconds to close")

    def _launch_loggingserver(self):
        """ sets up logging server and returns its (daemon) process """
        self.logging_status_queue = Queue()
        loggingserver = Process(target=MPLogger.loggingserver,
                                args=(
                                    self.manager_params['log_file'],
                                    self.logging_status_queue,
                                ))
        loggingserver.daemon = True
        loggingserver.start()
        return loggingserver

    def _kill_loggingserver(self):
        """ terminates logging server gracefully """
        self.logging_status_queue.put("DIE")
        self.loggingserver.join(300)

    def _shutdown_manager(self, failure=False, during_init=False):
        """
        Wait for current commands to finish, close all child processes and threads

        <failure> flag to indicate manager failure (True) or end of crawl (False)
        <during_init> flag to indicate if this shutdown is occurring during
            the TaskManager initialization
        """
        self.closing = True

        for browser in self.browsers:
            browser.shutdown_browser(during_init)
            if failure:
                self.sock.send(
                    ("UPDATE crawl SET finished = -1 WHERE crawl_id = ?",
                     (browser.crawl_id, )))
            else:
                self.sock.send(
                    ("UPDATE crawl SET finished = 1 WHERE crawl_id = ?",
                     (browser.crawl_id, )))

        self.db.close()  # close db connection
        self.sock.close()  # close socket to data aggregator
        self._kill_aggregators()
        self._kill_loggingserver()

    def _cleanup_before_fail(self, during_init=False):
        """
        Execute shutdown commands before throwing an exception
        This should keep us from having a bunch of hanging processes
        and incomplete data.

        <during_init> flag to indicate if this shutdown is occurring during
            the TaskManager initialization
        """
        self._shutdown_manager(failure=True, during_init=during_init)

    def _check_failure_status(self):
        """ Check the status of command failures. Raise exceptions as necessary

        The failure status property is used by the various asynchronous
        command execution threads which interface with the
        remote browser manager processes. If a failure status is found, the
        appropriate steps are taken to gracefully close the infrastructure
        """
        self.logger.debug("Checking command failure status indicator...")
        if self.failure_status:
            self.logger.debug(
                "TaskManager failure status set, halting command execution.")
            self._cleanup_before_fail()
            if self.failure_status['ErrorType'] == 'ExceedCommandFailureLimit':
                raise CommandExecutionError(
                    "TaskManager exceeded maximum consecutive command "
                    "execution failures.",
                    self.failure_status['CommandSequence'])
            elif self.failure_status[
                    'ErrorType'] == 'ExceedLaunchFailureLimit':
                raise CommandExecutionError(
                    "TaskManager failed to launch browser within allowable "
                    "failure limit.", self.failure_status['CommandSequence'])
            if self.failure_status['ErrorType'] == 'CriticalChildException':
                # NOTE(review): the pickled exception is produced by the
                # browser child process; never feed this externally
                # supplied data.
                reraise(*cPickle.loads(self.failure_status['Exception']))

    # CRAWLER COMMAND CODE

    def _distribute_command(self, command_sequence, index=None):
        """
        parses command type and issues command(s) to the proper browser
        <index> specifies the type of command this is:
        = None  -> first come, first serve
        = #     -> index of browser to send command to
        = *     -> sends command to all browsers
        = **    -> sends command to all browsers (synchronized)

        For a blocking command sequence only the most recently started
        command thread is joined.
        """
        # Stays None when no thread could be started (no browsers, or the
        # manager is already closing and _start_thread returned nothing).
        thread = None

        if index is None:
            # send to first browser available
            command_executed = False
            while True:
                for browser in self.browsers:
                    if browser.ready():
                        browser.current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(browser, command_sequence)
                        command_executed = True
                        break
                if command_executed:
                    break
                time.sleep(SLEEP_CONS)
        # The string selectors are tested before the numeric range check so
        # that '*' and '**' are never compared against integers.
        elif index == '*':
            # send the command to all browsers
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[
                            i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i],
                                                    command_sequence)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
        elif index == '**':
            # send the command to all browsers and sync it
            condition = threading.Condition(
            )  # Used to block threads until ready
            command_executed = [False] * len(self.browsers)
            while False in command_executed:
                for i in xrange(len(self.browsers)):
                    if self.browsers[i].ready() and not command_executed[i]:
                        self.browsers[
                            i].current_timeout = command_sequence.total_timeout
                        thread = self._start_thread(self.browsers[i],
                                                    command_sequence,
                                                    condition)
                        command_executed[i] = True
                time.sleep(SLEEP_CONS)
            with condition:
                condition.notifyAll(
                )  # All browsers loaded, tell them to start
        elif 0 <= index < len(self.browsers):
            # send the command to this specific browser
            while True:
                if self.browsers[index].ready():
                    self.browsers[
                        index].current_timeout = command_sequence.total_timeout
                    thread = self._start_thread(self.browsers[index],
                                                command_sequence)
                    break
                time.sleep(SLEEP_CONS)
        else:
            self.logger.info(
                "Command index type is not supported or out of range")
            return

        if command_sequence.blocking and thread is not None:
            thread.join()
            self._check_failure_status()

    def _start_thread(self, browser, command_sequence, condition=None):
        """ starts the command execution thread

        Returns the started thread, or None when the manager is closing.
        """
        # Check status flags before starting thread
        if self.closing:
            self.logger.error(
                "Attempted to execute command on a closed TaskManager")
            return
        self._check_failure_status()

        browser.set_visit_id(self.next_visit_id)
        self.sock.send((
            "INSERT INTO site_visits (visit_id, crawl_id, site_url) VALUES (?,?,?)",
            (self.next_visit_id, browser.crawl_id, command_sequence.url)))
        self.next_visit_id += 1

        # Start command execution thread
        args = (browser, command_sequence, condition)
        thread = threading.Thread(target=self._issue_command, args=args)
        browser.command_thread = thread
        thread.daemon = True
        thread.start()
        return thread

    def _issue_command(self, browser, command_sequence, condition=None):
        """
        sends command tuple to the BrowserManager

        Runs in its own thread: sends each command of the sequence in turn,
        records the outcome, and restarts the browser manager when a command
        failed or the sequence requests a reset.
        """
        # since we are issuing a command, the BrowserManager is no longer
        # a fresh instance
        browser.is_fresh = False

        # if this is a synced call, block on condition
        if condition is not None:
            with condition:
                condition.wait()

        reset = command_sequence.reset
        # tracks when a site visit started, so that flash/profile
        # cookies can be properly tracked.
        start_time = None
        for command_and_timeout in command_sequence.commands_with_timeout:
            command, timeout = command_and_timeout
            if command[0] in ['GET', 'BROWSE']:
                start_time = time.time()
                command += (browser.curr_visit_id, )
            elif command[0] in ['DUMP_FLASH_COOKIES', 'DUMP_PROFILE_COOKIES']:
                command += (
                    start_time,
                    browser.curr_visit_id,
                )
            browser.current_timeout = timeout
            # passes off command and waits for a success (or failure signal)
            browser.command_queue.put(command)
            command_succeeded = 0  # 1 success, 0 failure from error, -1 timeout
            command_arguments = command[1] if len(command) > 1 else None

            # received reply from BrowserManager, either success signal or failure notice
            try:
                status = browser.status_queue.get(True,
                                                  browser.current_timeout)
                if status == "OK":
                    command_succeeded = 1
                elif status[0] == "CRITICAL":
                    self.logger.critical(
                        "BROWSER %i: Received critical error "
                        "from browser process while executing "
                        "command %s. Setting failure status." %
                        (browser.crawl_id, str(command)))
                    self.failure_status = {
                        'ErrorType': 'CriticalChildException',
                        'CommandSequence': command_sequence,
                        'Exception': status[1]
                    }
                    return
                else:
                    command_succeeded = 0
                    self.logger.info(
                        "BROWSER %i: Received failure status while"
                        " executing command: %s" %
                        (browser.crawl_id, command[0]))
            except EmptyQueue:
                command_succeeded = -1
                self.logger.info(
                    "BROWSER %i: Timeout while executing command, "
                    "%s, killing browser manager" %
                    (browser.crawl_id, command[0]))

            self.sock.send((
                "INSERT INTO CrawlHistory (crawl_id, command, arguments, bool_success)"
                " VALUES (?,?,?,?)",
                (browser.crawl_id, command[0], command_arguments,
                 command_succeeded)))

            if command_succeeded != 1:
                with self.threadlock:
                    self.failurecount += 1
                if self.failurecount > self.failure_limit:
                    self.logger.critical(
                        "BROWSER %i: Command execution failure"
                        " pushes failure count above the allowable limit."
                        " Setting failure_status." % browser.crawl_id)
                    self.failure_status = {
                        'ErrorType': 'ExceedCommandFailureLimit',
                        'CommandSequence': command_sequence
                    }
                    return
                browser.restart_required = True
            else:
                with self.threadlock:
                    self.failurecount = 0

            # Skip the remaining commands once a restart is pending.
            if browser.restart_required:
                break

        if self.closing:
            return

        if browser.restart_required or reset:
            success = browser.restart_browser_manager(clear_profile=reset)
            if not success:
                self.logger.critical(
                    "BROWSER %i: Exceeded the maximum allowable "
                    "consecutive browser launch failures. "
                    "Setting failure_status." % browser.crawl_id)
                self.failure_status = {
                    'ErrorType': 'ExceedLaunchFailureLimit',
                    'CommandSequence': command_sequence
                }
                return
            browser.restart_required = False

    def execute_command_sequence(self, command_sequence, index=None):
        """ Hand a prepared CommandSequence to the browser(s) picked by <index> """
        self._distribute_command(command_sequence, index)

    # DEFINITIONS OF HIGH LEVEL COMMANDS
    # NOTE: These wrappers are provided for convenience. To issue sequential
    # commands to the same browser in a single 'visit', use the CommandSequence
    # class directly.

    def get(self, url, index=None, timeout=60, sleep=0, reset=False):
        """ goes to a url """
        command_sequence = CommandSequence.CommandSequence(url)
        command_sequence.get(timeout=timeout, sleep=sleep)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)

    def browse(self,
               url,
               num_links=2,
               sleep=0,
               index=None,
               timeout=60,
               reset=False):
        """ browse a website and visit <num_links> links on the page """
        command_sequence = CommandSequence.CommandSequence(url)
        # NOTE(review): <num_links> is never used; this issues a plain
        # get(). Presumably a browse-style command was intended - confirm
        # against the CommandSequence API before changing.
        command_sequence.get(sleep=sleep, timeout=timeout)
        command_sequence.reset = reset
        self.execute_command_sequence(command_sequence, index=index)

    def close(self):
        """
        Execute shutdown procedure for TaskManager
        """
        if self.closing:
            self.logger.error("TaskManager already closed")
            return
        self._shutdown_manager()
class BaseAggregator:
    """Base class for the data aggregator interface.

    This class is used alongside the BaseListener class to spawn an
    aggregator process that combines data from multiple crawl processes.
    The BaseAggregator class manages the child listener process.

    Parameters
    ----------
    manager_params : ManagerParamsInternal
        TaskManager configuration parameters
    browser_params : list of BrowserParamsInternal
        List of browser configuration class<BrowserParams>"""

    # NOTE(review): the annotated signature below requires Python 3, where a
    # `__metaclass__` attribute has no effect, so the abstract methods are
    # not actually enforced. Switching to `metaclass=abc.ABCMeta` would
    # change instantiation behaviour for existing subclasses - confirm first.
    __metaclass__ = abc.ABCMeta

    def __init__(
        self,
        manager_params: ManagerParamsInternal,
        browser_params: List[BrowserParamsInternal],
    ):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.listener_address = None
        self.listener_process = None
        self.status_queue = Queue()
        self.completion_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status value and the time.time() at which it arrived;
        # both stay None until the first status update has been read.
        self._last_status = None
        self._last_status_received = None
        self.logger = logging.getLogger("openwpm")

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_browser_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process

        Raises
        ------
        RuntimeError
            If no status update arrived within STATUS_TIMEOUT seconds.
        """
        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received))

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block.

        Raises
        ------
        RuntimeError
            If nothing arrives on the status queue within STATUS_TIMEOUT
            seconds.
        """
        try:
            self._last_status = self.status_queue.get(block=True,
                                                      timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            if self._last_status_received is None:
                # Nothing was ever received, so there is no timestamp to
                # measure from; we waited exactly the timeout.
                waited = STATUS_TIMEOUT
            else:
                waited = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % waited)
        return self._last_status

    def get_new_completed_visits(self) -> List[Tuple[int, bool]]:
        """
        Returns a list of all visit ids that have been processed since
        the last time the method was called and whether or not they
        have been interrupted.

        This method will return an empty list in case no visit ids have
        been processed since the last time this method was called
        """
        finished_visit_ids = list()
        while not self.completion_queue.empty():
            finished_visit_ids.append(self.completion_queue.get())
        return finished_visit_ids

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process

        The runner receives the (status, completion, shutdown) queue tuple
        as its first argument, followed by ``*args``. The first item the
        listener puts on the status queue is stored as the listener address.
        """
        args = ((self.status_queue, self.completion_queue,
                 self.shutdown_queue), ) + args
        self.listener_process = Process(target=listener_process_runner,
                                        args=args)
        self.listener_process.daemon = True
        self.listener_process.start()
        self.listener_address = self.status_queue.get()

    def shutdown(self, relaxed: bool = True):
        """ Terminate the aggregator listener process

        Sends ``(SHUTDOWN_SIGNAL, relaxed)`` on the shutdown queue and waits
        up to 300 seconds for the listener process to exit.
        """
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__)
        self.shutdown_queue.put((SHUTDOWN_SIGNAL, relaxed))
        start_time = time.time()
        self.listener_process.join(300)
        self.logger.debug("%s took %s seconds to close." %
                          (type(self).__name__, str(time.time() - start_time)))
        self.listener_address = None
        self.listener_process = None
class DataStreamSyncer:
    """Coordinates several DataStreamRecorders through one syncer worker.

    Commands for the worker travel over a private command queue; each
    recorder is additionally handed its own "ok" queue and the shared
    token queue.
    """

    def __init__(self, data_stream_recorders, frequency=0):
        """
        Instantiates a new DataStreamSyncer

        Parameters
        ----------
            data_stream_recorders : list of DataStreamRecorders to sync
            frequency : float, optional
                    Frequency in hz used for ratelimiting. If set to 0 or less, will not rate limit.
                    Defaults to 0.

        Raises
        ------
            ValueError
                    If two recorders share the same name.
        """
        self._cmds_q = Queue()
        self._tokens_q = Queue()
        self._data_stream_recorders = data_stream_recorders

        queues_by_name = {}
        for recorder in self._data_stream_recorders:
            recorder_q = Queue()
            recorder_name = recorder.name
            if recorder_name in queues_by_name:
                raise ValueError(
                    "Data Stream Recorders must have unique names! "
                    f"{recorder_name} is a duplicate!")
            queues_by_name[recorder_name] = recorder_q
            recorder._set_qs(recorder_q, self._tokens_q)

        self._syncer = _DataStreamSyncer(frequency, queues_by_name,
                                         self._cmds_q, self._tokens_q)
        self._syncer.start()

    def start(self):
        """Starts syncer operations by starting every recorder."""
        for rec in self._data_stream_recorders:
            rec._start_recording()

    def stop(self):
        """Stops syncer operations. Destroys syncer process."""
        self._cmds_q.put(("stop", ))
        for rec in self._data_stream_recorders:
            rec._stop()
        # Best effort: a failure to terminate the worker is ignored.
        try:
            self._syncer.terminate()
        except Exception:
            pass

    def pause(self):
        """Pauses the syncer and then every recorder."""
        self._cmds_q.put(("pause", ))
        for rec in self._data_stream_recorders:
            rec._pause()

    def resume(self, reset_time=False):
        """Resumes the syncer (forwarding ``reset_time``) and every recorder."""
        command = ("resume", reset_time)
        self._cmds_q.put(command)
        for rec in self._data_stream_recorders:
            rec._resume()

    def reset_time(self):
        """Sends the "reset_time" command to the syncer."""
        self._cmds_q.put(("reset_time", ))

    def flush(self):
        """Flushes every recorder and returns the results keyed by recorder name."""
        return {
            rec.name: rec._flush()
            for rec in self._data_stream_recorders
        }
logger = logging.getLogger('PMS') logger.setLevel(logging.DEBUG) formatter = logging.Formatter( '%(asctime)s %(name)s %(levelname)s %(message)s') fh = logging.handlers.TimedRotatingFileHandler(logfilename, 'W0', 1, 0) fh.suffix = "%Y%m%d-%H%M.log" fh.setFormatter(formatter) logger.addHandler(fh) consol = logging.StreamHandler() consol.setFormatter(formatter) logger.addHandler(consol) return logger logobj = get_logger('pms.log') logQueue = Queue() logobj.info('Starting scheduler...') scheduler = BackgroundScheduler(jobstores=jobstores, executors=executors, job_defaults=job_defaults, timezone=utc) scheduler.start() atexit.register(lambda: scheduler.shutdown(wait=True)) logobj.info('Starting TaskCore...') taskobj = task_core.TaskCore(scheduler, logQueue, logobj, TASK_PATH) app = Flask(__name__) app.config['SECRET_KEY'] = 'secret!' socketio = SocketIO(app, async_mode='threading')
class BaseAggregator(object):
    """Base class for the data aggregator interface.

    This class is used alongside the BaseListener class to spawn an
    aggregator process that combines data from multiple crawl processes.
    The BaseAggregator class manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.listener_address = None
        self.listener_process = None
        self.status_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status value and the time.time() at which it arrived;
        # both stay None until the first status update has been read.
        self._last_status = None
        self._last_status_received = None
        self.logger = logging.getLogger('openwpm')

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process

        Raises
        ------
        RuntimeError
            If no status update arrived within STATUS_TIMEOUT seconds.
        """
        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received))

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block.

        Raises
        ------
        RuntimeError
            If nothing arrives on the status queue within STATUS_TIMEOUT
            seconds.
        """
        try:
            self._last_status = self.status_queue.get(block=True,
                                                      timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            if self._last_status_received is None:
                # Nothing was ever received, so there is no timestamp to
                # measure from; we waited exactly the timeout.
                waited = STATUS_TIMEOUT
            else:
                waited = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % waited)
        return self._last_status

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process

        The runner is called with the manager params, the status queue and
        the shutdown queue, followed by ``*args``. The first item the
        listener puts on the status queue is stored as the listener address.
        """
        args = (self.manager_params, self.status_queue,
                self.shutdown_queue) + args
        self.listener_process = Process(target=listener_process_runner,
                                        args=args)
        self.listener_process.daemon = True
        self.listener_process.start()
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process

        Sends SHUTDOWN_SIGNAL on the shutdown queue and waits up to 300
        seconds for the listener process to exit.
        """
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__)
        self.shutdown_queue.put(SHUTDOWN_SIGNAL)
        start_time = time.time()
        self.listener_process.join(300)
        self.logger.debug("%s took %s seconds to close." %
                          (type(self).__name__, str(time.time() - start_time)))
        self.listener_address = None
        self.listener_process = None
def assign_multiprocess_ext(function, data, pool_args={}, **task_args):
    """Generator that feeds ``data`` items to a manager process and yields
    whatever arrives on the output queue.

    Parameters
    ----------
    function : callable
        NOTE(review): never referenced in the body below - presumably it
        was meant to be the task run on each item; confirm.
    data : list or iterator
        Items to submit. A list is wrapped in a generator first.
    pool_args : dict, optional
        Only the 'processes' key is read (default: cpu_count() - 1). The
        shared default dict is only read here, never mutated.
    **task_args
        NOTE(review): never referenced in the body below.

    Yields
    ------
    Values taken from the output queue, until the stop signal is received.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the nesting shown here is the most plausible
    reconstruction and should be checked against the original file.
    """
    from multiprocess import Queue, Process, cpu_count
    from Queue import Full, Empty
    from time import sleep
    process_count = pool_args.get('processes', cpu_count() - 1)
    # All three queues are bounded to process_count entries.
    input_pipe, output_pipe, control_pipe = (Queue(process_count),
                                             Queue(process_count),
                                             Queue(process_count))
    # Sentinel value sent through the control and output queues.
    stop_signal = hash('OK STOP NAO.')

    def multiprocessor(inpipe, outpipe, controlpipe):
        # Runs in a child process: starts one worker per True received on
        # the control queue.
        def returner_process(inp, outp, task):
            args, kwargs = inp.get()
            outpipe.put(task(*args, **kwargs))
            return True

        jobs = []
        while True:
            # NOTE(review): `jobs` holds the Process objects created below,
            # but ready()/successful()/get() look like pool-result methods;
            # verify that these calls can succeed.
            done = [x for x in jobs if x.ready()]
            if done:
                jobs = [x for x in jobs
                        if x not in done]  # Avoids race condition!
            else:
                sleep(0.1)
            for thing in done:
                thing.successful()
                assert thing.get()
            while len(jobs) < process_count:
                cmd = controlpipe.get()
                if cmd == stop_signal:
                    # NOTE(review): this only leaves the inner loop; as
                    # reconstructed, the outer `while True` has no exit.
                    break
                elif cmd == True:
                    # NOTE(review): returner_process declares three
                    # parameters but only two arguments are supplied here.
                    newjob = Process(target=returner_process,
                                     args=(inpipe, outpipe))
                    newjob.start()
                    jobs.append(newjob)
                    # I *think* the pipes have to be passed explicitly,
                    # but I haven't checked.
                else:
                    raise Exception
        outpipe.put(stop_signal)

    multiproc_proc = Process(target=multiprocessor,
                             args=(input_pipe, output_pipe, control_pipe))
    multiproc_proc.start()

    if isinstance(data, list):
        data = (x for x in data)
    # NOTE(review): next() on an empty `data` raises StopIteration here,
    # outside the try block below.
    nexttask = next(data)
    while True:
        try:
            # Submit the pending item and request one worker for it; a full
            # queue leaves `nexttask` in place for the next iteration.
            input_pipe.put_nowait(nexttask)
            control_pipe.put_nowait(True)
            nexttask = next(data)
        except Full:
            pass
        except StopIteration:
            break
        try:
            yield output_pipe.get_nowait()
        except Empty:
            sleep(0.1)

    # All items submitted: ask the manager to stop, then drain the output
    # queue until the stop signal comes back.
    control_pipe.put(stop_signal)
    while True:
        try:
            out = output_pipe.get()
            if out == stop_signal:
                break
            else:
                yield out
        except Empty:
            sleep(0.1)
    multiproc_proc.join()
def launch_browser_manager(self) -> bool:
    """
    Spawn the BrowserManager process and wait for its start-up reports.

    When a previous profile path is recorded, that profile is first
    archived into a temporary directory and handed to the new browser for
    crash recovery. Spawning is retried until it succeeds or the
    unsuccessful-spawn limit is reached.

    Returns True when the browser manager reported "READY", else False.
    """
    tempdir: Optional[str] = None
    recovering = self.current_profile_path is not None

    # if this is restarting from a crash, update the tar location
    # to be a tar of the crashed browser's history
    if recovering:
        # tar contents of crashed profile to a temp dir
        tempdir = tempfile.mkdtemp(prefix="openwpm_profile_archive_")
        tar_path = Path(tempdir) / "profile.tar"
        dump_profile(
            browser_profile_path=self.current_profile_path,
            tar_path=tar_path,
            compress=False,
            browser_params=self.browser_params,
        )
        # make sure browser loads crashed profile
        self.browser_params.recovery_tar = tar_path

    self.logger.info("BROWSER %i: Launching browser..." % self.browser_id)
    self.is_fresh = not recovering

    def read_stage(launch_status: Dict[str, bool]) -> Any:
        """Read one start-up report and return its payload, or raise."""
        assert self.status_queue is not None
        result = self.status_queue.get(True, self._SPAWN_TIMEOUT)
        kind = result[0]
        if kind == "STATUS":
            launch_status[result[1]] = True
            return result[2]
        if kind == "CRITICAL":
            _, exc, tb = pickle.loads(result[1])
            raise exc.with_traceback(tb)
        if kind == "FAILED":
            raise BrowserCrashError("Browser spawn returned failure status")

    stage_names = (
        "Profile Created",
        "Profile Tar",
        "Display",
        "Launch Attempted",
        "Browser Launched",
        "Browser Ready",
    )

    # Try to spawn the browser within the timelimit
    failed_spawns = 0
    spawned = False
    while not spawned and failed_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT:
        self.logger.debug(
            "BROWSER %i: Spawn attempt %i " % (self.browser_id, failed_spawns)
        )
        # Resets the command/status queues
        self.command_queue = Queue()
        self.status_queue = Queue()

        # builds and launches the browser_manager
        self.browser_manager = BrowserManager(
            self.command_queue,
            self.status_queue,
            self.browser_params,
            self.manager_params,
            recovering,
        )
        self.browser_manager.daemon = True
        self.browser_manager.start()

        # Read success status of browser manager
        launch_status: Dict[str, bool] = {}
        try:
            # 1. Browser profile created
            browser_profile_path = read_stage(launch_status)
            # 2. Profile tar loaded (if necessary)
            read_stage(launch_status)
            # 3. Display launched (if necessary)
            self.display_pid, self.display_port = read_stage(launch_status)
            # 4. Browser launch attempted
            read_stage(launch_status)
            # 5. Browser launched
            self.geckodriver_pid = read_stage(launch_status)

            ready = read_stage(launch_status)
            if ready != "READY":
                self.logger.error(
                    "BROWSER %i: Mismatch of status queue return values, "
                    "trying again..." % self.browser_id
                )
                failed_spawns += 1
                continue
            spawned = True
        except (EmptyQueue, BrowserCrashError):
            failed_spawns += 1
            # One " | <stage>: <reached> " fragment per start-up stage.
            error_string = "".join(
                " | %s: %s " % (stage, launch_status.get(stage, False))
                for stage in stage_names
            )
            self.logger.error(
                "BROWSER %i: Spawn unsuccessful %s"
                % (self.browser_id, error_string)
            )
            self.close_browser_manager()
            if "Profile Created" in launch_status:
                shutil.rmtree(browser_profile_path, ignore_errors=True)

    if not spawned:
        return spawned

    # The browser spawned successfully: update the current profile path
    # and clean up the previous profile path and the tempdir.
    self.logger.debug("BROWSER %i: Browser spawn successful!" % self.browser_id)
    previous_profile_path = self.current_profile_path
    self.current_profile_path = browser_profile_path
    if previous_profile_path is not None:
        shutil.rmtree(previous_profile_path, ignore_errors=True)
    if tempdir is not None:
        shutil.rmtree(tempdir, ignore_errors=True)
    return spawned
class StorageControllerHandle:
    """This class contains all methods relevant for the TaskManager
    to interact with the StorageController
    """

    def __init__(
        self,
        structured_storage: StructuredStorageProvider,
        unstructured_storage: Optional[UnstructuredStorageProvider],
    ) -> None:
        """Create the queues and the StorageController that shares them."""
        self.listener_address: Optional[Tuple[str, int]] = None
        self.listener_process: Optional[Process] = None
        self.status_queue = Queue()
        self.completion_queue = Queue()
        self.shutdown_queue = Queue()
        self._last_status = None
        self._last_status_received: Optional[float] = None
        self.logger = logging.getLogger("openwpm")
        # Holds the StorageController until launch() replaces it with the
        # Process that runs it.
        self.storage_controller = StorageController(
            structured_storage,
            unstructured_storage,
            status_queue=self.status_queue,
            completion_queue=self.completion_queue,
            shutdown_queue=self.shutdown_queue,
        )

    def get_next_visit_id(self) -> VisitId:
        """Generate visit id as randomly generated non-negative integer
        less than 2^53.

        Parquet can support integers up to 64 bits, but Javascript can only
        represent integers up to 53 bits:
        https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Number/MAX_SAFE_INTEGER
        Thus, we cap these values at 53 bits.
        """
        return VisitId(random.getrandbits(53))

    def get_next_browser_id(self) -> BrowserId:
        """Generate crawl id as randomly generated non-negative 32bit integer

        Note: Parquet's partitioned dataset reader only supports integer
        partition columns up to 32 bits.
        """
        return BrowserId(random.getrandbits(32))

    def save_configuration(
        self,
        manager_params: ManagerParamsInternal,
        browser_params: List[BrowserParamsInternal],
        openwpm_version: str,
        browser_version: str,
    ) -> None:
        """Store one "task" record and one "crawl" record per browser.

        Requires launch() to have been called so that the listener address
        is known. All records are written under INVALID_VISIT_ID, which is
        then finalized as successful.
        """
        assert self.listener_address is not None
        sock = DataSocket(self.listener_address)
        task_id = random.getrandbits(32)
        sock.store_record(
            TableName("task"),
            INVALID_VISIT_ID,
            {
                "task_id": task_id,
                "manager_params": manager_params.to_json(),
                "openwpm_version": openwpm_version,
                "browser_version": browser_version,
            },
        )
        # Record browser details for each browser
        for browser_param in browser_params:
            sock.store_record(
                TableName("crawl"),
                INVALID_VISIT_ID,
                {
                    "browser_id": browser_param.browser_id,
                    "task_id": task_id,
                    "browser_params": browser_param.to_json(),
                },
            )
        sock.finalize_visit_id(INVALID_VISIT_ID, success=True)

    def launch(self) -> None:
        """Starts the storage controller"""
        # From here on self.storage_controller is the Process wrapping the
        # original StorageController object.
        self.storage_controller = Process(
            name="StorageController",
            target=StorageController.run,
            args=(self.storage_controller,),
        )
        self.storage_controller.daemon = True
        self.storage_controller.start()

        # The first item on the status queue is taken as the listener address.
        self.listener_address = self.status_queue.get()

    def get_new_completed_visits(self) -> List[Tuple[int, bool]]:
        """
        Returns a list of all visit ids that have been processed since
        the last time the method was called and whether or not they
        ran successfully.

        This method will return an empty list in case no visit ids have
        been processed since the last time this method was called
        """
        finished_visit_ids = list()
        while not self.completion_queue.empty():
            finished_visit_ids.append(self.completion_queue.get())
        return finished_visit_ids

    def shutdown(self, relaxed: bool = True) -> None:
        """Terminate the storage controller process"""
        assert isinstance(self.storage_controller, Process)
        self.logger.debug("Sending the shutdown signal to the Storage Controller...")
        self.shutdown_queue.put((SHUTDOWN_SIGNAL, relaxed))
        start_time = time.time()
        # Wait at most 300 seconds for the process to exit.
        self.storage_controller.join(300)
        self.logger.debug(
            "%s took %s seconds to close."
            % (type(self).__name__, str(time.time() - start_time))
        )

    def get_most_recent_status(self) -> int:
        """Return the most recent queue size sent from the Storage Controller process

        Raises RuntimeError if the newest status is older than STATUS_TIMEOUT.
        """
        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from the storage controller process "
                "for %d seconds." % (time.time() - self._last_status_received)
            )

        return self._last_status

    def get_status(self) -> int:
        """Get listener process status. If the status queue is empty, block.

        Raises RuntimeError if nothing arrives within STATUS_TIMEOUT.
        """
        try:
            self._last_status = self.status_queue.get(
                block=True, timeout=STATUS_TIMEOUT
            )
            self._last_status_received = time.time()
        except queue.Empty:
            # If no status was ever received there is no timestamp to
            # subtract from; we then know we waited exactly the timeout.
            if self._last_status_received is None:
                silent_for = STATUS_TIMEOUT
            else:
                silent_for = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from the storage controller process "
                "for %d seconds." % silent_for
            )
        assert isinstance(self._last_status, int)
        return self._last_status
class MultiprocessorFitnessCaller:
    """
    Fitness caller used for multiprocessor parallelism.

    Locations are queued with add(), sent to the worker processes by
    evaluate(), and the workers are stopped by finish() (also called when
    leaving a ``with`` block).

    Arguments
    ---------
    num_workers : int
        Number of worker nodes to create.
    """

    def __init__(self, num_workers):
        self.num_workers = num_workers
        self.problem = None
        self.total_tasks = 0
        self.total_groups = 0
        self.max_group_size = 0
        self.tasks = []
        self.num_calls = 0
        self.max_nodes = 0
        self.min_num_calls = 0
        self.processes = []
        self.task_queue = Queue()
        self.result_queue = Queue()

    def __enter__(self):
        return self

    def set_problem(self, problem):
        """
        Sets the problem object to use to calculate the fitness and starts
        the worker processes.

        Arguments
        ---------
        problem
            Problem object implementing the fitness method.
        """
        self.problem = problem
        for _ in range(self.num_workers):
            worker = Process(
                target=multiprocessor_process,
                args=(problem, self.task_queue, self.result_queue),
            )
            worker.start()
            self.processes.append(worker)

    def add(self, location, userdata):
        """
        Add a location to be evaluated.

        Arguments
        ---------
        location : numpy array
            Location to be evaluated.
        userdata
            User data to be returned with the evaluation result.
        """
        self.tasks.append([location, userdata])

    def evaluate(self):
        """
        Evaluates all the locations.

        Blocks until one result per queued task has been read from the
        result queue, then clears the task list.

        Returns
        -------
        list of (location, value, userdata) tuples
            Tuples containing the location, value and corresponding user
            data, in the order the results arrived.
        """
        num_tasks = len(self.tasks)
        self.total_tasks += num_tasks
        self.total_groups += 1
        self.max_group_size = max(self.max_group_size, num_tasks)

        # Each task is sent as [index, location]; the index lets the result
        # be matched back to its task regardless of completion order.
        for index, task in enumerate(self.tasks):
            self.task_queue.put([index, task[0]])

        results = []
        for _ in range(num_tasks):
            result = self.result_queue.get()  # [index, value]
            index = result[0]
            value = result[1]
            task = self.tasks[index]
            results.append((task[0], value, task[1]))

        self.tasks = []
        return results

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.finish()

    def finish(self):
        """
        Terminates the fitness caller.

        Sends one [-1, -1] sentinel per worker and waits for every worker
        to exit. Safe to call more than once: after the workers have been
        joined the process list is cleared, so later calls do nothing.
        """
        # NOTE(review): presumably multiprocessor_process exits when it
        # reads the [-1, -1] sentinel - confirm against its definition.
        for _ in self.processes:
            self.task_queue.put([-1, -1])
        for worker in self.processes:
            worker.join()
        self.processes = []
def __init__(self, filament, **kwargs):
    """Set up logging, the kernel trace objects, outputs and parsers.

    Arguments read from ``kwargs``:
      cswitch      -- passed to ``enable_kflags`` (default False)
      enum_handles -- when true, system handles are queried up front
                      (default True)

    ``filament`` may be falsy; when given, it receives the event queue,
    the log file path and the constructed outputs.
    """
    self._start = datetime.now()

    # Log both to ~/.fibratus/fibratus.log and to stdout.
    try:
        home_dir = os.path.expanduser('~')
        log_path = os.path.join(home_dir, '.fibratus', 'fibratus.log')
        file_handler = FileHandler(log_path, mode='w+')
        file_handler.push_application()
        stdout_handler = StreamHandler(sys.stdout)
        stdout_handler.push_application()
    except PermissionError:
        panic("ERROR - Unable to open log file for writing due to permission error")

    self.logger = Logger(Fibratus.__name__)
    self._config = YamlConfig()
    self.logger.info('Starting...')

    cswitch_enabled = kwargs.pop('cswitch', False)
    should_enum_handles = kwargs.pop('enum_handles', True)

    # Kernel trace controller and its properties
    self.kcontroller = KTraceController()
    self.ktrace_props = KTraceProps()
    self.ktrace_props.enable_kflags(cswitch=cswitch_enabled)
    self.ktrace_props.logger_name = etw.KERNEL_LOGGER_NAME

    self.handle_repository = HandleRepository()
    self._handles = []
    # query for handles on the start of the kernel trace
    if should_enum_handles:
        self.logger.info('Enumerating system handles...')
        self._handles = self.handle_repository.query_handles()
        handle_count = len(self._handles)
        self.logger.info('%s handles found' % handle_count)
        self.handle_repository.free_buffers()

    self.thread_registry = ThreadRegistry(self.handle_repository, self._handles)

    logger_name_bytes = etw.KERNEL_LOGGER_NAME.encode()
    self.kevt_streamc = KEventStreamCollector(logger_name_bytes)

    # Register the image skips taken from the configuration, if any.
    image_skips = self._config.image_skips
    if len(image_skips) > 0:
        self.logger.info("Adding skips for images %s" % image_skips)
        for image in image_skips:
            self.kevt_streamc.add_skip(image)

    self.kevent = KEvent(self.thread_registry)
    self.keventq = Queue()

    # Output name -> output class
    self._output_classes = {
        'console': ConsoleOutput,
        'amqp': AmqpOutput,
        'smtp': SmtpOutput,
        'elasticsearch': ElasticsearchOutput,
    }
    self._outputs = self._construct_outputs()

    if filament:
        filament.keventq = self.keventq
        # The filament is handed the log file path, not a logger object.
        filament.logger = log_path
        filament.setup_adapters(self._outputs)
    self._filament = filament

    # Parsers and repositories that share the single kevent instance
    self.fsio = FsIO(self.kevent, self._handles)
    self.hive_parser = HiveParser(self.kevent, self.thread_registry)
    self.tcpip_parser = TcpIpParser(self.kevent)
    self.dll_repository = DllRepository(self.kevent)
    self.context_switch_registry = ContextSwitchRegistry(
        self.thread_registry, self.kevent
    )

    self.output_kevents = {}
    self.filters_count = 0
exc = traceback.format_exc() self.log_message("%s", exc.strip()) message = HTML_INTERNAL_SERVER_ERROR.format(error_message=exc) self.wfile.write(message.encode("utf8")) ### Logging if __name__ == '__main__': print('\n### Logging') from multiprocess import Queue HTTPD_MESSAGE_QUEUE = Queue() HTTPD_MESSAGE_QUEUE.put("I am another message") HTTPD_MESSAGE_QUEUE.put("I am one more message") from .bookutils import rich_output, terminal_escape def display_httpd_message(message): if rich_output(): display( HTML( '<pre style="background: NavajoWhite;">' + message + "</pre>")) else:
class BaseAggregator(object):
    """Base class for the data aggregator interface.

    This class is used alongside the BaseListener class to spawn an
    aggregator process that combines data from multiple crawl processes.
    The BaseAggregator class manages the child listener process.

    Parameters
    ----------
    manager_params : dict
        TaskManager configuration parameters
    browser_params : list of dict
        List of browser configuration dictionaries"""

    # NOTE(review): Python 2 style metaclass declaration. Python 3 does not
    # honour this attribute, so the abstract methods below would not be
    # enforced there - confirm the interpreter version this file targets.
    __metaclass__ = abc.ABCMeta

    def __init__(self, manager_params, browser_params):
        self.manager_params = manager_params
        self.browser_params = browser_params
        self.logger = loggingclient(*manager_params['logger_address'])
        self.listener_address = None
        self.listener_process = None
        self.status_queue = Queue()
        self.shutdown_queue = Queue()
        self._last_status = None
        self._last_status_received = None

    @abc.abstractmethod
    def save_configuration(self, openwpm_version, browser_version):
        """Save configuration details to the database"""

    @abc.abstractmethod
    def get_next_visit_id(self):
        """Return a unique visit ID to be used as a key for a single visit"""

    @abc.abstractmethod
    def get_next_crawl_id(self):
        """Return a unique crawl ID used as a key for a browser instance"""

    def get_most_recent_status(self):
        """Return the most recent queue size sent from the listener process

        Raises RuntimeError if the newest status is older than STATUS_TIMEOUT.
        """
        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % (time.time() - self._last_status_received)
            )

        return self._last_status

    def get_status(self):
        """Get listener process status. If the status queue is empty, block.

        Raises RuntimeError if nothing arrives within STATUS_TIMEOUT.
        """
        try:
            self._last_status = self.status_queue.get(
                block=True, timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            # If no status was ever received there is no timestamp to
            # subtract from; we then know we waited exactly the timeout.
            if self._last_status_received is None:
                silent_for = STATUS_TIMEOUT
            else:
                silent_for = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from DataAggregator listener process "
                "for %d seconds." % silent_for
            )
        return self._last_status

    def launch(self, listener_process_runner, *args):
        """Launch the aggregator listener process"""
        # The runner always receives the manager params and both queues
        # first, followed by any caller-supplied extra arguments.
        args = (self.manager_params, self.status_queue,
                self.shutdown_queue) + args
        self.listener_process = Process(
            target=listener_process_runner,
            args=args
        )
        self.listener_process.daemon = True
        self.listener_process.start()
        # The first item on the status queue is taken as the listener address.
        self.listener_address = self.status_queue.get()

    def shutdown(self):
        """ Terminate the aggregator listener process"""
        self.logger.debug(
            "Sending the shutdown signal to the %s listener process..." %
            type(self).__name__
        )
        self.shutdown_queue.put(SHUTDOWN_SIGNAL)
        start_time = time.time()
        # Wait at most 300 seconds for the listener to exit.
        self.listener_process.join(300)
        self.logger.debug(
            "%s took %s seconds to close." % (
                type(self).__name__,
                str(time.time() - start_time)
            )
        )
        self.listener_address = None
        self.listener_process = None