def run(self):
    """Run the agent loop until interrupted, then shut down the child runs.

    Registers this host as an agent and then loops forever:

    1. Reads a batch of commands from ``self._queue`` through
       ``util.read_many_from_queue`` (called with 100 and
       ``self.POLL_INTERVAL``) and puts each command's result on that
       command's ``resp_queue``.
    2. Polls every tracked run process; processes that are still alive are
       reported as running, the others are dropped from
       ``self._run_processes``.
    3. Sends a heartbeat with the running-run map and processes every
       command the server returns, collecting the results in
       ``self._server_responses``.

    Shutdown: the first Ctrl-C makes the agent wait for the runs to end; a
    second Ctrl-C stops that wait. The ``finally`` clause then always
    terminates and waits for the tracked runs; a Ctrl-C during that phase
    kills every tracked run process.
    """
    # TODO: include sweep ID
    agent = self._api.register_agent(socket.gethostname(), True,
                                     sweep_id=self._sweep_id)
    agent_id = agent['id']

    try:
        while True:
            commands = util.read_many_from_queue(self._queue, 100,
                                                 self.POLL_INTERVAL)
            for command in commands:
                command['resp_queue'].put(self._process_command(command))

            logger.info('Running runs: %s',
                        list(self._run_processes.keys()))
            run_status = {}
            # Iterate over a snapshot because dead runs are deleted from the
            # dict inside the loop.
            for run_id, run_process in list(
                    six.iteritems(self._run_processes)):
                if run_process.poll() is None:
                    run_status[run_id] = True
                else:
                    logger.info('Cleaning up dead run: %s', run_id)
                    del self._run_processes[run_id]

            commands = self._api.agent_heartbeat(agent_id, {}, run_status)

            # TODO: send _server_responses
            self._server_responses = []
            for command in commands:
                self._server_responses.append(
                    self._process_command(command))
    except KeyboardInterrupt:
        try:
            wandb.termlog(
                'Ctrl-c pressed. Waiting for runs to end. Press ctrl-c again to terminate them.'
            )
            for _, run_process in six.iteritems(self._run_processes):
                run_process.wait()
        except KeyboardInterrupt:
            pass
    finally:
        try:
            wandb.termlog(
                'Terminating and syncing runs. Press ctrl-c to kill.')
            for _, run_process in six.iteritems(self._run_processes):
                try:
                    run_process.terminate()
                except OSError:
                    pass  # if process is already dead
            for _, run_process in six.iteritems(self._run_processes):
                run_process.wait()
        except KeyboardInterrupt:
            wandb.termlog('Killing runs and quitting.')
            # Kill every tracked run. Previously only the leftover loop
            # variable was killed, which missed all other runs and raised
            # UnboundLocalError when no run had been iterated at all.
            for _, run_process in six.iteritems(self._run_processes):
                try:
                    run_process.kill()
                except OSError:
                    pass  # if process is already dead
def _read_queue(self):
    """Pull the next batch of queued items for the push thread.

    This runs on the push thread (``_thread_body``). The initial read may
    block for up to ``rate_limit_seconds``; afterwards as much as possible is
    drained from the queue. The HTTP post to the server also happens inside
    ``_thread_body`` and can outlast the rate limit, so on the next pass we
    want everything that piled up in the meantime.

    When more than ``MAX_ITEMS_PER_PUSH`` items are waiting, the push thread
    falls behind and data keeps buffering in the queue.
    """
    read_batch = util.read_many_from_queue
    pending_items = self._queue
    batch_limit = self.MAX_ITEMS_PER_PUSH
    max_wait = self.rate_limit_seconds()
    return read_batch(pending_items, batch_limit, max_wait)
def run(self):
    """Drive the sweep agent loop and clean up its run processes on exit.

    Before looping, the sweep's ``config`` YAML is loaded and, when it
    carries a non-empty list under ``command``, that list is stored in
    ``self._sweep_command``. The host is then registered as an agent.

    While ``self._running`` is true, each iteration:

    * answers the commands read from ``self._queue`` on their ``resp_queue``;
    * logs the tracked run ids when a report is due;
    * polls every tracked run process. Live ones are reported as running;
      a positive integer exit status bumps ``self._failed`` and, if
      ``is_flapping()`` says so, stops the loop right away (that run stays
      tracked); every other finished run is removed and counted in
      ``self._finished``;
    * stops once ``self._count`` finished runs are reached, otherwise sends
      a heartbeat and processes the commands the server returns.

    Ctrl-C first waits for the runs, a second Ctrl-C skips the wait. The
    ``finally`` clause always terminates and waits for the tracked runs and
    kills them all if interrupted again.
    """
    # TODO: catch exceptions, handle errors, show validation warnings, and make more generic
    sweep = self._api.sweep(self._sweep_id, "{}")
    config_text = sweep.get('config') if sweep else None
    parsed_config = yaml.safe_load(config_text) if config_text else None
    configured_command = (
        parsed_config.get('command') if parsed_config else None)
    if configured_command and isinstance(configured_command, list):
        self._sweep_command = configured_command

    # TODO: include sweep ID
    registration = self._api.register_agent(socket.gethostname(),
                                            sweep_id=self._sweep_id)
    agent_id = registration['id']

    try:
        while self._running:
            # Local requests: reply on the queue supplied with each one.
            for request in util.read_many_from_queue(
                    self._queue, 100, self.POLL_INTERVAL):
                request['resp_queue'].put(self._process_command(request))

            # Periodic status line; a None timestamp forces a report.
            now = util.stopwatch_now()
            if self._last_report_time is None:
                report_due = True
            else:
                report_due = (
                    self._report_interval != 0
                    and now > self._last_report_time + self._report_interval)
            if report_due:
                logger.info('Running runs: %s',
                            list(self._run_processes.keys()))
                self._last_report_time = now

            run_status = {}
            # Snapshot the items: finished runs are deleted while looping.
            for run_id, proc in list(six.iteritems(self._run_processes)):
                exit_code = proc.poll()
                if exit_code is None:
                    run_status[run_id] = True
                    continue

                exited_with_error = (
                    isinstance(exit_code, int)
                    and not isinstance(exit_code, bool)
                    and exit_code > 0)
                if exited_with_error:
                    self._failed += 1
                    if self.is_flapping():
                        logger.error(
                            "Detected %i failed runs in the first %i seconds, shutting down.",
                            self.FLAPPING_MAX_FAILURES,
                            self.FLAPPING_MAX_SECONDS)
                        logger.info(
                            "To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true"
                        )
                        self._running = False
                        break

                logger.info('Cleaning up finished run: %s', run_id)
                del self._run_processes[run_id]
                self._last_report_time = None
                self._finished += 1

            reached_count = self._count and self._finished >= self._count
            if reached_count or not self._running:
                self._running = False
                continue

            server_commands = self._api.agent_heartbeat(
                agent_id, {}, run_status)

            # TODO: send _server_responses
            self._server_responses = []
            for server_command in server_commands:
                self._server_responses.append(
                    self._process_command(server_command))
    except KeyboardInterrupt:
        try:
            wandb.termlog(
                'Ctrl-c pressed. Waiting for runs to end. Press ctrl-c again to terminate them.'
            )
            for _, proc in six.iteritems(self._run_processes):
                proc.wait()
        except KeyboardInterrupt:
            pass
    finally:
        try:
            if not self._in_jupyter:
                wandb.termlog(
                    'Terminating and syncing runs. Press ctrl-c to kill.')
            for _, proc in six.iteritems(self._run_processes):
                try:
                    proc.terminate()
                except OSError:
                    pass  # if process is already dead
            for _, proc in six.iteritems(self._run_processes):
                proc.wait()
        except KeyboardInterrupt:
            wandb.termlog('Killing runs and quitting.')
            for _, proc in six.iteritems(self._run_processes):
                try:
                    proc.kill()
                except OSError:
                    pass  # if process is already dead
if sweep_obj := self._api.sweep(self._sweep_id, "{}"): if sweep_yaml := sweep_obj.get("config"): if sweep_config := yaml.safe_load(sweep_yaml): sweep_command = sweep_config.get("command") if sweep_command and isinstance(sweep_command, list): self._sweep_command = sweep_command # TODO: include sweep ID agent = self._api.register_agent(socket.gethostname(), sweep_id=self._sweep_id) agent_id = agent["id"] try: while self._running: commands = util.read_many_from_queue(self._queue, 100, self.POLL_INTERVAL) for command in commands: command["resp_queue"].put(self._process_command(command)) now = util.stopwatch_now() if self._last_report_time is None or ( self._report_interval != 0 and now > self._last_report_time + self._report_interval): logger.info("Running runs: %s", list(self._run_processes.keys())) self._last_report_time = now run_status = {} for run_id, run_process in list( six.iteritems(self._run_processes)): poll_result = run_process.poll() if poll_result is None:
def run(self):
    """Run the agent loop until stopped, then shut down the child runs.

    After registering this host as an agent, the loop runs while
    ``self._running`` is true. Each pass answers the commands read from
    ``self._queue`` on their ``resp_queue``, logs the tracked run ids when a
    report is due, and polls the tracked run processes: live ones are
    reported as running, finished ones are removed and counted in
    ``self._finished``. Once ``self._count`` is set and that many runs have
    finished the loop stops; otherwise a heartbeat is sent and the commands
    returned by the server are processed into ``self._server_responses``.

    Ctrl-C first waits for the runs to end; a second Ctrl-C skips the wait.
    The ``finally`` clause always terminates and waits for the tracked runs,
    and kills all of them if interrupted once more.
    """
    # TODO: include sweep ID
    registration = self._api.register_agent(socket.gethostname(),
                                            sweep_id=self._sweep_id)
    agent_id = registration['id']

    try:
        while self._running:
            for request in util.read_many_from_queue(
                    self._queue, 100, self.POLL_INTERVAL):
                request['resp_queue'].put(self._process_command(request))

            # A None timestamp forces a report; an interval of 0 disables
            # the periodic ones.
            now = util.stopwatch_now()
            if self._last_report_time is None:
                report_due = True
            else:
                report_due = (
                    self._report_interval != 0
                    and now > self._last_report_time + self._report_interval)
            if report_due:
                logger.info('Running runs: %s',
                            list(self._run_processes.keys()))
                self._last_report_time = now

            run_status = {}
            # Snapshot the items: finished runs are deleted while looping.
            for run_id, proc in list(six.iteritems(self._run_processes)):
                if proc.poll() is None:
                    run_status[run_id] = True
                    continue
                logger.info('Cleaning up finished run: %s', run_id)
                del self._run_processes[run_id]
                self._last_report_time = None
                self._finished += 1

            if self._count and self._finished >= self._count:
                self._running = False
                continue

            server_commands = self._api.agent_heartbeat(
                agent_id, {}, run_status)

            # TODO: send _server_responses
            self._server_responses = []
            for server_command in server_commands:
                self._server_responses.append(
                    self._process_command(server_command))
    except KeyboardInterrupt:
        try:
            wandb.termlog(
                'Ctrl-c pressed. Waiting for runs to end. Press ctrl-c again to terminate them.'
            )
            for _, proc in six.iteritems(self._run_processes):
                proc.wait()
        except KeyboardInterrupt:
            pass
    finally:
        try:
            if not self._in_jupyter:
                wandb.termlog(
                    'Terminating and syncing runs. Press ctrl-c to kill.')
            for _, proc in six.iteritems(self._run_processes):
                try:
                    proc.terminate()
                except OSError:
                    pass  # if process is already dead
            for _, proc in six.iteritems(self._run_processes):
                proc.wait()
        except KeyboardInterrupt:
            wandb.termlog('Killing runs and quitting.')
            for _, proc in six.iteritems(self._run_processes):
                try:
                    proc.kill()
                except OSError:
                    pass  # if process is already dead