def _command_stop(self, command):
    """Ask the run named by ``command['run_id']`` to stop, escalating on repeats.

    The first request for a tracked run stores the current stopwatch value
    in the process's ``last_sigterm_time`` and calls ``terminate()``.  A
    later request calls ``kill()``, but only once the stopwatch has moved
    more than ``self._kill_delay`` past that stored value; a request that
    arrives sooner does nothing.  A run id that is not tracked in
    ``self._run_processes`` is logged as an error.

    Args:
        command: mapping that carries the target run under ``'run_id'``.
    """
    run_id = command['run_id']

    # Unknown run: report it and bail out before touching the stopwatch.
    if run_id not in self._run_processes:
        logger.error('Run %s not running', run_id)
        return

    process = self._run_processes[run_id]
    current_time = util.stopwatch_now()

    if process.last_sigterm_time is None:
        # First stop request: remember when we asked, then ask politely.
        process.last_sigterm_time = current_time
        log_format, send_signal = 'Stop: %s', process.terminate
    elif current_time > process.last_sigterm_time + self._kill_delay:
        # The grace period has elapsed since the first request: escalate.
        log_format, send_signal = 'Kill: %s', process.kill
    else:
        # Still inside the grace period; nothing to do yet.
        return

    logger.info(log_format, run_id)
    try:
        send_signal()
    except OSError:
        # The process is already dead.
        pass
if sweep_command and isinstance(sweep_command, list): self._sweep_command = sweep_command # TODO: include sweep ID agent = self._api.register_agent(socket.gethostname(), sweep_id=self._sweep_id) agent_id = agent["id"] try: while self._running: commands = util.read_many_from_queue(self._queue, 100, self.POLL_INTERVAL) for command in commands: command["resp_queue"].put(self._process_command(command)) now = util.stopwatch_now() if self._last_report_time is None or ( self._report_interval != 0 and now > self._last_report_time + self._report_interval): logger.info("Running runs: %s", list(self._run_processes.keys())) self._last_report_time = now run_status = {} for run_id, run_process in list( six.iteritems(self._run_processes)): poll_result = run_process.poll() if poll_result is None: run_status[run_id] = True continue elif (not isinstance(poll_result, bool) and isinstance(poll_result, int)
def run(self):
    """Register the agent and supervise its runs until told to stop.

    Before the main loop, the sweep's YAML ``config`` is loaded and, when
    it holds a non-empty list under ``command``, that list is stored in
    ``self._sweep_command``.

    Each pass of the main loop, while ``self._running`` is true:

    1. answers up to 100 queued commands through their ``resp_queue``;
    2. logs the tracked run ids when a report is due;
    3. polls every tracked process, marking live ones in the heartbeat
       status and dropping finished ones (a positive integer exit code
       counts as a failure and may trip the flapping check);
    4. stops when ``self._count`` runs have finished or the flapping
       check fired, otherwise sends a heartbeat and processes the
       commands it returns.

    A ctrl-c waits for the runs to end (a second ctrl-c stops waiting).
    On the way out every run is terminated and waited for; a ctrl-c
    during that phase kills the runs instead.
    """
    # TODO: catch exceptions, handle errors, show validation warnings, and make more generic
    sweep_obj = self._api.sweep(self._sweep_id, "{}")
    # Each step is attempted only when the previous one gave a truthy value.
    sweep_yaml = sweep_obj.get('config') if sweep_obj else None
    sweep_config = yaml.safe_load(sweep_yaml) if sweep_yaml else None
    sweep_command = sweep_config.get('command') if sweep_config else None
    if sweep_command and isinstance(sweep_command, list):
        self._sweep_command = sweep_command

    # TODO: include sweep ID
    hostname = socket.gethostname()
    agent_id = self._api.register_agent(hostname, sweep_id=self._sweep_id)['id']

    try:
        while self._running:
            # Answer locally queued commands first.
            queued = util.read_many_from_queue(
                self._queue, 100, self.POLL_INTERVAL)
            for request in queued:
                request['resp_queue'].put(self._process_command(request))

            # Periodic status line; a report interval of 0 disables the
            # periodic part, but a reset (None) timestamp always reports.
            now = util.stopwatch_now()
            report_due = self._last_report_time is None or (
                self._report_interval != 0
                and now > self._last_report_time + self._report_interval)
            if report_due:
                logger.info('Running runs: %s',
                            list(self._run_processes.keys()))
                self._last_report_time = now

            run_status = {}
            # Iterate over a snapshot because finished runs are deleted
            # from the mapping inside the loop.
            for run_id, proc in list(six.iteritems(self._run_processes)):
                exit_code = proc.poll()
                if exit_code is None:
                    # Still running: include it in the heartbeat.
                    run_status[run_id] = True
                    continue

                # A positive int (bools excluded) marks a failed run.
                run_failed = (isinstance(exit_code, int)
                              and not isinstance(exit_code, bool)
                              and exit_code > 0)
                if run_failed:
                    self._failed += 1
                    if self.is_flapping():
                        logger.error(
                            "Detected %i failed runs in the first %i seconds, shutting down.",
                            self.FLAPPING_MAX_FAILURES,
                            self.FLAPPING_MAX_SECONDS)
                        logger.info(
                            "To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true")
                        self._running = False
                        # Leave immediately; this run is neither cleaned
                        # up nor counted as finished.
                        break

                logger.info('Cleaning up finished run: %s', run_id)
                del self._run_processes[run_id]
                # Force a fresh "Running runs" report on the next pass.
                self._last_report_time = None
                self._finished += 1

            # Either the requested number of runs has finished or the
            # flapping check asked us to stop: skip the heartbeat and let
            # the loop condition end the loop.
            if (self._count and self._finished >= self._count) or not self._running:
                self._running = False
                continue

            server_commands = self._api.agent_heartbeat(
                agent_id, {}, run_status)

            # TODO: send _server_responses
            self._server_responses = []
            for request in server_commands:
                self._server_responses.append(self._process_command(request))
    except KeyboardInterrupt:
        try:
            wandb.termlog(
                'Ctrl-c pressed. Waiting for runs to end. Press ctrl-c again to terminate them.')
            for proc in six.itervalues(self._run_processes):
                proc.wait()
        except KeyboardInterrupt:
            # Second ctrl-c: stop waiting and fall through to the cleanup.
            pass
    finally:
        try:
            if not self._in_jupyter:
                wandb.termlog(
                    'Terminating and syncing runs. Press ctrl-c to kill.')
            for proc in six.itervalues(self._run_processes):
                try:
                    proc.terminate()
                except OSError:
                    pass  # if process is already dead
            for proc in six.itervalues(self._run_processes):
                proc.wait()
        except KeyboardInterrupt:
            wandb.termlog('Killing runs and quitting.')
            for proc in six.itervalues(self._run_processes):
                try:
                    proc.kill()
                except OSError:
                    pass  # if process is already dead
def run(self):
    """Register the agent and supervise its runs until told to stop.

    Each pass of the main loop, while ``self._running`` is true:

    1. answers up to 100 queued commands through their ``resp_queue``;
    2. logs the tracked run ids when a report is due;
    3. polls every tracked process, marking live ones in the heartbeat
       status and dropping the ones that have exited;
    4. sends a heartbeat and processes the commands it returns.

    A ctrl-c waits for the runs to end (a second ctrl-c stops waiting).
    On the way out every run is terminated and waited for; a ctrl-c
    during that phase kills every remaining run instead.
    """
    # TODO: include sweep ID
    agent = self._api.register_agent(
        socket.gethostname(), sweep_id=self._sweep_id)
    agent_id = agent['id']

    try:
        while self._running:
            # Answer locally queued commands first.
            commands = util.read_many_from_queue(
                self._queue, 100, self.POLL_INTERVAL)
            for command in commands:
                command['resp_queue'].put(self._process_command(command))

            # Periodic status line; a report interval of 0 disables the
            # periodic part, but a reset (None) timestamp always reports.
            now = util.stopwatch_now()
            if self._last_report_time is None or (
                    self._report_interval != 0
                    and now > self._last_report_time + self._report_interval):
                logger.info('Running runs: %s',
                            list(self._run_processes.keys()))
                self._last_report_time = now

            run_status = {}
            # Iterate over a snapshot because dead runs are deleted from
            # the mapping inside the loop.
            for run_id, run_process in list(six.iteritems(self._run_processes)):
                if run_process.poll() is None:
                    run_status[run_id] = True
                else:
                    logger.info('Cleaning up dead run: %s', run_id)
                    del self._run_processes[run_id]
                    # Force a fresh "Running runs" report on the next pass.
                    self._last_report_time = None

            commands = self._api.agent_heartbeat(agent_id, {}, run_status)

            # TODO: send _server_responses
            self._server_responses = []
            for command in commands:
                self._server_responses.append(
                    self._process_command(command))
    except KeyboardInterrupt:
        try:
            wandb.termlog(
                'Ctrl-c pressed. Waiting for runs to end. Press ctrl-c again to terminate them.')
            for run_process in six.itervalues(self._run_processes):
                run_process.wait()
        except KeyboardInterrupt:
            # Second ctrl-c: stop waiting and fall through to the cleanup.
            pass
    finally:
        try:
            if not self._in_jupyter:
                wandb.termlog(
                    'Terminating and syncing runs. Press ctrl-c to kill.')
            for run_process in six.itervalues(self._run_processes):
                try:
                    run_process.terminate()
                except OSError:
                    pass  # if process is already dead
            for run_process in six.itervalues(self._run_processes):
                run_process.wait()
        except KeyboardInterrupt:
            wandb.termlog('Killing runs and quitting.')
            # Kill every tracked process.  Relying on a loop variable left
            # over from an earlier loop would reach only one process and
            # would be unbound when no runs were tracked.
            for run_process in six.itervalues(self._run_processes):
                try:
                    run_process.kill()
                except OSError:
                    pass  # if process is already dead