def __init__(self, path):
    """Read any existing config at ``path`` and take an exclusive lock on it.

    The file is first parsed as JSON; a missing or undecodable file yields an
    empty dict.  Up to 10 attempts are then made to open the file for writing
    and to acquire a non-blocking exclusive ``flock`` on it.  When the lock
    is held elsewhere and the old config contains a ``pid`` entry, that
    process is sent ``SIGTERM`` before the next attempt.

    Args:
        path: Location of the config file.

    Raises:
        AssertionError: If the lock could not be acquired in any attempt.
    """
    import json
    import signal

    self._path = path
    # Defined up front so the final check cannot trip over a missing
    # attribute when every locking attempt fails.
    self._locked = False

    try:
        with open(path) as f:
            old_data = json.load(f)
    except json.JSONDecodeError:
        logger.warning('Could not decode config')
        old_data = {}
    except OSError:
        logger.debug('No config file')
        old_data = {}

    for i in range(10):
        # NOTE: 'w+' truncates the file on open, i.e. before the lock is
        # held; the previous contents were already read into old_data.
        self._f = open(path, 'w+')
        try:
            fcntl.flock(self._f, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            # Do not leak one open handle per failed attempt.
            self._f.close()
            pid = old_data.get('pid')
            if pid:
                logger.info(
                    f'Config file is locked (try {i}). Killing previous instance {pid}'
                )
                os.kill(pid, signal.SIGTERM)
                # Give the signalled process a moment to exit and drop the lock.
                time.sleep(.05)
            else:
                logger.error('Config file is locked and no pid to kill')
        else:
            self._locked = True
            break

    if not self._locked:
        # Explicit raise instead of ``assert`` so the check survives
        # ``python -O``; the exception type is kept for existing callers.
        raise AssertionError(f'Could not lock config file {path}')
def tb_watcher(self):
    """Wait for the tensorboard subprocess to exit and log how it ended.

    Blocks on ``communicate()`` of the process stored in ``self.tb``, resets
    ``self.tb`` to ``None`` afterwards, and logs the return code: at debug
    level for exit code 0, otherwise as a warning that also includes the
    captured stdout and stderr.
    """
    proc = self.tb
    assert isinstance(proc, subprocess.Popen)

    # communicate() returns only once the process has terminated.
    captured_out, captured_err = proc.communicate()
    exit_code = proc.returncode
    self.tb = None

    summary = 'tensorboard on {} for {} returned with code {}'.format(
        self.tb_port, self.path, exit_code)
    if exit_code != 0:
        logger.warning(f'{summary}\n out: {captured_out}\n err: {captured_err}')
    else:
        logger.debug(summary)
    logger.debug('tb watcher finished')
def tensorboard(self):
    """Start tensorboard for ``self.path`` on demand, or refresh its heartbeat.

    Returns:
        dict: ``{'no_event_files': True}`` when no ``*.tfevents*`` file is
        found below ``self.path``.  Otherwise a dict with the keys ``host``,
        ``port``, ``new`` (``True`` when this call spawned the process),
        ``available`` and ``no_event_files=False``.
    """
    import os  # local import: only needed to build the glob pattern

    # Join with a path separator so the search also recurses when
    # ``self.path`` has no trailing slash.  Plain concatenation would turn
    # "logs" into "logs**/...", where ``**`` is no longer a path component of
    # its own and therefore behaves like a single ``*``.
    pattern = os.path.join(self.path, '**', '*.tfevents*')
    # iglob + any() stops at the first match instead of listing every file.
    has_event_files = any(glob.iglob(pattern, recursive=True))
    if not has_event_files:
        return dict(no_event_files=True)

    if self.tb:
        # A process handle is already stored: only refresh the heartbeat.
        self.tb_t = time()
        return dict(host=self.host,
                    port=self.tb_port,
                    new=False,
                    available=True,
                    no_event_files=False)

    self.tb_port = get_free(self.server.port_pool)
    cmds = [
        'tensorboard', '--logdir', "{}".format(self.path), '--host',
        '0.0.0.0', '--port', str(self.tb_port)
    ]
    logger.debug('Start tensorboard with: ' + ' '.join(cmds))
    self.tb = subprocess.Popen(cmds,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)
    # tb_watcher runs in a daemon thread alongside the subprocess.
    Thread(target=self.tb_watcher, daemon=True).start()

    # NOTE(review): after decoration ``check_tb`` is whatever ``repeat_until``
    # returns; it is used below as a truthy/falsy availability flag --
    # confirm against the decorator's implementation.
    @repeat_until(timeout=6.)
    def check_tb():
        try:
            url = "http://{}:{}".format(self.host, self.tb_port)
            # NOTE(review): no ``timeout`` is passed to requests.get here.
            r = requests.get(
                url)  # requests.head not supported by tensorboard
            available = r.status_code == 200
            sleep(.3)
            logger.debug('tb on {} status {}, {}'.format(
                url, r.status_code, r.reason))
            return available
        except requests.ConnectionError:
            return False

    if not check_tb:
        logger.warning('tb could not be started')
    self.tb_t = time()
    Thread(target=self.tb_killer, daemon=True).start()
    return dict(host=self.host,
                port=self.tb_port,
                new=True,
                available=check_tb,
                no_event_files=False)
def tensorboard(self):
    """Return connection info for tensorboard, spawning it when necessary.

    When ``self.tb`` is already set, only the heartbeat timestamp
    ``self.tb_t`` is refreshed.  Otherwise a ``tensorboard`` subprocess is
    started for ``self.path`` on a newly obtained port, watcher and killer
    threads are launched, and the server is polled over HTTP.

    Returns:
        dict: keys ``host``, ``port``, ``new`` (``True`` when this call
        started the process) and ``available``.
    """
    if self.tb:
        # Already running: this call just counts as a heartbeat.
        self.tb_t = time()
        return {
            'host': self.host,
            'port': self.tb_port,
            'new': False,
            'available': True,
        }

    self.tb_port = get_free_port(self.host)  # TODO: use self.host here?
    argv = [
        'tensorboard',
        '--logdir', "{}".format(self.path),
        '--host', '0.0.0.0',
        '--port', str(self.tb_port),
    ]
    print(' '.join(argv))
    self.tb = subprocess.Popen(
        argv,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    watcher_thread = Thread(target=self.tb_watcher, daemon=True)
    watcher_thread.start()

    @repeat_until(timeout=3.)
    def check_tb():
        url = "http://{}:{}".format(self.host, self.tb_port)
        try:
            # requests.head not supported by tensorboard
            r = requests.get(url)
        except requests.ConnectionError:
            return False
        sleep(.3)
        logger.debug('tb on {} status {}, {}'.format(
            url, r.status_code, r.reason))
        return r.status_code == 200

    if not check_tb:
        logger.warning('tb could not be started')

    self.tb_t = time()
    killer_thread = Thread(target=self.tb_killer, daemon=True)
    killer_thread.start()
    return {
        'host': self.host,
        'port': self.tb_port,
        'new': True,
        'available': check_tb,
    }
def tb_watcher(self):
    """Block until the tensorboard subprocess ends, then report its result.

    Collects the process output through ``communicate()``, clears
    ``self.tb``, and logs the return code: at debug level for exit code 0,
    otherwise as a warning followed by the captured stdout and stderr.
    """
    proc = self.tb
    assert isinstance(proc, subprocess.Popen)

    # communicate() returns only once the process has terminated.
    captured_out, captured_err = proc.communicate()
    exit_code = proc.returncode
    self.tb = None

    summary = 'tensorboard on {} for {} returned with code {}'.format(
        self.tb_port, self.path, exit_code)
    if exit_code != 0:
        logger.warning(summary)
        logger.warning('out: ' + captured_out)
        logger.warning('err: ' + captured_err)
    else:
        logger.debug(summary)
    print('watcher finish')