def __load_configs(self, path: Path):
    """Load and merge configuration from *path* (no-op when the path is unchanged)."""
    if path == self.__current_path:
        return
    self.__current_path = path

    config_files = self.__load_config_files(path)
    # Only nag about a missing .labml.yaml outside hosted notebooks
    if not config_files and not is_colab() and not is_kaggle():
        labml_notice([
            (".labml.yaml", Text.value),
            " config file could not be found. Looking in path: ",
            (str(path), Text.meta)
        ])

    # `path` may point at a file; walk up to the nearest existing directory
    while path.exists() and not path.is_dir():
        path = path.parent

    for cfg in config_files:
        self.__merge_configs(cfg)
    for cfg in self.custom_configs:
        self.__merge_configs(cfg)

    # No config file supplied a project path: fall back to the resolved directory
    if not config_files and self.configs['path'] is None:
        self.configs['path'] = str(path)

    self.__update_configs()
def _send(self, data: List[Dict[str, Any]]) -> Dict:
    """POST *data* as a JSON array to ``self.url`` and return the parsed reply.

    Server-reported errors raise ``RuntimeError`` after notifying the user;
    warnings are only displayed.

    Fixes: the annotation used the builtin ``any`` instead of ``typing.Any``;
    the HTTP response was never closed; dead ``+ ''`` concatenation removed.
    """
    req = urllib.request.Request(self.url)
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    data_json = json.dumps(data).encode('utf-8')
    req.add_header('Content-Length', str(len(data_json)))

    # `with` guarantees the response is closed even if the read fails
    with urllib.request.urlopen(req, data_json, timeout=self.timeout_seconds) as response:
        content = response.read().decode('utf-8')
    result = json.loads(content)

    for e in result.get('errors', []):
        if 'error' in e:
            labml_notice([
                'LabML App Error: ',
                (e['error'], Text.key),
                '\n',
                (e['message'], Text.value)
            ], is_danger=True)
            self.errored = True
            raise RuntimeError('LabML App Error', e)
        elif 'warning' in e:
            labml_notice([
                'LabML App Warning: ',
                (e['warning'] + ': ', Text.key),
                (e['message'], Text.value)
            ])
        else:
            # Unrecognised entry in the errors list: treat as fatal
            self.errored = True
            raise RuntimeError('Unknown error from LabML App', e)

    return result
def save(self, global_step):
    """
    ## Save model as a set of numpy arrays

    Creates ``self.path/<global_step>/``, asks every registered saver to
    write its model there, and records each saver's return value in an
    ``info.json`` header keyed by model name.
    """
    if not self.model_savers:
        # Warn only once; later calls silently skip
        if not self.__no_savers_warned:
            labml_notice(["No models were registered for saving\n",
                          "You can register models with ",
                          ('experiment.add_pytorch_models', Text.value)])
            self.__no_savers_warned = True
        return

    checkpoints_path = pathlib.Path(self.path)
    # parents/exist_ok make this robust to missing parent directories and to
    # the check-then-create race of the original `if not exists(): mkdir()`
    checkpoints_path.mkdir(parents=True, exist_ok=True)

    checkpoint_path = checkpoints_path / str(global_step)
    # A checkpoint for this step must not already exist — never overwrite
    assert not checkpoint_path.exists()
    checkpoint_path.mkdir()

    info = {}
    for name, saver in self.model_savers.items():
        info[name] = saver.save(checkpoint_path)

    # Save header
    with open(str(checkpoint_path / "info.json"), "w") as f:
        f.write(json.dumps(info))
def load_models(self, *,
                models: List[str],
                run_uuid: Optional[str] = None,
                checkpoint: Optional[int] = None):
    """Restore the named *models* from a saved run checkpoint.

    ``checkpoint`` defaults to ``-1`` (the latest); shows a notice and
    returns without loading when no checkpoint can be located.
    """
    step = -1 if checkpoint is None else checkpoint
    checkpoint_path, global_step = experiment_run.get_run_checkpoint(run_uuid, step)

    if global_step is None:
        labml_notice(['Could not find saved checkpoint'], is_danger=True)
        return

    with monit.section("Loading checkpoint"):
        self.checkpoint_saver.load(checkpoint_path, models)
def _process(self, packets: List[Packet]) -> bool:
    """Send the payload of every packet in one request; hand the response to
    the first handler that accepts it. Returns False on any network failure."""
    if not packets:
        return True

    try:
        response = self._send([pkt.data for pkt in packets])
    except urllib.error.HTTPError as err:
        labml_notice([
            f'Failed to send to {self.url}: ',
            (str(err.code), Text.value),
            '\n' + str(err.reason)
        ])
        return False
    except urllib.error.URLError as err:
        labml_notice([f'Failed to connect to {self.url}\n', str(err.reason)])
        return False
    except socket.timeout as err:
        labml_notice([f'{self.url} timeout\n', str(err)])
        return False
    except ConnectionResetError as err:
        labml_notice([f'Connection reset by LabML App server {self.url}\n', str(err)])
        return False

    for handler in self.handlers:
        if handler.handle(response):
            break

    return True
def __init__(self, session_uuid: str):
    """Set up API writers for a monitoring session and probe for GPU support."""
    computer = computer_singleton()
    api_caller = ApiCaller(computer.web_api.url,
                           {'computer_uuid': computer.uuid,
                            'session_uuid': session_uuid},
                           timeout_seconds=15,
                           daemon=True)
    self.writer = Writer(api_caller, frequency=computer.web_api.frequency)
    self.header = Header(api_caller,
                         frequency=computer.web_api.frequency,
                         open_browser=computer.web_api.open_browser)

    self.data = {}
    self.cache = {}
    self.nvml = None
    self.n_gpu = 0

    # GPU telemetry is optional: keep nvml as None when py3nvml is absent
    try:
        from py3nvml import py3nvml as nvml
        self.nvml = nvml
    except ImportError:
        labml_notice('Install py3nvml to monitor GPUs:\n pip install py3nvml',
                     is_warn=False)
def __init__(self):
    """Initialise monitor state and verify that NVML (GPU telemetry) is usable."""
    self.data = {}
    self.cache = {}
    self.nvml = None
    self.n_gpu = 0

    # py3nvml is optional; leave nvml as None when it isn't installed
    try:
        from py3nvml import py3nvml as nvml
        self.nvml = nvml
    except ImportError:
        labml_notice('Install py3nvml to monitor GPUs:\n pip install py3nvml',
                     is_warn=False)

    if self.nvml:
        # A quick init/shutdown round-trip confirms the native NVML library
        # is actually present — the Python bindings alone are not enough.
        try:
            self.nvml.nvmlInit()
            self.nvml.nvmlShutdown()
        except self.nvml.NVMLError:
            logger.log('NVML Library not found', Text.warning)
            self.nvml = None

    self.process_monitor = ProcessMonitor(self.nvml)
def load(self, checkpoint_path: pathlib.Path, models: Optional[List[str]] = None):
    """
    ## Load model as a set of numpy arrays

    Reads the ``info.json`` header in *checkpoint_path* and asks each
    registered saver to load its model. ``models`` limits which models are
    loaded; by default all registered models are.

    Fix: the ``models`` annotation was ``List[str] = None`` — a ``None``
    default requires ``Optional[List[str]]``.
    """
    if not self.model_savers:
        # Warn only once; later calls silently skip
        if not self.__no_savers_warned:
            labml_notice([
                "No models were registered for loading or saving\n",
                "You can register models with ",
                ('experiment.add_pytorch_models', Text.value)
            ])
            self.__no_savers_warned = True
        return

    if not models:
        models = list(self.model_savers.keys())

    with open(str(checkpoint_path / "info.json"), "r") as f:
        info = json.loads(f.readline())

    # Partition: requested-and-present, requested-but-absent, present-but-unrequested
    to_load = []
    not_loaded = []
    missing = []
    for name in models:
        if name not in info:
            missing.append(name)
        else:
            to_load.append(name)
    for name in info:
        if name not in models:
            not_loaded.append(name)

    # Load each model
    for name in to_load:
        saver = self.model_savers[name]
        saver.load(checkpoint_path, info[name])

    if missing:
        labml_notice([(f'{missing} ', Text.highlight),
                      ('model(s) could not be found.\n'),
                      (f'{to_load} ', Text.none),
                      ('models were loaded.', Text.none)], is_danger=True)
    if not_loaded:
        labml_notice([(f'{not_loaded} ', Text.none),
                      ('models were not loaded.\n', Text.none),
                      'Models to be loaded should be specified with: ',
                      ('experiment.add_pytorch_models', Text.value)])
def send(self, data: Any) -> Any:
    """Send *data* via ``self._send``; return the response, or ``None`` after
    notifying the user of any recoverable network failure."""
    try:
        return self._send(data)
    except urllib.error.HTTPError as err:
        labml_notice([
            f'Failed to send to {self.url}: ',
            (str(err.code), Text.value),
            '\n' + str(err.reason)
        ])
    except urllib.error.URLError as err:
        labml_notice([f'Failed to connect to {self.url}\n', str(err.reason)])
    except socket.timeout as err:
        labml_notice([f'{self.url} timeout\n', str(err)])
    except ConnectionResetError as err:
        labml_notice([f'Connection reset by LabML App server {self.url}\n', str(err)])
    # All failure paths fall through to here
    return None
def _process(self, packets: List[Packet]) -> bool:
    """Batch-send packet payloads. At most one packet may carry a callback,
    which receives the URL the server returns. False on network failure."""
    callback = None
    payload = []
    for pkt in packets:
        if pkt.callback is not None:
            if callback is not None:
                raise RuntimeError('Multiple callbacks')
            callback = pkt.callback
        payload.append(pkt.data)

    if not payload:
        return True

    try:
        callback_url = self._send(payload)
    except urllib.error.HTTPError as err:
        labml_notice([
            f'Failed to send to {self.url}: ',
            (str(err.code), Text.value),
            '\n' + str(err.reason)
        ])
        return False
    except urllib.error.URLError as err:
        labml_notice([f'Failed to connect to {self.url}\n', str(err.reason)])
        return False
    except socket.timeout as err:
        labml_notice([f'{self.url} timeout\n', str(err)])
        return False
    except ConnectionResetError as err:
        labml_notice([f'Connection reset by LabML App server {self.url}\n', str(err)])
        return False

    if callback is not None:
        callback(callback_url)

    return True
def _process(self, packet: Packet) -> bool:
    """Send one packet's data, first dropping state attributes whose index is
    stale; pass the returned run URL to the packet callback. False on failure."""
    data = packet.data
    # State attributes from an older packet generation must not be re-sent
    stale = [k for k in data
             if k in self.state_attributes and self.key_idx[k] != packet.idx]
    for k in stale:
        del data[k]

    if not data:
        # Nothing left to send — a callback would never fire in this case
        assert packet.callback is None
        return True

    try:
        run_url = self._send(data)
    except urllib.error.HTTPError as err:
        labml_notice([
            f'Failed to send to {self.url}: ',
            (str(err.code), Text.value),
            '\n' + str(err.reason)
        ])
        return False
    except urllib.error.URLError as err:
        labml_notice([f'Failed to connect to {self.url}\n', str(err.reason)])
        return False
    except socket.timeout as err:
        labml_notice([f'{self.url} timeout\n', str(err)])
        return False
    except ConnectionResetError as err:
        labml_notice([f'Connection reset by LabML App server {self.url}\n', str(err)])
        return False

    if packet.callback is not None:
        packet.callback(run_url)

    return True
def __init__(self, *,
             uuid: str,
             name: Optional[str],
             python_file: Optional[str],
             comment: Optional[str],
             writers: Set[str],
             ignore_callers: Set[str],
             tags: Optional[Set[str]],
             is_evaluate: bool):
    """Create an experiment: resolve its name and paths, snapshot git state,
    and build the ``Run`` record. Does not start the experiment."""
    # Resolve the source file and experiment name. Notebooks have no caller
    # file, so synthetic defaults are used and the lab path is the CWD.
    if is_ipynb():
        lab_singleton().set_path(os.getcwd())
        if python_file is None:
            python_file = 'notebook.ipynb'
        if name is None:
            name = 'Notebook Experiment'
    else:
        if python_file is None:
            python_file = get_caller_file(ignore_callers)

        lab_singleton().set_path(python_file)
        if name is None:
            file_path = pathlib.PurePath(python_file)
            name = file_path.stem

    if comment is None:
        comment = ''
    # A globally-set comment overrides the per-experiment one
    if global_params_singleton().comment is not None:
        comment = global_params_singleton().comment

    # NOTE: depends on set_path() above having configured the lab first
    self.experiment_path = lab_singleton().experiments / name
    self.check_repo_dirty = lab_singleton().check_repo_dirty

    self.configs_processor = None

    # Default tags are derived from the experiment name, e.g. "mnist_cnn"
    if tags is None:
        tags = set(name.split('_'))

    self.run = Run.create(
        uuid=uuid,
        experiment_path=self.experiment_path,
        python_file=python_file,
        trial_time=time.localtime(),
        name=name,
        comment=comment,
        tags=list(tags))

    # Snapshot git state (remotes, commit, dirty flag, diff) for reproducibility.
    try:
        repo = git.Repo(lab_singleton().path)

        self.run.repo_remotes = list(repo.remote().urls)
        self.run.commit = repo.head.commit.hexsha
        self.run.commit_message = repo.head.commit.message.strip()
        self.run.is_dirty = repo.is_dirty()
        self.run.diff = repo.git.diff()
    except git.InvalidGitRepositoryError:
        # Hosted notebooks (Colab/Kaggle) are rarely git repos; don't nag there.
        if not is_colab() and not is_kaggle():
            labml_notice(["Not a valid git repository: ",
                          (lab_singleton().path, Text.value)])
        # Conservative fallbacks: mark the run dirty so it is never mistaken
        # for a clean, reproducible commit.
        self.run.commit = 'unknown'
        self.run.commit_message = ''
        self.run.is_dirty = True
        self.run.diff = ''

    self.checkpoint_saver = CheckpointSaver(self.run.checkpoint_path)
    self.is_evaluate = is_evaluate
    self.web_api = None
    self.writers = writers
    self.is_started = False
    # Distributed defaults: rank 0, world size -1 (i.e. not distributed)
    self.distributed_rank = 0
    self.distributed_world_size = -1