def start(self):
    """
    Send the initial run information to the web API and announce the returned URL.

    Does nothing when no web API is configured. Resets the commit bookkeeping
    (`last_committed`, `commits_count`) before the first packet is sent, and
    opens the URL in a browser when the web API settings ask for it.
    """
    if self.web_api is None:
        return

    payload = {
        'run_uuid': self.run_uuid,
        'name': self.name,
        'comment': self.comment,
        'time': time.time(),
        'configs': {},
    }
    if self.configs is not None:
        # Copy entry by entry so the payload does not alias `self.configs`
        payload['configs'].update(self.configs.items())

    self.last_committed = time.time()
    self.commits_count = 0

    url = self.send(payload)
    if url is not None:
        logger.log([('Monitor experiment at ', Text.meta), (url, Text.highlight)])
        if self.web_api.open_browser:
            webbrowser.open(url)
    return None
def calc_configs_dict(self, configs: Dict[str, any], configs_override: Optional[Dict[str, any]]):
    """
    Create a dictionary config processor from `configs` and `configs_override`,
    store it on the instance, run it, and finish with an empty log line.
    """
    processor = ConfigProcessorDict(configs, configs_override)
    self.configs_processor = processor
    # Calling the processor runs the calculation
    processor()
    logger.log()
def sample(self):
    """
    ### Sampling function to generate samples periodically while training

    Starts from a fixed prompt, greedily extends it by 25 tokens and logs
    the prompt followed by the generated tokens.
    """
    # Starting prompt
    prompt = 'It is'
    # Pieces to print; the prompt is shown subtly, predictions as values
    log = [(prompt, Text.subtle)]

    for _ in monit.iterate('Sample', 25):
        # Tokenize the whole prompt so far and move it to the model's device
        data = self.dataset.text_to_i(prompt).unsqueeze(-1).to(self.device)
        # Greedy prediction for every position
        predicted = self.model(data).argmax(dim=-1).squeeze()
        # Only the prediction at the last position extends the prompt
        next_token = self.dataset.itos[predicted[-1].item()]
        prompt += next_token
        log.append((next_token, Text.value))

    # Print the sampled output
    logger.log(log)
def _started(self, url):
    """
    Announce the monitoring `url` and optionally open it in a browser.

    A `None` url is ignored. Always returns `None`.
    """
    if url is not None:
        logger.log([('Monitor experiment at ', Text.meta), (url, Text.link)])
        if self.open_browser:
            webbrowser.open(url)
    return None
def run(self):
    """
    Train and validate once per epoch, writing an empty log line after each.

    `self.epochs()` is re-evaluated before every epoch, so a changed value
    takes effect on the next iteration.
    """
    current_epoch = 1
    while True:
        if not current_epoch <= self.epochs():
            break
        self.train()
        self.validate()
        logger.log()
        current_epoch += 1
def sample(self):
    """
    ### Sampling function to generate samples periodically while training

    Greedily generates 25 tokens from `self.prompt`, carrying the model's
    memory between steps, and logs the prompt with the generated pieces.
    """
    # Starting prompt
    prompt = self.prompt
    # Pieces to print
    log = [(prompt, Text.subtle)]
    # Memory passed to the model; empty on the first step
    mem = []

    for _ in monit.iterate('Sample', 25):
        # Tokenize the prompt and move it to the device
        data = self.text.text_to_i(prompt).unsqueeze(-1)
        data = data.to(self.device)
        # Model output plus the memory produced by this step
        output, new_mem = self.model(data, mem)
        # Greedy prediction
        output = output.argmax(dim=-1).squeeze(1)
        # Separator followed by the token predicted at the last position
        piece = self.prompt_separator + self.text.itos[output[-1]]
        # Only feed the last character to the model in the next iteration,
        # the rest goes in as memories
        prompt = (prompt + piece)[-1:]
        log.append((piece, Text.value))
        # Update memory
        mem = self.merge_memory(mem, new_mem)

    # Print the sampled output
    logger.log(log)
def solve(self):
    """
    Run `self.epochs` iterations of the solver.

    Each iteration calls `self.cfr` once per player, records per-action
    statistics of every information set with the tracker, and periodically
    saves tracker data and experiment checkpoints. The information sets are
    inspected once at the end.

    NOTE(review): block nesting was reconstructed from a flattened source;
    confirm that `logger.log()` belongs to the `track_frequency` branch and
    that `logger.inspect` runs after the loop.
    """
    for t in monit.loop(self.epochs):
        # When not updating online, reset every information set before the traversals
        if not self.is_online_update:
            for I in self.info_sets.values():
                I.clear()
        # One traversal per player from a fresh history; the last argument is a
        # list of ones, one per player (presumably initial reach probabilities
        # -- confirm against `cfr`)
        for i in range(self.n_players):
            self.cfr(self.create_new_history(), cast(Player, i), [1 for _ in range(self.n_players)])
        # When not updating online, apply the update once after all traversals
        if not self.is_online_update:
            self.update()
        with monit.section("Track"):
            # Record four values for every action of every information set
            for I in self.info_sets.values():
                for a in I.actions():
                    tracker.add({
                        f'strategy.{I.key}.{a}': I.strategy[a],
                        f'average_strategy.{I.key}.{a}': I.average_strategy[a],
                        f'regret.{I.key}.{a}': I.regret[a],
                        f'current_regret.{I.key}.{a}': I.current_regret[a]
                    })
        # Save tracked values every `track_frequency` iterations (including t == 0)
        if t % self.track_frequency == 0:
            tracker.save()
            logger.log()
        # Checkpoint after every `save_frequency` completed iterations
        if (t + 1) % self.save_frequency == 0:
            experiment.save_checkpoint()
    logger.inspect(self.info_sets)
def _bundle_target(base: Path, relative: str) -> Path:
    """Return `base / relative`, refusing any path that would land outside `base`."""
    base_resolved = base.resolve()
    resolved = (base_resolved / relative).resolve()
    try:
        resolved.relative_to(base_resolved)
    except ValueError:
        raise RuntimeError(f"Corrupted bundle. Unsafe path in archive: {relative}") from None
    return base / relative


def load_bundle(path: Path, url: Optional[str] = None) -> Tuple[str, int]:
    """
    Extract a run bundle (`.tar.gz`) into the experiments and data folders.

    :param path: location of the bundle archive; when `url` is given the
        archive is downloaded to this path first
    :param url: optional URL to download the bundle from
    :return: `(run_uuid, checkpoint)` read from the bundle's `info.json`
    :raises FileNotFoundError: if the archive does not exist
    :raises RuntimeError: if `info.json` is missing, or if the archive contains
        a path that would be written outside its destination folder

    If the run already exists with the same checkpoint nothing is extracted.
    Otherwise files are written under `<experiments>/bundled/<run_uuid>` and
    the lab data path.
    """
    if url:
        download_file(url, path)

    if not path.exists():
        raise FileNotFoundError(f'Bundle archive missing: {path}')

    with monit.section('Extract bundle'):
        with tarfile.open(str(path), 'r:gz') as tar:
            files = tar.getmembers()

            # Keep the last `info.json` entry if the archive has more than one
            info_member = None
            for f in files:
                if f.name == 'info.json':
                    info_member = f

            if not info_member:
                raise RuntimeError("Corrupted bundle. Missing info.json")

            with tar.extractfile(info_member) as ef:
                info = json.load(ef)

            run_uuid, checkpoint = info['uuid'], info['checkpoint']
            run_path = get_run_by_uuid(lab.get_experiments_path(), run_uuid)

            if run_path is not None:
                logger.log(f"Run {run_uuid} exists", Text.meta)
                current_checkpoint = _get_run_checkpoint(run_path, checkpoint)
                if checkpoint == current_checkpoint:
                    logger.log(f"Checkpoint {checkpoint} exists", Text.meta)
                    return run_uuid, checkpoint

            # `uuid` and `checkpoint` come from the archive, so they are
            # validated like any other archive-supplied path component
            run_path = _bundle_target(lab.get_experiments_path() / 'bundled', run_uuid)
            checkpoint_path = _bundle_target(run_path / "checkpoints", str(checkpoint))
            checkpoint_path.mkdir(parents=True, exist_ok=True)

            data_path = lab.get_data_path()
            data_path.mkdir(parents=True, exist_ok=True)

            for f in files:
                if f.name == 'run.yaml':
                    _extract_tar_file(tar, f, run_path / 'run.yaml')
                elif f.name == 'configs.yaml':
                    _extract_tar_file(tar, f, run_path / 'configs.yaml')
                elif f.name.startswith('checkpoint/'):
                    # Member names are untrusted: reject `..` and absolute paths
                    p = _bundle_target(checkpoint_path, f.name[len('checkpoint/'):])
                    p.parent.mkdir(parents=True, exist_ok=True)
                    _extract_tar_file(tar, f, p)
                elif f.name.startswith('data/'):
                    p = _bundle_target(data_path, f.name[len('data/'):])
                    p.parent.mkdir(parents=True, exist_ok=True)
                    _extract_tar_file(tar, f, p)

    return run_uuid, checkpoint
def _read_file(path: Path) -> List[int]:
    """
    Read and encode a file

    The content is parsed, stripped of comments and empty lines, has its
    indentation fixed, and is then encoded.

    :param path: file to read
    :return: the encoded content, or `None` when reading or processing fails
        (best effort: an empty log line is written and the failure is skipped)
    """
    try:
        with open(str(path)) as f:
            content = f.read()

        parsed = parse_string(content)
        parsed = _remove_comments(parsed)
        parsed = _remove_empty_lines(parsed)
        parsed = _fix_indentation(parsed)
        return encode(parsed)
    except Exception:
        # Deliberately best effort, but `Exception` (not a bare `except`) so that
        # KeyboardInterrupt and SystemExit still propagate
        logger.log()
        return None
def complete(predictor: Predictor, text: str, completion: int):
    """
    Log `text` followed by `completion` predicted characters, one log line
    per text line, each prefixed with its line number.

    Given characters are logged plainly (the first one subtly); predicted
    characters are logged in bold and appended to the text so later
    predictions see them. `text` must not be empty.
    """
    line_no = 1
    logs = [(f"{line_no: 4d}: ", Text.meta), (text[0], Text.subtle)]

    # `pos` is the index of the character handled in this step
    for pos in range(1, len(text) + completion):
        is_given = pos < len(text)
        if is_given:
            c = text[pos]
        else:
            c = predictor.get_next_char(text[:pos])

        if c == '\n':
            # Flush the finished line and start the next numbered one
            logger.log(logs)
            line_no += 1
            logs = [(f"{line_no: 4d}: ", Text.meta)]
        elif c != '\r':
            logs.append(c if is_given else (c, [Style.bold]))

        if not is_given:
            text += c

    logger.log(logs)
def __print_info(self):
    """
    Print the experiment info: name and run UUID, the comment (if any), the
    repository state with the commit message, and the run it was loaded from
    (if any).
    """
    logger.log()
    logger.log([
        (self.name, Text.title),
        ': ',
        (str(self.run.uuid), Text.meta)
    ])

    if self.run.comment != '':
        logger.log(['\t', (self.run.comment, Text.highlight)])

    # Keep the commit message on one line
    commit_message = self.run.commit_message.strip().replace('\n', '¶ ').replace('\r', '')

    # The tab is added separately: written as `"\t" "[dirty]" if ... else "[clean]"`
    # the adjacent literals concatenate first, so "[clean]" lost its indentation
    repo_state = "[dirty]" if self.run.is_dirty else "[clean]"
    logger.log([
        "\t" + repo_state,
        ": ",
        (f"\"{commit_message}\"", Text.highlight)
    ])

    if self.run.load_run is not None:
        logger.log([
            "\tloaded from",
            ": ",
            (f"{self.run.load_run}", Text.meta2),
        ])
def _launch(args: List[str]):
    """
    Run `python -u -m torch.distributed.launch <args>` and wait for it.

    Sets `RUN_UUID` in the environment if it is missing, and extends
    `PYTHONPATH` with the current working directory and its `src` folder.

    :param args: arguments passed on to `torch.distributed.launch`
    :raises subprocess.CalledProcessError: if the launcher exits with a
        non-zero return code
    """
    import sys
    import os

    if 'RUN_UUID' not in os.environ:
        os.environ['RUN_UUID'] = experiment.generate_uuid()

    cwd = os.getcwd()
    search_paths = [cwd, os.path.join(cwd, 'src')]
    if 'PYTHONPATH' in os.environ:
        python_path = os.environ['PYTHONPATH']
        print(python_path)
        search_paths.insert(0, python_path)
    # `os.pathsep` instead of a hard-coded ':' so the value is also valid on Windows
    os.environ['PYTHONPATH'] = os.pathsep.join(search_paths)

    cmd = [sys.executable, '-u', '-m', 'torch.distributed.launch', *args]
    print(cmd)

    try:
        process = subprocess.Popen(cmd, env=os.environ)
        process.wait()
    except Exception:
        logger.log('Error starting launcher', Text.danger)
        # Bare `raise` keeps the original traceback
        raise

    if process.returncode != 0:
        logger.log('Launcher failed', Text.danger)
        raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
def calc_configs(self, configs: Configs, configs_override: Optional[Dict[str, any]], run_order: Optional[List[Union[List[str], str]]]):
    """
    Create a config processor from `configs` and `configs_override`, store it
    on the instance, run it with `run_order`, and finish with an empty log line.
    """
    processor = ConfigProcessor(configs, configs_override)
    self.configs_processor = processor
    # Calling the processor with the run order runs the calculation
    processor(run_order)
    logger.log()
def start(self, *, run_uuid: Optional[str] = None, checkpoint: Optional[int] = None):
    """
    Start the experiment and return an `ExperimentWatcher` for it.

    :param run_uuid: when given, start from a checkpoint of this run
    :param checkpoint: checkpoint to start from; `-1` is used when omitted
        (presumably meaning "latest" -- confirm in `__start_from_checkpoint`)

    Exits the process with status 1 on rank 0 when `check_repo_dirty` is set
    and the repository has uncommitted changes.

    NOTE(review): block nesting was reconstructed from a flattened source;
    confirm which statements sit inside the `distributed_rank == 0` and
    `configs_processor is not None` branches.
    """
    if run_uuid is not None:
        if checkpoint is None:
            checkpoint = -1
        global_step = self.__start_from_checkpoint(run_uuid, checkpoint)
    else:
        global_step = 0

    self.run.start_step = global_step

    self._start_tracker()
    tracker().set_start_global_step(global_step)

    # Only rank 0 prints the run info and enforces the clean-repository check
    if self.distributed_rank == 0:
        self.__print_info()
        if self.check_repo_dirty and self.run.is_dirty:
            logger.log([
                ("[FAIL]", Text.danger),
                " Cannot trial an experiment with uncommitted changes."
            ])
            exit(1)

    # Nothing is persisted or reported when only evaluating
    if not self.is_evaluate:
        if self.distributed_rank == 0:
            from labml.internal.computer.configs import computer_singleton
            computer_singleton().add_project(lab_singleton().path)
            self.run.save_info()
        self._save_pid()
        if self.distributed_rank == 0:
            # Configs are saved to file, and to each enabled integration below
            if self.configs_processor is not None:
                self.configs_processor.add_saver(
                    FileConfigsSaver(self.run.configs_path))
            if self.web_api is not None:
                self.web_api.start(self.run)
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.web_api.get_configs_saver())
                    self.web_api.set_dynamic_handler(
                        ExperimentDynamicUpdateHandler(
                            self.configs_processor))
            if self.wandb is not None:
                self.wandb.init(self.run.name, self.run.run_path)
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.wandb.get_configs_saver())
            tracker().save_indicators(self.run.indicators_path)

    self.is_started = True
    return ExperimentWatcher(self)
def _start_app_server():
    """
    Start the `labml_app` server, or print install instructions when the
    package cannot be imported.
    """
    try:
        import labml_app
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so this covers both
        logger.log("Cannot import ", ('labml_app', Text.highlight), '.')
        logger.log('Install with ', ('pip install labml-app', Text.value))
    else:
        labml_app.start_server()
def _open_dashboard():
    """
    Start the `labml_dashboard` server, or print install instructions when the
    package cannot be imported.
    """
    try:
        import labml_dashboard
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so this covers both
        logger.log("Cannot import ", ('labml_dashboard', Text.highlight), '.')
        logger.log('Install with ', ('pip install labml_dashboard', Text.value))
    else:
        labml_dashboard.start_server()
def handler(self, sig, frame):
    """
    Signal handler that delays the first interrupt and forwards the second.

    The first call stores `(sig, frame)` and logs a notice. Any later call
    invokes the previous handler with the stored signal.
    """
    if self.signal_received is None:
        # Store the interrupt signal for later
        self.signal_received = (sig, frame)
        logger.log([('\nSIGINT received. Delaying KeyboardInterrupt.', Text.danger)])
        return

    # Pass second interrupt without delaying
    self.old_handler(*self.signal_received)
def __finish(self):
    """
    Restore the previous SIGINT handler, flush the tracker, and save a
    checkpoint when model saving is enabled.
    """
    try:
        signal.signal(signal.SIGINT, self.old_handler)
    except ValueError:
        # Best effort: a handler that cannot be restored is ignored
        # (`signal.signal` raises ValueError outside the main thread)
        pass

    tracker.save()
    tracker.new_line()

    if not self.__is_save_models:
        return
    logger.log("Saving model...")
    experiment.save_checkpoint()
def __load_configs(self):
    """
    Prepare the configuration folders, load (or create) the configs file and
    copy its settings onto the instance.

    A missing `uuid` is generated and written back to the configs file.
    Missing keys fall back to `__default_config()`. A `web_api` value that
    does not start with `http` is treated as a token and expanded into the
    default API URL.
    """
    # A plain file in place of the config folder is removed first
    if self.config_folder.is_file():
        self.config_folder.unlink()
    if not self.config_folder.exists():
        self.config_folder.mkdir(parents=True)
    for folder in (self.projects_folder, self.app_folder, self.runs_cache):
        if not folder.exists():
            folder.mkdir()

    if self.configs_file.exists():
        with open(str(self.configs_file)) as f:
            config = util.yaml_load(f.read())
        if config is None:
            config = {}
    else:
        logger.log([('~/labml/configs.yaml', Text.value),
                    ' does not exist. Creating ',
                    (str(self.configs_file), Text.meta)])
        config = {}

    if 'uuid' not in config:
        from uuid import uuid1
        config['uuid'] = uuid1().hex
        # Persist right away so the generated id is stable across runs
        with open(str(self.configs_file), 'w') as f:
            f.write(util.yaml_dump(config))

    # Defaults fill in keys the file does not set (they are not written back)
    for key, value in self.__default_config().items():
        config.setdefault(key, value)

    self.uuid = config['uuid']

    web_api_url = config['web_api']
    if not web_api_url.startswith('http'):
        web_api_url = f"https://api.labml.ai/api/v1/computer?labml_token={web_api_url}&"

    self.web_api = WebAPIConfigs(
        url=web_api_url,
        frequency=config['web_api_frequency'],
        verify_connection=config['web_api_verify_connection'],
        open_browser=config['web_api_open_browser'],
        is_default=web_api_url == self.__default_config()['web_api'])

    # The remaining settings are copied over under the same names
    for name in ('web_api_sync', 'web_api_polling',
                 'tensorboard_port', 'tensorboard_visible_port',
                 'tensorboard_host', 'tensorboard_protocol'):
        setattr(self, name, config[name])
def main():
    """
    Train and validate the MNIST model for the configured number of epochs
    inside a tracked experiment, then save a checkpoint.
    """
    # Configurations
    configs = {
        'epochs': 10,
        'train_batch_size': 64,
        'valid_batch_size': 100,
        'use_cuda': True,
        'seed': 5,
        'train_log_interval': 10,
        'learning_rate': 0.01,
    }

    # Use the first GPU when requested and available, otherwise the CPU
    use_gpu = configs['use_cuda'] and torch.cuda.is_available()
    device = torch.device("cuda:0") if use_gpu else torch.device("cpu")

    train_loader = torch.utils.data.DataLoader(
        RemoteDataset('mnist_train'),
        batch_size=configs['train_batch_size'],
        shuffle=True,
        num_workers=4)
    valid_loader = torch.utils.data.DataLoader(
        RemoteDataset('mnist_valid'),
        batch_size=configs['valid_batch_size'],
        shuffle=False,
        num_workers=4)

    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=configs['learning_rate'])

    torch.manual_seed(configs['seed'])

    # Create the experiment
    experiment.create(name='mnist_labml_monit')
    # Save configurations
    experiment.configs(configs)
    # Set PyTorch models for checkpoint saving and loading
    experiment.add_pytorch_models(dict(model=model))

    # Start and monitor the experiment
    last_epoch = configs['epochs']
    with experiment.start():
        for _ in monit.loop(range(1, last_epoch + 1)):
            train(model, optimizer, train_loader, device, configs['train_log_interval'])
            validate(model, valid_loader, device)
            logger.log()

    # Save the model
    experiment.save_checkpoint()
def job_rsync(server: str, delay: int, show_output: bool):
    """
    RSync job outputs from server

    Syncs once with full or dotted output depending on `show_output`. With a
    positive `delay` it then keeps syncing forever, sleeping `delay` seconds
    between passes.
    """
    first_pass_mode = UIMode.full if show_output else UIMode.dots
    for k in util.get_servers(server):
        SERVERS[k].rsync_jobs(ui_mode=first_pass_mode)

    if not delay > 0:
        return

    # Watch mode: never returns
    while True:
        logger.log('Watching...', Text.meta)
        time.sleep(delay)
        for k in util.get_servers(server):
            SERVERS[k].rsync_jobs()
def evaluate(predictor: Predictor, text: str):
    """
    Replay `text` through the predictor's next-word suggestions and report
    how much of it was predicted correctly.

    At every position the top suggestion is compared with the text that
    follows. A match is accepted as a whole and counts as one key stroke;
    otherwise a single character is typed. The text is logged line by line:
    accepted suggestions are underlined, typed characters use the warning
    style.

    :param predictor: predictor providing `rstrip` and `get_next_word`
    :param text: text to evaluate on; must not be empty

    Reports `accuracy` (correctly predicted characters over `len(text) - 1`),
    `key_strokes` and `length`. Accuracy is reported as `0.0` for a
    one-character text, where there is nothing to predict.
    """
    line_no = 1
    logs = [(f"{line_no: 4d}: ", Text.meta), (text[0], Text.subtle)]

    correct = 0
    i = 0
    key_strokes = 0

    while i + 1 < len(text):
        prefix = text[:i + 1]
        stripped, prompt = predictor.rstrip(prefix)
        # Part of the prefix that `rstrip` cut off
        rest = prefix[len(stripped):]
        prediction_complete = NextWordPredictionComplete(rest, 5)
        prompt = torch.tensor(prompt, dtype=torch.long).unsqueeze(-1)

        predictions = predictor.get_next_word(prompt, None, rest, [1.], prediction_complete, 5)
        # Sort by the first field, largest first
        predictions.sort(key=lambda x: -x[0])

        if predictions:
            # Suggested continuation without the part that was already typed
            next_token = predictions[0].text[len(rest):]
        else:
            next_token = ''

        if next_token and next_token == text[i + 1:i + 1 + len(next_token)]:
            correct += len(next_token)
            right = True
        else:
            # No usable suggestion: type the next character
            next_token = text[i + 1]
            right = False

        for j, c in enumerate(next_token):
            if c == '\n':
                logger.log(logs)
                line_no += 1
                logs = [(f"{line_no: 4d}: ", Text.meta)]
            elif c == '\r':
                continue
            else:
                if right:
                    if j == 0:
                        logs.append((c, [Text.meta, Style.underline]))
                    else:
                        logs.append((c, [Text.success, Style.underline]))
                else:
                    logs.append((c, [Text.warning]))

        i += len(next_token)
        key_strokes += 1

    logger.log(logs)

    # Guard against division by zero for a one-character text
    predictable = len(text) - 1
    accuracy = correct / predictable if predictable > 0 else 0.0

    logger.inspect(accuracy=accuracy,
                   key_strokes=key_strokes,
                   length=len(text))
def calc_configs(self, configs: Union[Configs, Dict[str, any]], configs_override: Optional[Dict[str, any]]):
    """
    Create the config processor for `configs`.

    :param configs: configurations object or dictionary
    :param configs_override: optional overrides; not modified by this call

    Overrides from the global parameters, when set, take precedence over
    `configs_override`. Rank 0 writes an empty log line afterwards.
    """
    # Work on a copy so the caller's dictionary is not mutated by the merge below
    overrides = {} if configs_override is None else dict(configs_override)

    global_configs = global_params_singleton().configs
    if global_configs is not None:
        overrides.update(global_configs)

    self.configs_processor = ConfigProcessor(configs, overrides)

    if self.distributed_rank == 0:
        logger.log()
def device(c: Configs):
    """
    Pick the torch device for the configs.

    Returns the CPU when CUDA is disabled or unavailable. Otherwise returns
    the configured CUDA device, falling back to the last available one (with
    a warning) when the index is out of range.
    """
    if not (c.use_cuda and torch.cuda.is_available()):
        return torch.device('cpu')

    if c.cuda_device < torch.cuda.device_count():
        return torch.device('cuda', c.cuda_device)

    logger.log(f"Cuda device index {c.cuda_device} higher than "
               f"device count {torch.cuda.device_count()}", Text.warning)
    last_index = torch.cuda.device_count() - 1
    return torch.device('cuda', last_index)
def get_configs(run_uuid: str):
    """
    Load the saved `configs.yaml` of the run with the given UUID.

    Returns `None` (after logging a message) when no experiment is found
    for the run.
    """
    exp_name = find_experiment(run_uuid)
    if exp_name is None:
        logger.log("Couldn't find a previous run")
        return None

    run_folder = lab_singleton().experiments / exp_name / run_uuid
    return load_configs(run_folder / "configs.yaml")
def _print_artifacts_list(table: Dict[str, int], artifacts: Dict[str, Artifact]):
    """
    Log the string value of every artifact in `table` for every key.

    :param table: artifact names to print, in printing order
    :param artifacts: all artifacts by name

    Keys are visited in the order they are first seen across the artifacts,
    so the output order is the same on every run.
    """
    order = list(table.keys())
    if not order:
        return

    # De-duplicate while keeping first-seen order; a set iterates in an
    # arbitrary order that can change between runs
    keys = dict.fromkeys(k for name in order for k in artifacts[name].keys())

    for k in keys:
        for name in order:
            value = artifacts[name].get_string(k, artifacts)
            logger.log([(name, Text.key), ": ", (value, Text.value)])
def run(self):
    """
    Track one text artifact and two indexed text artifacts for every epoch,
    saving the tracker and writing an empty log line after each epoch.
    """
    tracker.set_text('text_artifact', is_print=True)
    for indexed_name in ('ti', 'other'):
        tracker.set_indexed_text(indexed_name, is_print=True)

    for i in monit.loop(self.epochs):
        tracker.add('text_artifact', f'sample {i}')
        # Five indexed entries per epoch for each indexed artifact
        for j in range(5):
            index = f'{j}'
            tracker.add('ti', (index, 'text' * 5 + f'text {i} {j}'))
            tracker.add('other', (f'{j}', f'other {j}'))
        tracker.save()
        logger.log()
def test_nvidia_device(idx: int):
    """
    Inspect the NVIDIA device at index `idx`: its current statistics, followed
    by the graphics and compute processes running on it.
    """
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)
    # Queried but not displayed at the moment (see the disabled fields below)
    pci_info = nvml.nvmlDeviceGetPciInfo(handle)
    brand_names = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }

    # Fields currently disabled: id (PCI bus id), uuid, brand, multi_gpu,
    # pcie_link, power state, util_mem and display mode
    stats = dict(
        idx=idx,
        name=nvml.nvmlDeviceGetName(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )
    inspect(**stats)
    logger.log()

    # Graphics processes first, then compute processes
    graphics_procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for proc in graphics_procs:
        inspect(name=nvml.nvmlSystemGetProcessName(proc.pid),
                pid=proc.pid,
                mem=proc.usedGpuMemory)

    compute_procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for proc in compute_procs:
        inspect(name=nvml.nvmlSystemGetProcessName(proc.pid),
                pid=proc.pid,
                mem=proc.usedGpuMemory)

    logger.log()
def eval(self):
    """
    Simulate typing the content with the predictor's suggestions and report
    how many key presses were saved.

    Every line is typed character by character. Whenever the predictor's
    suggestion matches the upcoming text it is accepted as a whole, saving
    all but one of its characters. Each line is logged with accepted
    suggestions underlined, followed by the predictor timings and a summary.
    `percentage_saved` is reported as `0.0` when the content has no
    characters.
    """
    keys_saved = 0

    for line, content in enumerate(self.__content):
        # Keep reference to rest of the line
        rest_of_line = content

        # Build the line for logging with colors, starting with the line number
        logs = [(f"{line: 4d}: ", Text.meta)]

        # Type the line character by character
        while rest_of_line != '':
            suggestion = self.__predictor.get_suggestion()

            if suggestion != '' and rest_of_line.startswith(suggestion):
                # Suggestion matches: first character is the key press, the rest is saved
                logs.append((suggestion[0], [Style.underline, Text.danger]))
                logs.append((suggestion[1:], Style.underline))
                keys_saved += len(suggestion) - 1

                # Skip the predicted text and feed it to the predictor
                rest_of_line = rest_of_line[len(suggestion):]
                self.__predictor.add(suggestion)
            else:
                # Suggestion doesn't match: type the next character
                self.__predictor.add(rest_of_line[0])
                logs.append((rest_of_line[0], Text.subtle))
                rest_of_line = rest_of_line[1:]

        # Add a new line
        self.__predictor.add("\n")

        # Log the line
        logger.log(logs)

    # Log time taken for the file
    logger.inspect(add=self.__predictor.time_add,
                   check=self.__predictor.time_check,
                   predict=self.__predictor.time_predict)

    total_keys = sum(len(c) for c in self.__content)
    # Guard against division by zero for empty content
    percentage_saved = 100 * keys_saved / total_keys if total_keys else 0.0

    logger.inspect(keys_saved=keys_saved,
                   percentage_saved=percentage_saved,
                   total_keys=total_keys,
                   total_lines=len(self.__content))
def start(self, *, run_uuid: Optional[str] = None, checkpoint: Optional[int] = None):
    """
    Start the experiment and return an `ExperimentWatcher` for it.

    :param run_uuid: when given, start from a checkpoint of this run
    :param checkpoint: checkpoint to start from; `-1` is used when omitted
        (presumably meaning "latest" -- confirm in `__start_from_checkpoint`)

    Exits the process with status 1 on rank 0 when `check_repo_dirty` is set
    and the repository has uncommitted changes.

    NOTE(review): block nesting was reconstructed from a flattened source;
    confirm which statements sit inside the `distributed_rank == 0` branches.
    """
    if run_uuid is not None:
        if checkpoint is None:
            checkpoint = -1
        global_step = self.__start_from_checkpoint(run_uuid, checkpoint)
    else:
        global_step = 0

    self.run.start_step = global_step

    self._start_tracker()
    tracker().set_start_global_step(global_step)

    # Only rank 0 prints the run info and enforces the clean-repository check
    if self.distributed_rank == 0:
        self.__print_info()
        if self.check_repo_dirty and self.run.is_dirty:
            logger.log([
                ("[FAIL]", Text.danger),
                " Cannot trial an experiment with uncommitted changes."
            ])
            exit(1)

    # Nothing is persisted or reported when only evaluating
    if not self.is_evaluate:
        if self.distributed_rank == 0:
            self.run.save_info()
        self._save_pid()
        if self.distributed_rank == 0:
            # Configs are saved to file, and to the web API when it is enabled
            if self.configs_processor is not None:
                self.configs_processor.add_saver(
                    FileConfigsSaver(self.run.configs_path))
            if self.web_api is not None:
                self.web_api.set_info(run_uuid=self.run.uuid,
                                      name=self.run.name,
                                      comment=self.run.comment)
                self.web_api.start()
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.web_api.get_configs_saver())
            tracker().save_indicators(self.run.indicators_path)
            # PERF: Writing to tensorboard takes about 4 seconds
            # Also won't work when configs are updated live
            # if self.configs_processor:
            #     tracker().write_h_parameters(self.configs_processor.get_hyperparams())

    self.is_started = True
    return ExperimentWatcher(self)