def _start_tracker(self):
    """Reset the tracker and attach the writers named in ``self.writers``.

    The tracker's writers are always reset first.  No writer is attached
    when ``self.is_evaluate`` is true or when ``self.distributed_rank`` is
    not ``0``.

    Recognised writer names are ``'screen'``, ``'sqlite'``,
    ``'tensorboard'``, ``'wandb'``, ``'comet'``, ``'file'`` and
    ``'web_api'``.  Writer modules are imported lazily, only when the
    corresponding name is requested.

    Side effects: ``self.wandb``, ``self.comet`` and ``self.web_api`` are
    set to the created object, or to ``None`` when that back end is not
    in use.
    """
    tracker().reset_writers()

    # Define the optional handles before any early return so that they exist
    # as attributes on every path (evaluate mode, non-zero ranks, and a
    # requested-but-unconfigured web API) instead of being left undefined or
    # stale from a previous call.
    self.wandb = None
    self.comet = None
    self.web_api = None

    if self.is_evaluate:
        return

    if self.distributed_rank != 0:
        return

    if 'screen' in self.writers:
        from labml.internal.tracker.writers import screen
        tracker().add_writer(screen.ScreenWriter())

    if 'sqlite' in self.writers:
        from labml.internal.tracker.writers import sqlite
        tracker().add_writer(
            sqlite.Writer(self.run.sqlite_path, self.run.artifacts_folder))

    if 'tensorboard' in self.writers:
        from labml.internal.tracker.writers import tensorboard
        tracker().add_writer(
            tensorboard.Writer(self.run.tensorboard_log_path))

    if 'wandb' in self.writers:
        from labml.internal.tracker.writers import wandb
        self.wandb = wandb.Writer()
        tracker().add_writer(self.wandb)

    if 'comet' in self.writers:
        from labml.internal.tracker.writers import comet
        self.comet = comet.Writer()
        tracker().add_writer(self.comet)

    if 'file' in self.writers:
        from labml.internal.tracker.writers import file
        tracker().add_writer(file.Writer(self.run.log_file))

    if 'web_api' in self.writers:
        web_api_conf = lab_singleton().web_api
        # Without a web API configuration there is nothing to connect to;
        # ``self.web_api`` then stays ``None``.
        if web_api_conf is not None:
            from labml.internal.tracker.writers import web_api
            from labml.internal.api import ApiCaller
            from labml.internal.api.experiment import ApiExperiment
            # The same caller is shared by the experiment API object and the
            # tracker writer.
            api_caller = ApiCaller(web_api_conf.url,
                                   {'run_uuid': self.run.uuid},
                                   timeout_seconds=120)
            self.web_api = ApiExperiment(
                api_caller,
                frequency=web_api_conf.frequency,
                open_browser=web_api_conf.open_browser)
            tracker().add_writer(
                web_api.Writer(api_caller, frequency=web_api_conf.frequency))
def find_experiment(run_uuid: str) -> Optional[str]:
    """Return the name of the experiment that contains the run ``run_uuid``.

    Each experiment listed by ``get_experiments`` under the lab's
    experiments folder is checked for a ``<experiment>/<run_uuid>`` path.

    :param run_uuid: identifier of the run to look for.
    :return: the first experiment name whose folder contains the run, or
        ``None`` when no experiment does.
    """
    root = lab_singleton().experiments
    candidates = get_experiments(root)
    # Lazily probe each experiment and stop at the first hit.
    matches = (
        candidate
        for candidate in candidates
        if Path(root / candidate / run_uuid).exists()
    )
    return next(matches, None)
def start(self, *,
          run_uuid: Optional[str] = None,
          checkpoint: Optional[int] = None):
    """Start the experiment and return an ``ExperimentWatcher`` for it.

    :param run_uuid: when given, the run is resumed through
        ``__start_from_checkpoint`` and the global step it returns is used
        as the starting step; otherwise the run starts at step ``0``.
    :param checkpoint: checkpoint to resume from; ``None`` is replaced by
        ``-1`` (presumably "most recent" -- confirm against the checkpoint
        loader).
    :return: ``ExperimentWatcher(self)``.
    :raises SystemExit: with code ``1`` when ``self.check_repo_dirty`` is
        set and the run is marked dirty.
    """
    # ``exit`` is injected by the ``site`` module for interactive use and is
    # not guaranteed to exist (e.g. ``python -S`` or frozen applications);
    # ``sys.exit`` raises the same ``SystemExit``.
    import sys

    if run_uuid is not None:
        if checkpoint is None:
            checkpoint = -1
        global_step = self.__start_from_checkpoint(run_uuid, checkpoint)
    else:
        global_step = 0

    self.run.start_step = global_step

    self._start_tracker()
    tracker().set_start_global_step(global_step)

    if self.distributed_rank == 0:
        self.__print_info()
    if self.check_repo_dirty and self.run.is_dirty:
        logger.log([
            ("[FAIL]", Text.danger),
            " Cannot trial an experiment with uncommitted changes."
        ])
        sys.exit(1)

    if not self.is_evaluate:
        if self.distributed_rank == 0:
            from labml.internal.computer.configs import computer_singleton
            computer_singleton().add_project(lab_singleton().path)

            self.run.save_info()
        self._save_pid()

        if self.distributed_rank == 0:
            if self.configs_processor is not None:
                self.configs_processor.add_saver(
                    FileConfigsSaver(self.run.configs_path))

            if self.web_api is not None:
                self.web_api.start(self.run)
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.web_api.get_configs_saver())
                    # NOTE(review): the dynamic-update handler wraps the
                    # configs processor, so it is registered only when one
                    # exists -- confirm against the intended nesting.
                    self.web_api.set_dynamic_handler(
                        ExperimentDynamicUpdateHandler(
                            self.configs_processor))

            if self.wandb is not None:
                self.wandb.init(self.run.name, self.run.run_path)
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.wandb.get_configs_saver())

            tracker().save_indicators(self.run.indicators_path)

    self.is_started = True
    return ExperimentWatcher(self)
def reset_store(self):
    """Clear the indicator store and re-register the lab's indicators.

    All indicator containers are emptied, the indicators file handle is
    dropped, and the store is flagged as updated.  Every indicator
    description exposed by ``lab_singleton().indicators`` is then turned
    into an indicator and added again.  A ``LabYamlNotfoundError`` raised
    while doing so is ignored.
    """
    self.indicators, self.dot_indicators = {}, {}
    self.__indicators_file = None
    self.namespaces = []
    self.is_indicators_updated = True

    try:
        for spec in lab_singleton().indicators:
            indicator = load_indicator_from_dict(spec)
            self.add_indicator(indicator)
    except LabYamlNotfoundError:
        # Deliberately best-effort: the store is simply left with whatever
        # was added before the error.
        return
def get_configs(run_uuid: str):
    """Load the saved configs of the run ``run_uuid``.

    :param run_uuid: identifier of the run whose ``configs.yaml`` is read.
    :return: whatever ``load_configs`` returns for
        ``<experiments>/<experiment>/<run_uuid>/configs.yaml``, or ``None``
        (after logging a message) when no experiment contains the run.
    """
    experiment = find_experiment(run_uuid)
    if experiment is None:
        logger.log("Couldn't find a previous run")
        return None

    run_dir = lab_singleton().experiments / experiment / run_uuid
    return load_configs(run_dir / "configs.yaml")
def _test():
    """Manual smoke test: start TensorBoard on a sample run and print the result.

    The lab path is set to the directory four levels above this file, a
    ``TensorBoardStarter`` is created on the computer's TensorBoard symlink
    directory, and it is started on one hard-coded sample run under
    ``logs/sample``.  The function then sleeps for 100 seconds (presumably
    to keep the process alive while TensorBoard is inspected by hand).
    """
    from labml.internal.computer.configs import computer_singleton
    from labml import lab
    from labml.internal.lab import lab_singleton
    import time

    this_file = Path(os.path.abspath(__file__))
    lab_root = this_file.parent.parent.parent.parent
    lab_singleton().set_path(str(lab_root))

    symlink_dir = computer_singleton().tensorboard_symlink_dir
    starter = TensorBoardStarter(symlink_dir)

    sample_run = (lab.get_path() / 'logs' / 'sample'
                  / '68233e98cb5311eb9aa38d17b08f3a1d')
    outcome = starter.start([sample_run])
    print(outcome)

    time.sleep(100)
def get_run_checkpoint(run_uuid: str, checkpoint: int = -1):
    """Locate a checkpoint of the run ``run_uuid``.

    :param run_uuid: identifier of the run to search for.
    :param checkpoint: requested checkpoint, forwarded to
        ``_get_run_checkpoint`` together with the run's folder.
    :return: ``(path, checkpoint)`` where ``path`` is
        ``<run folder>/checkpoints/<checkpoint>`` and ``checkpoint`` is the
        value resolved by ``_get_run_checkpoint``; ``(None, None)`` when
        either the run or a checkpoint cannot be found (a message is logged
        in both cases).
    """
    experiment = find_experiment(run_uuid)
    if experiment is None:
        logger.log("Couldn't find a previous run")
        return None, None

    run_dir = lab_singleton().experiments / experiment / run_uuid
    resolved = _get_run_checkpoint(run_dir, checkpoint)
    if resolved is None:
        logger.log("Couldn't find checkpoints")
        return None, None

    # Report the selection as key/value pairs.
    message = ["Selected "]
    message += [("experiment", Text.key), " = ", (experiment, Text.value)]
    message += [" ", ("run", Text.key), " = ", (run_uuid, Text.value)]
    message += [" ", ("checkpoint", Text.key), " = ",
                (str(resolved), Text.value)]
    logger.log(message)

    return run_dir / "checkpoints" / str(resolved), resolved
def __init__(self, *,
             uuid: str,
             name: Optional[str],
             python_file: Optional[str],
             comment: Optional[str],
             writers: Set[str],
             ignore_callers: Set[str],
             tags: Optional[Set[str]],
             is_evaluate: bool):
    """Create the experiment and the ``Run`` record that describes it.

    :param uuid: identifier passed on to ``Run.create``.
    :param name: experiment name.  Defaults to ``'Notebook Experiment'`` in
        a notebook, otherwise to the stem of ``python_file``.
    :param python_file: source file of the experiment.  Defaults to
        ``'notebook.ipynb'`` in a notebook, otherwise to the file returned
        by ``get_caller_file(ignore_callers)``.
    :param comment: run comment; ``None`` becomes ``''``.  A comment set on
        ``global_params_singleton()`` takes precedence over this argument.
    :param writers: writer names, stored as ``self.writers``.
    :param ignore_callers: forwarded to ``get_caller_file``.
    :param tags: run tags; defaults to the parts of ``name`` split on
        ``'_'``.
    :param is_evaluate: stored as ``self.is_evaluate``.
    """
    if is_ipynb():
        lab_singleton().set_path(os.getcwd())
        if python_file is None:
            python_file = 'notebook.ipynb'
        if name is None:
            name = 'Notebook Experiment'
    else:
        if python_file is None:
            python_file = get_caller_file(ignore_callers)
        lab_singleton().set_path(python_file)
        if name is None:
            file_path = pathlib.PurePath(python_file)
            name = file_path.stem

    if comment is None:
        comment = ''
    if global_params_singleton().comment is not None:
        comment = global_params_singleton().comment

    self.experiment_path = lab_singleton().experiments / name
    self.check_repo_dirty = lab_singleton().check_repo_dirty
    self.configs_processor = None

    if tags is None:
        tags = set(name.split('_'))

    self.run = Run.create(
        uuid=uuid,
        experiment_path=self.experiment_path,
        python_file=python_file,
        trial_time=time.localtime(),
        name=name,
        comment=comment,
        tags=list(tags))

    try:
        repo = git.Repo(lab_singleton().path)
        try:
            self.run.repo_remotes = list(repo.remote().urls)
        except ValueError:
            # ``Repo.remote()`` looks up the remote named "origin" and raises
            # ValueError when it does not exist (e.g. a purely local
            # repository).  That must not abort experiment creation, so the
            # run keeps whatever ``Run.create`` initialised, exactly as in the
            # not-a-repository fallback below.
            pass
        self.run.commit = repo.head.commit.hexsha
        self.run.commit_message = repo.head.commit.message.strip()
        self.run.is_dirty = repo.is_dirty()
        self.run.diff = repo.git.diff()
    except git.InvalidGitRepositoryError:
        # The notice is skipped on Colab and Kaggle.
        if not is_colab() and not is_kaggle():
            labml_notice(["Not a valid git repository: ",
                          (lab_singleton().path, Text.value)])
        self.run.commit = 'unknown'
        self.run.commit_message = ''
        self.run.is_dirty = True
        self.run.diff = ''

    self.checkpoint_saver = CheckpointSaver(self.run.checkpoint_path)
    self.is_evaluate = is_evaluate
    self.web_api = None
    self.writers = writers
    self.is_started = False
    self.distributed_rank = 0
    self.distributed_world_size = -1
def __init__(self, *,
             name: Optional[str],
             python_file: Optional[str],
             comment: Optional[str],
             writers: Set[str],
             ignore_callers: Set[str],
             tags: Optional[Set[str]]):
    """Create the experiment, its ``Run`` record and its logger writers.

    :param name: experiment name; defaults to the stem of ``python_file``.
        Required when the caller file starts with ``'<ipython'``.
    :param python_file: source file of the experiment; defaults to the file
        returned by ``get_caller_file(ignore_callers)``.
    :param comment: run comment; ``None`` becomes ``''``.
    :param writers: writer names; ``'sqlite'`` and ``'tensorboard'`` are
        recognised here.
    :param ignore_callers: forwarded to ``get_caller_file``.
    :param tags: run tags; defaults to the parts of ``name`` split on
        ``'_'``.
    :raises ValueError: when running from a notebook without ``name``.
    """
    import warnings

    if python_file is None:
        python_file = get_caller_file(ignore_callers)

    if python_file.startswith('<ipython'):
        assert is_ipynb()
        if name is None:
            raise ValueError("You must specify python_file or experiment name"
                             " when creating an experiment from a python notebook.")
        lab_singleton().set_path(os.getcwd())
        python_file = 'notebook.ipynb'
    else:
        lab_singleton().set_path(python_file)
        if name is None:
            file_path = pathlib.PurePath(python_file)
            name = file_path.stem

    if comment is None:
        comment = ''

    self.name = name
    self.experiment_path = lab_singleton().experiments / name
    self.check_repo_dirty = lab_singleton().check_repo_dirty
    self.configs_processor = None

    # ``exist_ok=True`` replaces the exists()-then-mkdir() pair, which can
    # fail when another process creates the folder in between.
    pathlib.Path(self.experiment_path).mkdir(parents=True, exist_ok=True)

    if tags is None:
        tags = set(name.split('_'))

    self.run = Run.create(
        experiment_path=self.experiment_path,
        python_file=python_file,
        trial_time=time.localtime(),
        comment=comment,
        tags=list(tags))

    try:
        repo = git.Repo(lab_singleton().path)
        self.run.commit = repo.head.commit.hexsha
        self.run.commit_message = repo.head.commit.message.strip()
        self.run.is_dirty = repo.is_dirty()
        self.run.diff = repo.git.diff()
    except git.InvalidGitRepositoryError:
        # Same fallback as the other experiment constructors in this file:
        # running outside a git repository records an unknown, dirty commit
        # instead of aborting experiment creation.
        if not is_colab() and not is_kaggle():
            warnings.warn("Not a valid git repository",
                          UserWarning, stacklevel=4)
        self.run.commit = 'unknown'
        self.run.commit_message = ''
        self.run.is_dirty = True
        self.run.diff = ''

    logger_internal().reset_writers()

    if 'sqlite' in writers:
        from labml.internal.logger.writers import sqlite
        pathlib.Path(self.run.artifacts_folder).mkdir(parents=True,
                                                      exist_ok=True)
        logger_internal().add_writer(
            sqlite.Writer(self.run.sqlite_path, self.run.artifacts_folder))
    if 'tensorboard' in writers:
        from labml.internal.logger.writers import tensorboard
        logger_internal().add_writer(
            tensorboard.Writer(self.run.tensorboard_log_path))

    self.checkpoint_saver = None
def __init__(self, *,
             name: Optional[str],
             python_file: Optional[str],
             comment: Optional[str],
             writers: Set[str],
             ignore_callers: Set[str],
             tags: Optional[Set[str]]):
    """Create the experiment, its ``Run`` record and its logger writers.

    :param name: experiment name; defaults to the stem of ``python_file``.
        Required when the caller file starts with ``'<ipython'``.
    :param python_file: source file of the experiment; defaults to the file
        returned by ``get_caller_file(ignore_callers)``.
    :param comment: run comment; ``None`` becomes ``''``.  A comment set on
        ``global_params_singleton()`` takes precedence over this argument.
    :param writers: writer names; ``'sqlite'``, ``'tensorboard'`` and
        ``'web_api'`` are recognised here.
    :param ignore_callers: forwarded to ``get_caller_file``.
    :param tags: run tags; defaults to the parts of ``name`` split on
        ``'_'``.
    :raises ValueError: when running from a notebook without ``name``.
    """
    if python_file is None:
        python_file = get_caller_file(ignore_callers)

    if python_file.startswith('<ipython'):
        assert is_ipynb()
        if name is None:
            raise ValueError("You must specify python_file or experiment name"
                             " when creating an experiment from a python notebook.")
        lab_singleton().set_path(os.getcwd())
        python_file = 'notebook.ipynb'
    else:
        lab_singleton().set_path(python_file)
        if name is None:
            file_path = pathlib.PurePath(python_file)
            name = file_path.stem

    if comment is None:
        comment = ''
    if global_params_singleton().comment is not None:
        comment = global_params_singleton().comment

    self.name = name
    self.experiment_path = lab_singleton().experiments / name
    self.check_repo_dirty = lab_singleton().check_repo_dirty
    self.configs_processor = None

    # ``exist_ok=True`` replaces the exists()-then-mkdir() pair, which can
    # fail when another process creates the folder in between.
    pathlib.Path(self.experiment_path).mkdir(parents=True, exist_ok=True)

    if tags is None:
        tags = set(name.split('_'))

    self.run = Run.create(
        experiment_path=self.experiment_path,
        python_file=python_file,
        trial_time=time.localtime(),
        comment=comment,
        tags=list(tags))

    try:
        repo = git.Repo(lab_singleton().path)
        self.run.commit = repo.head.commit.hexsha
        self.run.commit_message = repo.head.commit.message.strip()
        self.run.is_dirty = repo.is_dirty()
        self.run.diff = repo.git.diff()
    except git.InvalidGitRepositoryError:
        # Outside a git repository the run is recorded with an unknown,
        # dirty commit; the warning is skipped on Colab and Kaggle.
        if not is_colab() and not is_kaggle():
            warnings.warn("Not a valid git repository",
                          UserWarning, stacklevel=4)
        self.run.commit = 'unknown'
        self.run.commit_message = ''
        self.run.is_dirty = True
        self.run.diff = ''

    logger_internal().reset_writers()

    if 'sqlite' in writers:
        from labml.internal.logger.writers import sqlite
        logger_internal().add_writer(
            sqlite.Writer(self.run.sqlite_path, self.run.artifacts_folder))
    if 'tensorboard' in writers:
        from labml.internal.logger.writers import tensorboard
        logger_internal().add_writer(
            tensorboard.Writer(self.run.tensorboard_log_path))
    if 'web_api' in writers:
        from labml.internal.logger.writers import web_api
        # Kept on the instance as well as registered with the logger.
        self.web_api = web_api.Writer()
        logger_internal().add_writer(self.web_api)
    else:
        self.web_api = None

    self.checkpoint_saver = CheckpointSaver(self.run.checkpoint_path)