Ejemplo n.º 1
0
    def _start_tracker(self):
        tracker().reset_writers()

        if self.is_evaluate:
            return

        if self.distributed_rank != 0:
            return

        if 'screen' in self.writers:
            from labml.internal.tracker.writers import screen
            tracker().add_writer(screen.ScreenWriter())

        if 'sqlite' in self.writers:
            from labml.internal.tracker.writers import sqlite
            tracker().add_writer(
                sqlite.Writer(self.run.sqlite_path, self.run.artifacts_folder))

        if 'tensorboard' in self.writers:
            from labml.internal.tracker.writers import tensorboard
            tracker().add_writer(
                tensorboard.Writer(self.run.tensorboard_log_path))

        if 'wandb' in self.writers:
            from labml.internal.tracker.writers import wandb
            self.wandb = wandb.Writer()
            tracker().add_writer(self.wandb)
        else:
            self.wandb = None

        if 'comet' in self.writers:
            from labml.internal.tracker.writers import comet
            self.comet = comet.Writer()
            tracker().add_writer(self.comet)
        else:
            self.comet = None

        if 'file' in self.writers:
            from labml.internal.tracker.writers import file
            tracker().add_writer(file.Writer(self.run.log_file))

        if 'web_api' in self.writers:
            web_api_conf = lab_singleton().web_api
            if web_api_conf is not None:
                from labml.internal.tracker.writers import web_api
                from labml.internal.api import ApiCaller
                from labml.internal.api.experiment import ApiExperiment
                api_caller = ApiCaller(web_api_conf.url,
                                       {'run_uuid': self.run.uuid},
                                       timeout_seconds=120)
                self.web_api = ApiExperiment(
                    api_caller,
                    frequency=web_api_conf.frequency,
                    open_browser=web_api_conf.open_browser)
                tracker().add_writer(
                    web_api.Writer(api_caller,
                                   frequency=web_api_conf.frequency))
        else:
            self.web_api = None
Ejemplo n.º 2
0
def find_experiment(run_uuid: str) -> Optional[str]:
    experiments_path = lab_singleton().experiments
    experiments = get_experiments(experiments_path)
    for exp_name in experiments:
        run_path = experiments_path / exp_name / run_uuid
        if Path(run_path).exists():
            return exp_name

    return None
Ejemplo n.º 3
0
    def start(self,
              *,
              run_uuid: Optional[str] = None,
              checkpoint: Optional[int] = None):
        if run_uuid is not None:
            if checkpoint is None:
                checkpoint = -1
            global_step = self.__start_from_checkpoint(run_uuid, checkpoint)
        else:
            global_step = 0

        self.run.start_step = global_step

        self._start_tracker()
        tracker().set_start_global_step(global_step)

        if self.distributed_rank == 0:
            self.__print_info()
            if self.check_repo_dirty and self.run.is_dirty:
                logger.log([
                    ("[FAIL]", Text.danger),
                    " Cannot trial an experiment with uncommitted changes."
                ])
                exit(1)

        if not self.is_evaluate:
            if self.distributed_rank == 0:
                from labml.internal.computer.configs import computer_singleton
                computer_singleton().add_project(lab_singleton().path)

                self.run.save_info()
            self._save_pid()

            if self.distributed_rank == 0:
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        FileConfigsSaver(self.run.configs_path))

                if self.web_api is not None:
                    self.web_api.start(self.run)
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.web_api.get_configs_saver())
                        self.web_api.set_dynamic_handler(
                            ExperimentDynamicUpdateHandler(
                                self.configs_processor))

                if self.wandb is not None:
                    self.wandb.init(self.run.name, self.run.run_path)
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.wandb.get_configs_saver())

                tracker().save_indicators(self.run.indicators_path)

        self.is_started = True
        return ExperimentWatcher(self)
Ejemplo n.º 4
0
 def reset_store(self):
     self.indicators = {}
     self.dot_indicators = {}
     self.__indicators_file = None
     self.namespaces = []
     self.is_indicators_updated = True
     try:
         for ind in lab_singleton().indicators:
             self.add_indicator(load_indicator_from_dict(ind))
     except LabYamlNotfoundError:
         pass
Ejemplo n.º 5
0
def get_configs(run_uuid: str):
    exp_name = find_experiment(run_uuid)
    if exp_name is None:
        logger.log("Couldn't find a previous run")
        return None

    run_path = lab_singleton().experiments / exp_name / run_uuid
    configs_path = run_path / "configs.yaml"
    configs = load_configs(configs_path)

    return configs
Ejemplo n.º 6
0
def _test():
    from labml.internal.computer.configs import computer_singleton
    from labml import lab
    from labml.internal.lab import lab_singleton
    import time

    lab_singleton().set_path(
        str(Path(os.path.abspath(__file__)).parent.parent.parent.parent))

    tb = TensorBoardStarter(computer_singleton().tensorboard_symlink_dir)

    # for k, v in os.environ.items():
    #     print(k, v)

    res = tb.start([
        lab.get_path() / 'logs' / 'sample' /
        '68233e98cb5311eb9aa38d17b08f3a1d',
    ])

    print(res)

    time.sleep(100)
Ejemplo n.º 7
0
def get_run_checkpoint(run_uuid: str, checkpoint: int = -1):
    exp_name = find_experiment(run_uuid)
    if exp_name is None:
        logger.log("Couldn't find a previous run")
        return None, None

    run_path = lab_singleton().experiments / exp_name / run_uuid

    checkpoint = _get_run_checkpoint(run_path, checkpoint)

    if checkpoint is None:
        logger.log("Couldn't find checkpoints")
        return None, None

    logger.log([
        "Selected ", ("experiment", Text.key), " = ", (exp_name, Text.value),
        " ", ("run", Text.key), " = ", (run_uuid, Text.value), " ",
        ("checkpoint", Text.key), " = ", (str(checkpoint), Text.value)
    ])

    checkpoint_path = run_path / "checkpoints"
    return checkpoint_path / str(checkpoint), checkpoint
Ejemplo n.º 8
0
    def __init__(self, *,
                 uuid: str,
                 name: Optional[str],
                 python_file: Optional[str],
                 comment: Optional[str],
                 writers: Set[str],
                 ignore_callers: Set[str],
                 tags: Optional[Set[str]],
                 is_evaluate: bool):

        if is_ipynb():
            lab_singleton().set_path(os.getcwd())
            if python_file is None:
                python_file = 'notebook.ipynb'
            if name is None:
                name = 'Notebook Experiment'
        else:
            if python_file is None:
                python_file = get_caller_file(ignore_callers)

            lab_singleton().set_path(python_file)

            if name is None:
                file_path = pathlib.PurePath(python_file)
                name = file_path.stem

        if comment is None:
            comment = ''
        if global_params_singleton().comment is not None:
            comment = global_params_singleton().comment

        self.experiment_path = lab_singleton().experiments / name

        self.check_repo_dirty = lab_singleton().check_repo_dirty

        self.configs_processor = None

        if tags is None:
            tags = set(name.split('_'))

        self.run = Run.create(
            uuid=uuid,
            experiment_path=self.experiment_path,
            python_file=python_file,
            trial_time=time.localtime(),
            name=name,
            comment=comment,
            tags=list(tags))

        try:
            repo = git.Repo(lab_singleton().path)

            self.run.repo_remotes = list(repo.remote().urls)
            self.run.commit = repo.head.commit.hexsha
            self.run.commit_message = repo.head.commit.message.strip()
            self.run.is_dirty = repo.is_dirty()
            self.run.diff = repo.git.diff()
        except git.InvalidGitRepositoryError:
            if not is_colab() and not is_kaggle():
                labml_notice(["Not a valid git repository: ",
                              (lab_singleton().path, Text.value)])
            self.run.commit = 'unknown'
            self.run.commit_message = ''
            self.run.is_dirty = True
            self.run.diff = ''

        self.checkpoint_saver = CheckpointSaver(self.run.checkpoint_path)
        self.is_evaluate = is_evaluate
        self.web_api = None
        self.writers = writers
        self.is_started = False
        self.distributed_rank = 0
        self.distributed_world_size = -1
Ejemplo n.º 9
0
    def __init__(self, *,
                 name: Optional[str],
                 python_file: Optional[str],
                 comment: Optional[str],
                 writers: Set[str],
                 ignore_callers: Set[str],
                 tags: Optional[Set[str]]):
        if python_file is None:
            python_file = get_caller_file(ignore_callers)

        if python_file.startswith('<ipython'):
            assert is_ipynb()
            if name is None:
                raise ValueError("You must specify python_file or experiment name"
                                 " when creating an experiment from a python notebook.")

            lab_singleton().set_path(os.getcwd())
            python_file = 'notebook.ipynb'
        else:
            lab_singleton().set_path(python_file)

            if name is None:
                file_path = pathlib.PurePath(python_file)
                name = file_path.stem

        if comment is None:
            comment = ''

        self.name = name
        self.experiment_path = lab_singleton().experiments / name

        self.check_repo_dirty = lab_singleton().check_repo_dirty

        self.configs_processor = None

        experiment_path = pathlib.Path(self.experiment_path)
        if not experiment_path.exists():
            experiment_path.mkdir(parents=True)

        if tags is None:
            tags = set(name.split('_'))

        self.run = Run.create(
            experiment_path=self.experiment_path,
            python_file=python_file,
            trial_time=time.localtime(),
            comment=comment,
            tags=list(tags))

        repo = git.Repo(lab_singleton().path)

        self.run.commit = repo.head.commit.hexsha
        self.run.commit_message = repo.head.commit.message.strip()
        self.run.is_dirty = repo.is_dirty()
        self.run.diff = repo.git.diff()

        logger_internal().reset_writers()

        if 'sqlite' in writers:
            from labml.internal.logger.writers import sqlite
            artifacts_folder = pathlib.Path(self.run.artifacts_folder)
            if not artifacts_folder.exists():
                artifacts_folder.mkdir(parents=True)
            logger_internal().add_writer(
                sqlite.Writer(self.run.sqlite_path, self.run.artifacts_folder))
        if 'tensorboard' in writers:
            from labml.internal.logger.writers import tensorboard
            logger_internal().add_writer(tensorboard.Writer(self.run.tensorboard_log_path))

        self.checkpoint_saver = None
Ejemplo n.º 10
0
    def __init__(self, *,
                 name: Optional[str],
                 python_file: Optional[str],
                 comment: Optional[str],
                 writers: Set[str],
                 ignore_callers: Set[str],
                 tags: Optional[Set[str]]):
        if python_file is None:
            python_file = get_caller_file(ignore_callers)

        if python_file.startswith('<ipython'):
            assert is_ipynb()
            if name is None:
                raise ValueError("You must specify python_file or experiment name"
                                 " when creating an experiment from a python notebook.")

            lab_singleton().set_path(os.getcwd())
            python_file = 'notebook.ipynb'
        else:
            lab_singleton().set_path(python_file)

            if name is None:
                file_path = pathlib.PurePath(python_file)
                name = file_path.stem

        if comment is None:
            comment = ''
        if global_params_singleton().comment is not None:
            comment = global_params_singleton().comment

        self.name = name
        self.experiment_path = lab_singleton().experiments / name

        self.check_repo_dirty = lab_singleton().check_repo_dirty

        self.configs_processor = None

        experiment_path = pathlib.Path(self.experiment_path)
        if not experiment_path.exists():
            experiment_path.mkdir(parents=True)

        if tags is None:
            tags = set(name.split('_'))

        self.run = Run.create(
            experiment_path=self.experiment_path,
            python_file=python_file,
            trial_time=time.localtime(),
            comment=comment,
            tags=list(tags))

        try:
            repo = git.Repo(lab_singleton().path)

            self.run.commit = repo.head.commit.hexsha
            self.run.commit_message = repo.head.commit.message.strip()
            self.run.is_dirty = repo.is_dirty()
            self.run.diff = repo.git.diff()
        except git.InvalidGitRepositoryError:
            if not is_colab() and not is_kaggle():
                warnings.warn(f"Not a valid git repository",
                              UserWarning, stacklevel=4)
            self.run.commit = 'unknown'
            self.run.commit_message = ''
            self.run.is_dirty = True
            self.run.diff = ''

        logger_internal().reset_writers()

        if 'sqlite' in writers:
            from labml.internal.logger.writers import sqlite
            logger_internal().add_writer(
                sqlite.Writer(self.run.sqlite_path, self.run.artifacts_folder))
        if 'tensorboard' in writers:
            from labml.internal.logger.writers import tensorboard
            logger_internal().add_writer(tensorboard.Writer(self.run.tensorboard_log_path))
        if 'web_api' in writers:
            from labml.internal.logger.writers import web_api
            self.web_api = web_api.Writer()
            logger_internal().add_writer(self.web_api)
        else:
            self.web_api = None

        self.checkpoint_saver = CheckpointSaver(self.run.checkpoint_path)