Example 1
    def start(self):
        if self.web_api is None:
            return

        data = {
            'run_uuid': self.run_uuid,
            'name': self.name,
            'comment': self.comment,
            'time': time.time(),
            'configs': {}
        }

        if self.configs is not None:
            for k, v in self.configs.items():
                data['configs'][k] = v

        self.last_committed = time.time()
        self.commits_count = 0
        url = self.send(data)
        if url is None:
            return None

        logger.log([('Monitor experiment at ', Text.meta),
                    (url, Text.highlight)])
        if self.web_api.open_browser:
            webbrowser.open(url)
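These examples appear to be drawn from the labml library. As a hedged, self-contained sketch of the colored logger.log call pattern above (the URL value is a hypothetical placeholder, not output from a real run):

from labml import logger
from labml.logger import Text

# A list of (text, style) tuples renders each part in its own style
url = 'https://app.labml.ai/run/0123'  # hypothetical placeholder
logger.log([('Monitor experiment at ', Text.meta),
            (url, Text.highlight)])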
Example 2
    def calc_configs_dict(self,
                          configs: Dict[str, Any],
                          configs_override: Optional[Dict[str, Any]]):
        self.configs_processor = ConfigProcessorDict(configs, configs_override)
        self.configs_processor()

        logger.log()
Example 3
    def sample(self):
        """
        ### Sampling function to generate samples periodically while training
        """

        # Starting prompt
        prompt = 'It is'
        # Collect output for printing
        log = [(prompt, Text.subtle)]
        # Sample 25 tokens
        for i in monit.iterate('Sample', 25):
            # Tokenize the prompt
            data = self.dataset.text_to_i(prompt).unsqueeze(-1)
            data = data.to(self.device)
            # Get the model output
            output = self.model(data)
            # Get the model prediction (greedy)
            output = output.argmax(dim=-1).squeeze()
            # Add the prediction to prompt
            prompt += self.dataset.itos[output[-1].item()]
            # Add the prediction for logging
            log += [(self.dataset.itos[output[-1].item()], Text.value)]

        # Print the sampled output
        logger.log(log)
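A minimal sketch of the monit.iterate progress loop used above, assuming only the labml package (the sleep stands in for one decoding step):

import time

from labml import logger, monit
from labml.logger import Text

log = [('sampled: ', Text.subtle)]
# monit.iterate shows a progress bar named 'Sample' over 25 steps
for i in monit.iterate('Sample', 25):
    time.sleep(0.01)  # placeholder for real work
    log += [(f'{i} ', Text.value)]
logger.log(log)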
Example 4
    def _started(self, url):
        if url is None:
            return None

        logger.log([('Monitor experiment at ', Text.meta), (url, Text.link)])
        if self.open_browser:
            webbrowser.open(url)
Example 5
    def run(self):
        epoch = 1
        while epoch <= self.epochs():
            self.train()
            self.validate()
            logger.log()
            epoch += 1
Example 6
    def sample(self):
        """
        ### Sampling function to generate samples periodically while training
        """

        # Starting prompt
        prompt = self.prompt
        # Collect output for printing
        log = [(prompt, Text.subtle)]
        # memory
        mem = []
        # Sample 25 tokens
        for i in monit.iterate('Sample', 25):
            # Tokenize the prompt
            data = self.text.text_to_i(prompt).unsqueeze(-1)
            # Move to device
            data = data.to(self.device)
            # Get the model output
            output, new_mem = self.model(data, mem)
            # Get the model prediction (greedy)
            output = output.argmax(dim=-1).squeeze(1)
            # Add the prediction to prompt
            prompt += self.prompt_separator + self.text.itos[output[-1]]
            # Only feed the last character to model in next iteration, rest will go in as memories
            prompt = prompt[-1:]
            # Add the prediction for logging
            log += [(self.prompt_separator + self.text.itos[output[-1]],
                     Text.value)]
            # Update memory
            mem = self.merge_memory(mem, new_mem)

        # Print the sampled output
        logger.log(log)
Example 7
    def solve(self):
        for t in monit.loop(self.epochs):
            if not self.is_online_update:
                for I in self.info_sets.values():
                    I.clear()
            for i in range(self.n_players):
                self.cfr(self.create_new_history(), cast(Player, i),
                         [1 for _ in range(self.n_players)])
            if not self.is_online_update:
                self.update()
            with monit.section("Track"):
                for I in self.info_sets.values():
                    for a in I.actions():
                        tracker.add({
                            f'strategy.{I.key}.{a}': I.strategy[a],
                            f'average_strategy.{I.key}.{a}': I.average_strategy[a],
                            f'regret.{I.key}.{a}': I.regret[a],
                            f'current_regret.{I.key}.{a}': I.current_regret[a]
                        })

            if t % self.track_frequency == 0:
                tracker.save()
                logger.log()

            if (t + 1) % self.save_frequency == 0:
                experiment.save_checkpoint()

        logger.inspect(self.info_sets)
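Example 7 combines several labml facilities; the tracker.add / tracker.save pattern it uses can be sketched on its own. A hedged, console-only sketch (the experiment name and indicator name are hypothetical):

from labml import experiment, logger, monit, tracker

# A screen-only experiment avoids any server dependency
experiment.create(name='tracker_demo', writers={'screen'})
with experiment.start():
    for t in monit.loop(10):
        # Queue indicator values under dotted names, then flush them
        tracker.add({'loss.demo': 1.0 / (t + 1)})
        tracker.save()
        logger.log()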
Example 8
def load_bundle(path: Path, url: Optional[str] = None) -> Tuple[str, int]:
    if url:
        download_file(url, path)

    if not path.exists():
        raise FileNotFoundError(f'Bundle archive missing: {path}')

    with monit.section('Extract bundle'):
        with tarfile.open(str(path), 'r:gz') as tar:
            files = tar.getmembers()
            info_member = None
            for f in files:
                if f.name == 'info.json':
                    info_member = f

            if not info_member:
                raise RuntimeError(f"Corrupted bundle. Missing info.json")

            with tar.extractfile(info_member) as ef:
                info = json.load(ef)

            run_uuid, checkpoint = info['uuid'], info['checkpoint']
            run_path = get_run_by_uuid(lab.get_experiments_path(), run_uuid)

            if run_path is not None:
                logger.log(f"Run {run_uuid} exists", Text.meta)
                current_checkpoint = _get_run_checkpoint(run_path, checkpoint)
                if checkpoint == current_checkpoint:
                    logger.log(f"Checkpoint {checkpoint} exists", Text.meta)
                    return run_uuid, checkpoint

            run_path = lab.get_experiments_path() / 'bundled' / run_uuid

            checkpoint_path = run_path / "checkpoints" / str(checkpoint)
            if not checkpoint_path.exists():
                checkpoint_path.mkdir(parents=True)

            data_path = lab.get_data_path()
            if not data_path.exists():
                data_path.mkdir(parents=True)

            for f in files:
                if f.name == 'run.yaml':
                    _extract_tar_file(tar, f, run_path / 'run.yaml')
                elif f.name == 'configs.yaml':
                    _extract_tar_file(tar, f, run_path / 'configs.yaml')
                elif f.name.startswith('checkpoint/'):
                    p = f.name[len('checkpoint/'):]
                    p = checkpoint_path / p
                    if not p.parent.exists():
                        p.parent.mkdir(parents=True)
                    _extract_tar_file(tar, f, p)
                elif f.name.startswith('data/'):
                    p = f.name[len('data/'):]
                    p = data_path / p
                    if not p.parent.exists():
                        p.parent.mkdir(parents=True)
                    _extract_tar_file(tar, f, p)

            return run_uuid, checkpoint
Example 9
def _read_file(path: Path) -> Optional[List[int]]:
    """
    Read and encode a file
    """
    try:
        with open(str(path)) as f:
            content = f.read()

        parsed = parse_string(content)
        parsed = _remove_comments(parsed)
        parsed = _remove_empty_lines(parsed)
        parsed = _fix_indentation(parsed)
        serialized = encode(parsed)

        # deserialized = tokenizer.deserialize(serialized)
        # for i in range(len(serialized)):
        #     assert deserialized[i] == parsed[i]
        #
        # res = to_text(deserialized)
        # print(res)

        return serialized
    except Exception:
        logger.log()
        return None
Example 10
def complete(predictor: Predictor, text: str, completion: int):
    line_no = 1
    logs = [(f"{line_no: 4d}: ", Text.meta), (text[0], Text.subtle)]

    i = 0
    given = len(text)

    while i + 1 < given + completion:
        if len(text) > i + 1:
            c = text[i + 1]
        else:
            c = predictor.get_next_char(text[:i + 1])

        if c == '\n':
            logger.log(logs)
            line_no += 1
            logs = [(f"{line_no: 4d}: ", Text.meta)]
        elif c != '\r':
            if len(text) > i + 1:
                logs.append(c)
            else:
                logs.append((c, [Style.bold]))

        if len(text) <= i + 1:
            text += c

        i += 1

    logger.log(logs)
Example 11
    def __print_info(self):
        """
        🖨 Print the experiment info and check git repo status
        """

        logger.log()
        logger.log([
            (self.name, Text.title),
            ': ',
            (str(self.run.uuid), Text.meta)
        ])

        if self.run.comment != '':
            logger.log(['\t', (self.run.comment, Text.highlight)])

        commit_message = self.run.commit_message.strip().replace('\n', '¶ ').replace('\r', '')
        logger.log([
            "\t"
            "[dirty]" if self.run.is_dirty else "[clean]",
            ": ",
            (f"\"{commit_message}\"", Text.highlight)
        ])

        if self.run.load_run is not None:
            logger.log([
                "\t"
                "loaded from",
                ": ",
                (f"{self.run.load_run}", Text.meta2),
            ])
Example 12
def _launch(args: List[str]):
    import sys
    import os

    if 'RUN_UUID' not in os.environ:
        os.environ['RUN_UUID'] = experiment.generate_uuid()

    cwd = os.getcwd()
    if 'PYTHONPATH' in os.environ:
        python_path = os.environ['PYTHONPATH']
        print(python_path)
        os.environ['PYTHONPATH'] = f"{python_path}:{cwd}:{cwd}/src"
    else:
        os.environ['PYTHONPATH'] = f"{cwd}:{cwd}/src"

    cmd = [sys.executable, '-u', '-m', 'torch.distributed.launch', *args]
    print(cmd)
    try:
        process = subprocess.Popen(cmd, env=os.environ)
        process.wait()
    except Exception as e:
        logger.log('Error starting launcher', Text.danger)
        raise e

    if process.returncode != 0:
        logger.log('Launcher failed', Text.danger)
        raise subprocess.CalledProcessError(returncode=process.returncode,
                                            cmd=cmd)
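The PYTHONPATH handling in Example 12 is a general pattern; a standalone sketch without the labml specifics (the child command is a placeholder):

import os
import subprocess
import sys

# Append the working directory and ./src to any existing PYTHONPATH
cwd = os.getcwd()
extra = f"{cwd}:{cwd}/src"
if 'PYTHONPATH' in os.environ:
    os.environ['PYTHONPATH'] = f"{os.environ['PYTHONPATH']}:{extra}"
else:
    os.environ['PYTHONPATH'] = extra

# Launch a child interpreter with the augmented environment
cmd = [sys.executable, '-c', 'import os; print(os.environ["PYTHONPATH"])']
subprocess.run(cmd, env=os.environ, check=True)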
Example 13
    def calc_configs(self,
                     configs: Configs,
                     configs_override: Optional[Dict[str, Any]],
                     run_order: Optional[List[Union[List[str], str]]]):
        self.configs_processor = ConfigProcessor(configs, configs_override)
        self.configs_processor(run_order)

        logger.log()
Example 14
    def start(self,
              *,
              run_uuid: Optional[str] = None,
              checkpoint: Optional[int] = None):
        if run_uuid is not None:
            if checkpoint is None:
                checkpoint = -1
            global_step = self.__start_from_checkpoint(run_uuid, checkpoint)
        else:
            global_step = 0

        self.run.start_step = global_step

        self._start_tracker()
        tracker().set_start_global_step(global_step)

        if self.distributed_rank == 0:
            self.__print_info()
            if self.check_repo_dirty and self.run.is_dirty:
                logger.log([
                    ("[FAIL]", Text.danger),
                    " Cannot trial an experiment with uncommitted changes."
                ])
                exit(1)

        if not self.is_evaluate:
            if self.distributed_rank == 0:
                from labml.internal.computer.configs import computer_singleton
                computer_singleton().add_project(lab_singleton().path)

                self.run.save_info()
            self._save_pid()

            if self.distributed_rank == 0:
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        FileConfigsSaver(self.run.configs_path))

                if self.web_api is not None:
                    self.web_api.start(self.run)
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.web_api.get_configs_saver())
                        self.web_api.set_dynamic_handler(
                            ExperimentDynamicUpdateHandler(
                                self.configs_processor))

                if self.wandb is not None:
                    self.wandb.init(self.run.name, self.run.run_path)
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.wandb.get_configs_saver())

                tracker().save_indicators(self.run.indicators_path)

        self.is_started = True
        return ExperimentWatcher(self)
Example 15
def _start_app_server():
    try:
        import labml_app
    except (ImportError, ModuleNotFoundError):
        logger.log("Cannot import ", ('labml_app', Text.highlight), '.')
        logger.log('Install with ', ('pip install labml-app', Text.value))
        return

    labml_app.start_server()
Example 16
def _open_dashboard():
    try:
        import labml_dashboard
    except (ImportError, ModuleNotFoundError):
        logger.log("Cannot import ", ('labml_dashboard', Text.highlight), '.')
        logger.log('Install with ',
                   ('pip install labml_dashboard', Text.value))
        return

    labml_dashboard.start_server()
Example 17
    def handler(self, sig, frame):
        # Pass second interrupt without delaying
        if self.signal_received is not None:
            self.old_handler(*self.signal_received)
            return

        # Store the interrupt signal for later
        self.signal_received = (sig, frame)
        logger.log([('\nSIGINT received. Delaying KeyboardInterrupt.',
                     Text.danger)])
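The handler in Example 17 is one half of a delayed-KeyboardInterrupt guard; a standalone sketch of the full pattern as a context manager (this helper is hypothetical, not part of labml's public API):

import signal

class DelayedKeyboardInterrupt:
    def __enter__(self):
        self.signal_received = None
        self.old_handler = signal.signal(signal.SIGINT, self.handler)
        return self

    def handler(self, sig, frame):
        if self.signal_received is not None:
            # Second interrupt: defer to the original handler immediately
            self.old_handler(*self.signal_received)
            return
        # Store the first interrupt for later
        self.signal_received = (sig, frame)

    def __exit__(self, exc_type, exc_val, exc_tb):
        signal.signal(signal.SIGINT, self.old_handler)
        if self.signal_received is not None:
            # Re-raise the deferred interrupt once the critical section ends
            self.old_handler(*self.signal_received)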
Example 18
    def __finish(self):
        try:
            signal.signal(signal.SIGINT, self.old_handler)
        except ValueError:
            pass
        tracker.save()
        tracker.new_line()
        if self.__is_save_models:
            logger.log("Saving model...")
            experiment.save_checkpoint()
Example 19
    def __load_configs(self):
        if self.config_folder.is_file():
            self.config_folder.unlink()

        if not self.config_folder.exists():
            self.config_folder.mkdir(parents=True)

        if not self.projects_folder.exists():
            self.projects_folder.mkdir()

        if not self.app_folder.exists():
            self.app_folder.mkdir()

        if not self.runs_cache.exists():
            self.runs_cache.mkdir()

        if self.configs_file.exists():
            with open(str(self.configs_file)) as f:
                config = util.yaml_load(f.read())
                if config is None:
                    config = {}
        else:
            logger.log([('~/labml/configs.yaml', Text.value),
                        ' does not exist. Creating ',
                        (str(self.configs_file), Text.meta)])
            config = {}

        if 'uuid' not in config:
            from uuid import uuid1
            config['uuid'] = uuid1().hex
            with open(str(self.configs_file), 'w') as f:
                f.write(util.yaml_dump(config))

        default_config = self.__default_config()
        for k, v in default_config.items():
            if k not in config:
                config[k] = v

        self.uuid = config['uuid']
        web_api_url = config['web_api']
        if not web_api_url.startswith('http'):
            web_api_url = f"https://api.labml.ai/api/v1/computer?labml_token={web_api_url}&"
        self.web_api = WebAPIConfigs(
            url=web_api_url,
            frequency=config['web_api_frequency'],
            verify_connection=config['web_api_verify_connection'],
            open_browser=config['web_api_open_browser'],
            is_default=web_api_url == self.__default_config()['web_api'])
        self.web_api_sync = config['web_api_sync']
        self.web_api_polling = config['web_api_polling']

        self.tensorboard_port = config['tensorboard_port']
        self.tensorboard_visible_port = config['tensorboard_visible_port']
        self.tensorboard_host = config['tensorboard_host']
        self.tensorboard_protocol = config['tensorboard_protocol']
Example 20
def main():
    # Configurations
    configs = {
        'epochs': 10,
        'train_batch_size': 64,
        'valid_batch_size': 100,
        'use_cuda': True,
        'seed': 5,
        'train_log_interval': 10,
        'learning_rate': 0.01,
    }

    is_cuda = configs['use_cuda'] and torch.cuda.is_available()
    if not is_cuda:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:0")

    train_loader = torch.utils.data.DataLoader(
        RemoteDataset('mnist_train'),
        batch_size=configs['train_batch_size'],
        shuffle=True,
        num_workers=4)

    valid_loader = torch.utils.data.DataLoader(
        RemoteDataset('mnist_valid'),
        batch_size=configs['valid_batch_size'],
        shuffle=False,
        num_workers=4)

    # Seed before building the model so weight initialization is reproducible
    torch.manual_seed(configs['seed'])

    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=configs['learning_rate'])

    # ✨ Create the experiment
    experiment.create(name='mnist_labml_monit')

    # ✨ Save configurations
    experiment.configs(configs)

    # ✨ Set PyTorch models for checkpoint saving and loading
    experiment.add_pytorch_models(dict(model=model))

    # ✨ Start and monitor the experiment
    with experiment.start():
        for _ in monit.loop(range(1, configs['epochs'] + 1)):
            train(model, optimizer, train_loader, device,
                  configs['train_log_interval'])
            validate(model, valid_loader, device)
            logger.log()

    # save the model
    experiment.save_checkpoint()
Example 21
def job_rsync(server: str, delay: int, show_output: bool):
    """RSync job outputs from server"""
    for k in util.get_servers(server):
        SERVERS[k].rsync_jobs(
            ui_mode=UIMode.full if show_output else UIMode.dots)
    if delay > 0:
        while True:
            logger.log('Watching...', Text.meta)
            time.sleep(delay)
            for k in util.get_servers(server):
                SERVERS[k].rsync_jobs()
Example 22
def evaluate(predictor: Predictor, text: str):
    line_no = 1
    logs = [(f"{line_no: 4d}: ", Text.meta), (text[0], Text.subtle)]

    correct = 0
    i = 0
    key_strokes = 0

    while i + 1 < len(text):
        prefix = text[:i + 1]
        stripped, prompt = predictor.rstrip(prefix)
        rest = prefix[len(stripped):]
        prediction_complete = NextWordPredictionComplete(rest, 5)
        prompt = torch.tensor(prompt, dtype=torch.long).unsqueeze(-1)

        predictions = predictor.get_next_word(prompt, None, rest, [1.],
                                              prediction_complete, 5)
        predictions.sort(key=lambda x: -x[0])
        if predictions:
            next_token = predictions[0].text[len(rest):]
        else:
            next_token = ''

        if next_token and next_token == text[i + 1:i + 1 + len(next_token)]:
            correct += len(next_token)
            right = True
        else:
            next_token = text[i + 1]
            right = False

        for j, c in enumerate(next_token):
            if c == '\n':
                logger.log(logs)
                line_no += 1
                logs = [(f"{line_no: 4d}: ", Text.meta)]
            elif c == '\r':
                continue
            else:
                if right:
                    if j == 0:
                        logs.append((c, [Text.meta, Style.underline]))
                    else:
                        logs.append((c, [Text.success, Style.underline]))
                else:
                    logs.append((c, [Text.warning]))

        i += len(next_token)
        key_strokes += 1

    logger.log(logs)

    logger.inspect(accuracy=correct / (len(text) - 1),
                   key_strokes=key_strokes,
                   length=len(text))
Example 23
    def calc_configs(self, configs: Union[Configs, Dict[str, Any]],
                     configs_override: Optional[Dict[str, Any]]):
        if configs_override is None:
            configs_override = {}
        if global_params_singleton().configs is not None:
            configs_override.update(global_params_singleton().configs)

        self.configs_processor = ConfigProcessor(configs, configs_override)

        if self.distributed_rank == 0:
            logger.log()
Example 24
def device(c: Configs):
    is_cuda = c.use_cuda and torch.cuda.is_available()
    if not is_cuda:
        return torch.device('cpu')
    else:
        if c.cuda_device < torch.cuda.device_count():
            return torch.device('cuda', c.cuda_device)
        else:
            logger.log(f"Cuda device index {c.cuda_device} higher than "
                       f"device count {torch.cuda.device_count()}", Text.warning)
            return torch.device('cuda', torch.cuda.device_count() - 1)
Example 25
def get_configs(run_uuid: str):
    exp_name = find_experiment(run_uuid)
    if exp_name is None:
        logger.log("Couldn't find a previous run")
        return None

    run_path = lab_singleton().experiments / exp_name / run_uuid
    configs_path = run_path / "configs.yaml"
    configs = load_configs(configs_path)

    return configs
Example 26
    def _print_artifacts_list(table: Dict[str, int],
                              artifacts: Dict[str, Artifact]):
        order = list(table.keys())
        if not order:
            return

        keys = {k for name in order for k in artifacts[name].keys()}
        for k in keys:
            for name in order:
                value = artifacts[name].get_string(k, artifacts)
                logger.log([(name, Text.key), ": ", (value, Text.value)])
Example 27
    def run(self):
        tracker.set_text('text_artifact', is_print=True)
        tracker.set_indexed_text('ti', is_print=True)
        tracker.set_indexed_text('other', is_print=True)
        for i in monit.loop(self.epochs):
            tracker.add('text_artifact', f'sample {i}')
            for j in range(5):
                tracker.add('ti', (f'{j}', 'text' * 5 + f'text {i} {j}'))
                tracker.add('other', (f'{j}', f'other {j}'))

            tracker.save()
            logger.log()
Example 28
def test_nvidia_device(idx: int):
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)

    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)

    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }

    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),

        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )

    logger.log()

    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    logger.log()
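Example 28 leans on labml's inspect helper; a hedged sketch of that call on plain values (the names and numbers are made up):

from labml import logger

# Keyword arguments are printed as aligned key-value rows
logger.inspect(name='demo-gpu', mem_total=16_000_000_000, mem_used=1_234_567)
# A dictionary works the same way
logger.inspect({'util_gpu': 42, 'temp': 55})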
Example 29
    def eval(self):
        keys_saved = 0

        for line, content in enumerate(self.__content):
            # Keep reference to rest of the line
            rest_of_line = content

            # Build the line for logging with colors
            # The line number
            logs = [(f"{line: 4d}: ", Text.meta)]

            # Type the line character by character
            while rest_of_line != '':
                suggestion = self.__predictor.get_suggestion()

                # If suggestion matches
                if suggestion != '' and rest_of_line.startswith(suggestion):
                    # Log
                    logs.append((suggestion[0], [Style.underline,
                                                 Text.danger]))
                    logs.append((suggestion[1:], Style.underline))

                    keys_saved += len(suggestion) - 1

                    # Skip the prediction text
                    rest_of_line = rest_of_line[len(suggestion):]

                    # Add text to the predictor
                    self.__predictor.add(suggestion)

                # If the suggestion doesn't match
                else:
                    # Add the next character
                    self.__predictor.add(rest_of_line[0])
                    logs.append((rest_of_line[0], Text.subtle))
                    rest_of_line = rest_of_line[1:]

            # Add a new line
            self.__predictor.add("\n")

            # Log the line
            logger.log(logs)

        # Log time taken for the file
        logger.inspect(add=self.__predictor.time_add,
                       check=self.__predictor.time_check,
                       predict=self.__predictor.time_predict)

        total_keys = sum([len(c) for c in self.__content])
        logger.inspect(keys_saved=keys_saved,
                       percentage_saved=100 * keys_saved / total_keys,
                       total_keys=total_keys,
                       total_lines=len(self.__content))
Example 30
    def start(self,
              *,
              run_uuid: Optional[str] = None,
              checkpoint: Optional[int] = None):
        if run_uuid is not None:
            if checkpoint is None:
                checkpoint = -1
            global_step = self.__start_from_checkpoint(run_uuid, checkpoint)
        else:
            global_step = 0

        self.run.start_step = global_step

        self._start_tracker()
        tracker().set_start_global_step(global_step)

        if self.distributed_rank == 0:
            self.__print_info()
            if self.check_repo_dirty and self.run.is_dirty:
                logger.log([
                    ("[FAIL]", Text.danger),
                    " Cannot trial an experiment with uncommitted changes."
                ])
                exit(1)

        if not self.is_evaluate:
            if self.distributed_rank == 0:
                self.run.save_info()
            self._save_pid()

            if self.distributed_rank == 0:
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        FileConfigsSaver(self.run.configs_path))

                if self.web_api is not None:
                    self.web_api.set_info(run_uuid=self.run.uuid,
                                          name=self.run.name,
                                          comment=self.run.comment)
                    self.web_api.start()
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.web_api.get_configs_saver())

                tracker().save_indicators(self.run.indicators_path)

                # PERF: Writing to tensorboard takes about 4 seconds
                # Also won't work when configs are updated live
                # if self.configs_processor:
                #     tracker().write_h_parameters(self.configs_processor.get_hyperparams())

        self.is_started = True
        return ExperimentWatcher(self)