Esempio n. 1
0
def run():
    """Start monitoring this computer and keep tracking until interrupted.

    Refuses to start when ``get_running_process`` reports a PID. Otherwise
    it writes a fresh session id to ``session.txt`` and this process's PID
    to ``monitor.pid`` in the computer config folder, starts a
    ``MonitorComputer`` with basic OS/CPU information, and then loops
    forever, tracking and sleeping 60 seconds between passes.

    Raises:
        RuntimeError: if a monitoring process is already running.
    """
    from uuid import uuid1

    running_pid = get_running_process()
    if running_pid:
        raise RuntimeError(
            f'This computer is already being monitored. PID: {running_pid}')

    session_uuid = uuid1().hex

    session_file = computer_singleton().config_folder / 'session.txt'
    with open(str(session_file), 'w') as out:
        out.write(session_uuid)

    pid_file = computer_singleton().config_folder / 'monitor.pid'
    with open(str(pid_file), 'w') as out:
        out.write(str(os.getpid()))

    computer_monitor = monitor.MonitorComputer(session_uuid)

    # Static machine description sent once when monitoring starts.
    system_info = {
        'os': monitor.get_os(),
        'cpu.logical': psutil.cpu_count(),
        'cpu.physical': psutil.cpu_count(logical=False),
    }
    computer_monitor.start(system_info)

    while True:
        with monit.section('Track'):
            computer_monitor.track()
        time.sleep(60)
Esempio n. 2
0
def run(is_check_process: bool = True, open_browser: bool = True):
    """Start monitoring this computer with a gradually slowing poll interval.

    Args:
        is_check_process: When true, refuse to start if
            ``get_running_process`` reports a PID.
        open_browser: Forwarded to ``MonitorComputer``.

    Raises:
        RuntimeError: if ``is_check_process`` is set and a monitoring
            process is already running.
    """
    from uuid import uuid1

    # The running process is looked up even when the check is disabled.
    running_pid = get_running_process()
    if running_pid and is_check_process:
        raise RuntimeError(
            f'This computer is already being monitored. PID: {running_pid}')

    session_uuid = uuid1().hex

    session_file = computer_singleton().config_folder / 'session.txt'
    with open(str(session_file), 'w') as out:
        out.write(session_uuid)

    pid_file = computer_singleton().config_folder / 'monitor.pid'
    with open(str(pid_file), 'w') as out:
        out.write(str(os.getpid()))

    computer_monitor = monitor.MonitorComputer(session_uuid, open_browser)
    computer_monitor.start()

    iteration = 0
    while True:
        with monit.section('Track', is_new_line=False):
            computer_monitor.track()

        # Delay starts at 1 second and grows with the iteration count,
        # capped at 60 seconds.
        delay = min(60.0, max(1.0, iteration / 5.0))
        time.sleep(delay)
        iteration += 1
Esempio n. 3
0
    def __init__(self):
        """Create the polling API caller and start with no results."""
        from labml.internal.computer.projects.api import DirectApiCaller
        from labml.internal.computer.configs import computer_singleton

        polling_api = computer_singleton().web_api_polling
        request_params = {'computer_uuid': computer_singleton().uuid}

        self.caller = DirectApiCaller(polling_api,
                                      request_params,
                                      timeout_seconds=60)
        self.results = []
Esempio n. 4
0
    def start(self,
              *,
              run_uuid: Optional[str] = None,
              checkpoint: Optional[int] = None):
        """Start the experiment run and return a watcher for it.

        Args:
            run_uuid: When given, the starting global step is taken from
                ``__start_from_checkpoint(run_uuid, checkpoint)``; otherwise
                the run starts at step ``0``.
            checkpoint: Checkpoint passed to the loader. ``-1`` is used when
                it is omitted; it is only consulted when ``run_uuid`` is set.

        Returns:
            An ``ExperimentWatcher`` wrapping this experiment.

        Raises:
            SystemExit: with exit code ``1`` on rank 0 when
                ``check_repo_dirty`` is set and the run is dirty.
        """
        if run_uuid is not None:
            if checkpoint is None:
                checkpoint = -1
            global_step = self.__start_from_checkpoint(run_uuid, checkpoint)
        else:
            global_step = 0

        self.run.start_step = global_step

        self._start_tracker()
        tracker().set_start_global_step(global_step)

        if self.distributed_rank == 0:
            self.__print_info()
            if self.check_repo_dirty and self.run.is_dirty:
                logger.log([
                    ("[FAIL]", Text.danger),
                    " Cannot trial an experiment with uncommitted changes."
                ])
                # Raise SystemExit directly: the `exit` builtin is injected
                # by the `site` module and is not guaranteed to exist
                # (e.g. `python -S`, frozen applications).
                raise SystemExit(1)

        if not self.is_evaluate:
            # Only rank 0 registers the project and writes run information.
            if self.distributed_rank == 0:
                from labml.internal.computer.configs import computer_singleton
                computer_singleton().add_project(lab_singleton().path)

                self.run.save_info()
            # Every rank records its PID.
            self._save_pid()

            if self.distributed_rank == 0:
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        FileConfigsSaver(self.run.configs_path))

                if self.web_api is not None:
                    self.web_api.start(self.run)
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.web_api.get_configs_saver())
                        self.web_api.set_dynamic_handler(
                            ExperimentDynamicUpdateHandler(
                                self.configs_processor))

                if self.wandb is not None:
                    self.wandb.init(self.run.name, self.run.run_path)
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.wandb.get_configs_saver())

                tracker().save_indicators(self.run.indicators_path)

        self.is_started = True
        return ExperimentWatcher(self)
Esempio n. 5
0
 def __init__(self, session_uuid: str, open_browser):
     """Wire up the API caller, writer, header and scanner for a session.

     Args:
         session_uuid: Sent with every API call as ``session_uuid``.
         open_browser: Forwarded to ``Header``.
     """
     api_url = computer_singleton().web_api.url
     request_params = {
         'computer_uuid': computer_singleton().uuid,
         'session_uuid': session_uuid,
     }
     api_caller = ApiCaller(api_url,
                            request_params,
                            timeout_seconds=120,
                            daemon=True)

     self.writer = Writer(
         api_caller,
         frequency=computer_singleton().web_api.frequency,
     )
     self.header = Header(
         api_caller,
         frequency=computer_singleton().web_api.frequency,
         open_browser=open_browser,
     )
     self.scanner = Scanner()
Esempio n. 6
0
 def __init__(self):
     """Create the API caller, writer and header, and an empty data dict."""
     api_url = computer_singleton().web_api.url
     request_params = {'computer_uuid': computer_singleton().uuid}
     api_caller = ApiCaller(api_url, request_params, 15)

     self.writer = Writer(
         api_caller,
         frequency=computer_singleton().web_api.frequency,
     )

     header_frequency = computer_singleton().web_api.frequency
     header_open_browser = computer_singleton().web_api.open_browser
     self.header = Header(api_caller,
                          frequency=header_frequency,
                          open_browser=header_open_browser)

     self.data = {}
Esempio n. 7
0
    def __init__(self):
        """Create the sync API caller; projects and runs start out empty."""
        from labml.internal.computer.projects.api import DirectApiCaller
        from labml.internal.computer.configs import computer_singleton

        sync_api = computer_singleton().web_api_sync
        request_params = {'computer_uuid': computer_singleton().uuid}
        self.sync_caller = DirectApiCaller(sync_api,
                                           request_params,
                                           timeout_seconds=15)

        # `projects` stays None until something assigns it.
        self.projects = None
        self.runs = {}
Esempio n. 8
0
def init_db():
    """Configure the model/index database drivers and seed the projects.

    The ``data`` folder under the computer's app folder is created when
    missing. With ``settings.IS_LOCAL_SETUP`` the models are stored as
    pickle files and the indexes as YAML files inside that folder; otherwise
    a Redis server on ``localhost:6379`` (db 0) is used. Finally the float
    and samples projects are created from their configured tokens.
    """
    data_path = computer_singleton().app_folder / 'data'

    # Create the folder (and any missing parents) without a separate
    # existence check, so a concurrent creator cannot make this call fail.
    data_path.mkdir(parents=True, exist_ok=True)

    if settings.IS_LOCAL_SETUP:
        # Each `Models` entry is a (serializer, model) pair; the file-based
        # setup ignores the first item and always pickles.
        Model.set_db_drivers([
            FileDbDriver(PickleSerializer(), m, data_path / m.__name__)
            for _, m in Models
        ])
        Index.set_db_drivers([
            FileIndexDbDriver(YamlSerializer(), m,
                              data_path / f'{m.__name__}.yaml')
            for m in Indexes
        ])
    else:
        import redis
        db = redis.Redis(host='localhost', port=6379, db=0)

        Model.set_db_drivers([RedisDbDriver(s, m, db) for s, m in Models])
        Index.set_db_drivers([RedisIndexDbDriver(m, db) for m in Indexes])

    project.create_project(settings.FLOAT_PROJECT_TOKEN, 'float project')
    project.create_project(settings.SAMPLES_PROJECT_TOKEN, 'samples project')
Esempio n. 9
0
    def start(self, run: 'Run'):
        """Register the run's metadata for sending and hook up API logging.

        Args:
            run: The run whose name, comment, repository state, tags and
                notes are copied into ``self.data``.
        """
        url_handler = ApiUrlHandler(self.open_browser,
                                    'Monitor experiment at ')
        self.api_caller.add_handler(url_handler)

        with self.lock:
            from labml.internal.computer.configs import computer_singleton

            this_computer = computer_singleton().uuid

            run_details = {
                'name': run.name,
                'comment': run.comment,
                'computer': this_computer,
                'python_file': run.python_file,
                'repo_remotes': run.repo_remotes,
                'commit': run.commit,
                'commit_message': run.commit_message,
                'is_dirty': run.is_dirty,
                'start_step': run.start_step,
                'load_run': run.load_run,
                'tags': run.tags,
                'notes': run.notes,
            }
            self.data.update(run_details)

        # Tell the caller there is data to send, after releasing the lock.
        self.api_caller.has_data(self)

        from labml.internal.api.logs import API_LOGS
        API_LOGS.set_api(self.api_caller, frequency=LOGS_FREQUENCY)
Esempio n. 10
0
    def set_token(self):
        """Ask for an app.labml.ai token and store it in the computer config.

        Nothing is asked when the configured web API is not the default one.
        Otherwise the user is prompted until a 32-character token is
        entered, and that token is saved through
        ``computer_singleton().set_token``.

        Returns:
            ``True`` when the web API is not the default one; ``None`` after
            a token has been stored.
        """
        from labml.internal.computer.configs import computer_singleton

        if not computer_singleton().web_api.is_default:
            return True

        while True:
            # Strip surrounding whitespace so a pasted token with stray
            # spaces is not rejected (or accepted) because of padding.
            token = input(
                'Enter app.labml.ai token (Go to Settings after logging into app.labml.ai):'
            ).strip()

            # Only leave the loop on a well-formed token; previously the
            # loop always broke, so an invalid token was stored anyway.
            if len(token) == 32:
                break

            logger.log("Invalid token", Text.danger)

        computer_singleton().set_token(token)
Esempio n. 11
0
 def __init__(self, session_uuid: str):
     """Set up API plumbing, empty caches and optional GPU monitoring.

     Args:
         session_uuid: Sent with every API call as ``session_uuid``.

     ``self.nvml`` is the ``py3nvml`` module when it can be imported and
     ``None`` otherwise; in that case a notice with install instructions
     is shown.
     """
     api_url = computer_singleton().web_api.url
     request_params = {
         'computer_uuid': computer_singleton().uuid,
         'session_uuid': session_uuid,
     }
     api_caller = ApiCaller(api_url,
                            request_params,
                            timeout_seconds=15,
                            daemon=True)

     self.writer = Writer(
         api_caller,
         frequency=computer_singleton().web_api.frequency,
     )

     header_frequency = computer_singleton().web_api.frequency
     header_open_browser = computer_singleton().web_api.open_browser
     self.header = Header(api_caller,
                          frequency=header_frequency,
                          open_browser=header_open_browser)

     self.data = {}
     self.cache = {}
     self.nvml = None
     self.n_gpu = 0

     try:
         from py3nvml import py3nvml as nvml
     except ImportError:
         labml_notice('Install py3nvml to monitor GPUs:\n pip install py3nvml',
                      is_warn=False)
     else:
         self.nvml = nvml
Esempio n. 12
0
    def __init__(self, path: Path):
        """Describe the run stored at ``path``, scanning it when needed.

        Args:
            path: Location of the run; its stem is used as the run UUID.

        The size counters start at zero and ``complete`` at ``False``; the
        cache is loaded next, and the run is scanned unless ``complete`` is
        set afterwards.
        """
        from labml.internal.computer.configs import computer_singleton

        self.uuid = str(path.stem)
        self.path = path
        self.cache_path = computer_singleton().runs_cache / self.uuid

        self.complete = False
        self.size = self.size_tensorboard = self.size_checkpoints = 0

        self.load_cache()
        if self.complete:
            return
        self.scan()
Esempio n. 13
0
def get_running_process():
    """Return the PID stored in ``monitor.pid`` if that process is running.

    Returns:
        The PID as an ``int``, or ``0`` when the pid file does not exist,
        does not contain an integer, or ``is_pid_running`` reports the
        process as not running.
    """
    pid_file = computer_singleton().config_folder / 'monitor.pid'
    if not pid_file.exists():
        return 0

    with open(str(pid_file), 'r') as handle:
        raw_pid = handle.read()

    try:
        recorded_pid = int(raw_pid)
    except ValueError:
        return 0

    return recorded_pid if is_pid_running(recorded_pid) else 0
Esempio n. 14
0
def _test():
    """Manually start TensorBoard for one sample run and keep it alive.

    Prints whatever ``TensorBoardStarter.start`` returns, then sleeps for
    100 seconds.
    """
    from labml.internal.computer.configs import computer_singleton
    from labml import lab
    import time

    symlink_dir = computer_singleton().tensorboard_symlink_dir
    starter = TensorBoardStarter(symlink_dir)

    sample_run = (lab.get_path() / 'logs' / 'sample' /
                  '9f7970d6a98611ebbc6bacde48001122')
    result = starter.start([sample_run])

    print(result)

    time.sleep(100)
Esempio n. 15
0
def _test():
    """Manually start TensorBoard for one sample run and keep it alive.

    The lab path is first set to the directory four levels above this
    file. The result of ``TensorBoardStarter.start`` is printed, then the
    function sleeps for 100 seconds.
    """
    from labml.internal.computer.configs import computer_singleton
    from labml import lab
    from labml.internal.lab import lab_singleton
    import time

    this_file = Path(os.path.abspath(__file__))
    lab_root = this_file.parent.parent.parent.parent
    lab_singleton().set_path(str(lab_root))

    symlink_dir = computer_singleton().tensorboard_symlink_dir
    starter = TensorBoardStarter(symlink_dir)

    sample_run = (lab.get_path() / 'logs' / 'sample' /
                  '68233e98cb5311eb9aa38d17b08f3a1d')
    result = starter.start([sample_run])

    print(result)

    time.sleep(100)
Esempio n. 16
0
from typing import List

from labml.internal.computer.configs import computer_singleton
from labml.internal.computer.projects.sync import SyncRuns
from labml.internal.manage import runs as manage_runs
from labml.internal.manage.tensorboard import TensorBoardStarter

# Module-level objects used by `start_tensorboard` below; both are created
# once, when this module is imported.
SYNC_RUNS = SyncRuns()
# Configured from the `tensorboard_*` settings of the computer singleton.
TENSORBOARD_STARTER = TensorBoardStarter(
    computer_singleton().tensorboard_symlink_dir,
    computer_singleton().tensorboard_port,
    computer_singleton().tensorboard_visible_port,
    computer_singleton().tensorboard_protocol,
    computer_singleton().tensorboard_host,
)


def start_tensorboard(*, runs: List[str]):
    """Start TensorBoard for the given runs and report the outcome.

    Args:
        runs: Identifiers passed to ``SYNC_RUNS.get_runs`` to look up the
            runs (presumably run UUIDs; confirm against the caller).

    Returns:
        ``('success', {'url': ..., 'message': ...})`` when the starter
        reports success, otherwise ``('fail', {'message': ...})``.
    """
    run_paths = [synced.path for synced in SYNC_RUNS.get_runs(runs)]
    started, message = TENSORBOARD_STARTER.start(run_paths)

    if not started:
        return 'fail', {'message': message}

    return 'success', {'url': TENSORBOARD_STARTER.url, 'message': message}

Esempio n. 17
0
def _monitor():
    """Run the computer monitoring process.

    Calls ``process.run`` with ``True`` as the first argument and the
    configured ``web_api.open_browser`` value as the second.
    """
    from labml.internal.computer import process
    from labml.internal.computer.configs import computer_singleton

    should_open_browser = computer_singleton().web_api.open_browser
    process.run(True, should_open_browser)
Esempio n. 18
0
 def __init__(self):
     """Record the home folder and the path of the user-level service file."""
     from labml.internal.computer.configs import computer_singleton

     self.home = computer_singleton().home

     # <home>/.config/systemd/user/labml.service
     user_unit_dir = computer_singleton().home / '.config' / 'systemd' / 'user'
     self.service_path = user_unit_dir / 'labml.service'
Esempio n. 19
0
 def load(self):
     """Populate ``self.projects`` with one ``Project`` per configured path."""
     from labml.internal.computer.configs import computer_singleton

     loaded = []
     for project_path in computer_singleton().get_projects():
         loaded.append(Project(Path(project_path)))
     self.projects = loaded