Ejemplo n.º 1
0
Archivo: lab.py Proyecto: skiedra/labml
    def __load_configs(self, path: Path):
        """Load and merge config files for *path*, skipping when unchanged."""
        # No-op when this path has already been loaded.
        if path == self.__current_path:
            return
        self.__current_path = path

        config_files = self.__load_config_files(path)

        if not config_files and not is_colab() and not is_kaggle():
            labml_notice([
                (".labml.yaml", Text.value),
                " config file could not be found. Looking in path: ",
                (str(path), Text.meta)
            ])
            # Walk up until we hit a directory (or a non-existent path).
            while path.exists() and not path.is_dir():
                path = path.parent

        # File-based configs first, then user-supplied overrides.
        for cfg in config_files:
            self.__merge_configs(cfg)
        for cfg in self.custom_configs:
            self.__merge_configs(cfg)

        # Fall back to the (possibly adjusted) path when none was configured.
        if not config_files and self.configs['path'] is None:
            self.configs['path'] = str(path)

        self.__update_configs()
Ejemplo n.º 2
0
    def _send(self, data: List[Dict[str, Any]]) -> Dict:
        """POST *data* as JSON to ``self.url`` and return the parsed response.

        Displays server-reported warnings; on a server-reported error sets
        ``self.errored`` and raises ``RuntimeError``.
        """
        req = urllib.request.Request(self.url)
        req.add_header('Content-Type', 'application/json; charset=utf-8')
        data_json = json.dumps(data)
        data_json = data_json.encode('utf-8')
        req.add_header('Content-Length', str(len(data_json)))

        # Use a context manager so the HTTP response is closed even if
        # reading/decoding raises.
        with urllib.request.urlopen(req,
                                    data_json,
                                    timeout=self.timeout_seconds) as response:
            content = response.read().decode('utf-8')
        result = json.loads(content)

        for e in result.get('errors', []):
            if 'error' in e:
                labml_notice([
                    'LabML App Error: ', (e['error'] + '', Text.key), '\n',
                    (e['message'], Text.value)
                ],
                             is_danger=True)
                self.errored = True
                raise RuntimeError('LabML App Error', e)
            elif 'warning' in e:
                labml_notice([
                    'LabML App Warning: ', (e['warning'] + ': ', Text.key),
                    (e['message'], Text.value)
                ])
            else:
                # Entries without 'error'/'warning' keys are unexpected.
                self.errored = True
                raise RuntimeError('Unknown error from LabML App', e)

        return result
Ejemplo n.º 3
0
    def save(self, global_step):
        """
        ## Save model as a set of numpy arrays

        Creates one sub-directory per *global_step* under ``self.path`` and
        writes an ``info.json`` header mapping each model name to the
        metadata returned by its saver.
        """

        if not self.model_savers:
            # Warn only once about missing savers, then silently no-op.
            if not self.__no_savers_warned:
                labml_notice(["No models were registered for saving\n",
                              "You can register models with ",
                              ('experiment.add_pytorch_models', Text.value)])
                self.__no_savers_warned = True
            return

        checkpoints_path = pathlib.Path(self.path)
        if not checkpoints_path.exists():
            # parents=True so a missing ancestor directory does not fail.
            checkpoints_path.mkdir(parents=True)

        checkpoint_path = checkpoints_path / str(global_step)
        # Explicit check instead of `assert`, which is stripped under -O.
        if checkpoint_path.exists():
            raise RuntimeError(f'Checkpoint already exists: {checkpoint_path}')

        checkpoint_path.mkdir()

        # Collect per-model metadata returned by each saver.
        info = {name: saver.save(checkpoint_path)
                for name, saver in self.model_savers.items()}

        # Save header
        with open(str(checkpoint_path / "info.json"), "w") as f:
            f.write(json.dumps(info))
Ejemplo n.º 4
0
    def load_models(self, *,
                    models: List[str],
                    run_uuid: Optional[str] = None,
                    checkpoint: Optional[int] = None):
        """Load the named *models* from a saved checkpoint of *run_uuid*."""
        # A checkpoint index of -1 means "latest".
        step = -1 if checkpoint is None else checkpoint
        checkpoint_path, global_step = experiment_run.get_run_checkpoint(run_uuid, step)

        if global_step is None:
            labml_notice(['Could not find saved checkpoint'], is_danger=True)
            return

        with monit.section("Loading checkpoint"):
            self.checkpoint_saver.load(checkpoint_path, models)
Ejemplo n.º 5
0
    def _process(self, packets: List[Packet]) -> bool:
        """Send the payloads of *packets*; return ``False`` on transport failure."""
        if not packets:
            return True

        payload = [packet.data for packet in packets]

        try:
            response = self._send(payload)
        except urllib.error.HTTPError as e:
            labml_notice([
                f'Failed to send to {self.url}: ', (str(e.code), Text.value),
                '\n' + str(e.reason)
            ])
            return False
        except urllib.error.URLError as e:
            labml_notice([f'Failed to connect to {self.url}\n', str(e.reason)])
            return False
        except socket.timeout as e:
            labml_notice([f'{self.url} timeout\n', str(e)])
            return False
        except ConnectionResetError as e:
            labml_notice(
                [f'Connection reset by LabML App server {self.url}\n',
                 str(e)])
            return False

        # First handler that accepts the response stops the chain.
        for handler in self.handlers:
            if handler.handle(response):
                break

        return True
Ejemplo n.º 6
0
 def __init__(self, session_uuid: str):
     """Set up the web-API writer/header for this computer session.

     session_uuid: identifier for this monitoring session, sent alongside
     the computer's own uuid with every API call.
     """
     # One shared ApiCaller is used by both the writer and the header.
     api_caller = ApiCaller(computer_singleton().web_api.url,
                            {'computer_uuid': computer_singleton().uuid, 'session_uuid': session_uuid},
                            timeout_seconds=15,
                            daemon=True)
     self.writer = Writer(api_caller, frequency=computer_singleton().web_api.frequency)
     self.header = Header(api_caller,
                          frequency=computer_singleton().web_api.frequency,
                          open_browser=computer_singleton().web_api.open_browser)
     # Per-metric data and cache stores; populated later by the collectors.
     self.data = {}
     self.cache = {}
     # GPU monitoring is optional: left as None/0 when py3nvml is absent.
     self.nvml = None
     self.n_gpu = 0
     try:
         from py3nvml import py3nvml as nvml
         self.nvml = nvml
     except ImportError:
         # Missing py3nvml is not fatal; just suggest installing it.
         labml_notice('Install py3nvml to monitor GPUs:\n pip install py3nvml',
                      is_warn=False)
Ejemplo n.º 7
0
    def __init__(self):
        """Initialize monitoring state and probe for GPU support via py3nvml."""
        self.data = {}
        self.cache = {}
        self.nvml = None
        self.n_gpu = 0

        # py3nvml is optional; warn (non-fatally) when it is missing.
        try:
            from py3nvml import py3nvml as nvml
        except ImportError:
            labml_notice(
                'Install py3nvml to monitor GPUs:\n pip install py3nvml',
                is_warn=False)
        else:
            self.nvml = nvml

        # The package can be installed without the NVML driver library being
        # present; verify it actually initializes before keeping it.
        if self.nvml:
            try:
                self.nvml.nvmlInit()
                self.nvml.nvmlShutdown()
            except self.nvml.NVMLError:
                logger.log('NVML Library not found', Text.warning)
                self.nvml = None

        self.process_monitor = ProcessMonitor(self.nvml)
Ejemplo n.º 8
0
    def load(self, checkpoint_path: pathlib.Path, models: Optional[List[str]] = None):
        """
        ## Load model as a set of numpy arrays

        Reads ``info.json`` from *checkpoint_path* and loads each requested
        model through its registered saver. When *models* is ``None`` or
        empty, all registered models are loaded. Reports models that are
        missing from the checkpoint and checkpoint models that were not
        requested.
        """

        if not self.model_savers:
            # Warn only once about missing savers, then silently no-op.
            if not self.__no_savers_warned:
                labml_notice([
                    "No models were registered for loading or saving\n",
                    "You can register models with ",
                    ('experiment.add_pytorch_models', Text.value)
                ])
                self.__no_savers_warned = True
            return

        if not models:
            models = list(self.model_savers.keys())

        # json.load parses the whole file; readline() would break on
        # multi-line JSON.
        with open(str(checkpoint_path / "info.json"), "r") as f:
            info = json.load(f)

        # Partition the requested names against what the checkpoint contains.
        to_load = [name for name in models if name in info]
        missing = [name for name in models if name not in info]
        not_loaded = [name for name in info if name not in models]

        # Load each model
        for name in to_load:
            saver = self.model_savers[name]
            saver.load(checkpoint_path, info[name])

        if missing:
            labml_notice([(f'{missing} ', Text.highlight),
                          ('model(s) could not be found.\n'),
                          (f'{to_load} ', Text.none),
                          ('models were loaded.', Text.none)],
                         is_danger=True)
        if not_loaded:
            labml_notice([(f'{not_loaded} ', Text.none),
                          ('models were not loaded.\n', Text.none),
                          'Models to be loaded should be specified with: ',
                          ('experiment.add_pytorch_models', Text.value)])
Ejemplo n.º 9
0
    def send(self, data: Any) -> Any:
        """Send *data*, returning the response or ``None`` on any transport error."""
        try:
            return self._send(data)
        except urllib.error.HTTPError as e:
            labml_notice([
                f'Failed to send to {self.url}: ', (str(e.code), Text.value),
                '\n' + str(e.reason)
            ])
        except urllib.error.URLError as e:
            labml_notice([f'Failed to connect to {self.url}\n', str(e.reason)])
        except socket.timeout as e:
            labml_notice([f'{self.url} timeout\n', str(e)])
        except ConnectionResetError as e:
            labml_notice(
                [f'Connection reset by LabML App server {self.url}\n',
                 str(e)])

        # Every failure path falls through to None after notifying the user.
        return None
Ejemplo n.º 10
0
    def _process(self, packets: List[Packet]) -> bool:
        """Send the batch's payloads and invoke its single callback, if any.

        Returns ``False`` on transport failure so the caller can retry.
        """
        # At most one packet in a batch may carry a callback.
        callback = None
        payload = []
        for packet in packets:
            if packet.callback is not None:
                if callback is not None:
                    raise RuntimeError('Multiple callbacks')
                callback = packet.callback
            payload.append(packet.data)

        if not payload:
            return True

        try:
            callback_url = self._send(payload)
        except urllib.error.HTTPError as e:
            labml_notice([
                f'Failed to send to {self.url}: ', (str(e.code), Text.value),
                '\n' + str(e.reason)
            ])
            return False
        except urllib.error.URLError as e:
            labml_notice([f'Failed to connect to {self.url}\n', str(e.reason)])
            return False
        except socket.timeout as e:
            labml_notice([f'{self.url} timeout\n', str(e)])
            return False
        except ConnectionResetError as e:
            labml_notice(
                [f'Connection reset by LabML App server {self.url}\n',
                 str(e)])
            return False

        if callback is not None:
            callback(callback_url)

        return True
Ejemplo n.º 11
0
    def _process(self, packet: Packet) -> bool:
        """Send *packet*'s data after dropping stale state attributes.

        Returns ``False`` on transport failure so the caller can retry.
        """
        data = packet.data
        # Drop state attributes whose recorded index no longer matches this
        # packet — a newer packet has superseded them.
        stale = [
            key for key in data
            if key in self.state_attributes and self.key_idx[key] != packet.idx
        ]
        for key in stale:
            del data[key]

        if not data:
            assert packet.callback is None
            return True

        try:
            run_url = self._send(data)
        except urllib.error.HTTPError as e:
            labml_notice([
                f'Failed to send to {self.url}: ', (str(e.code), Text.value),
                '\n' + str(e.reason)
            ])
            return False
        except urllib.error.URLError as e:
            labml_notice([f'Failed to connect to {self.url}\n', str(e.reason)])
            return False
        except socket.timeout as e:
            labml_notice([f'{self.url} timeout\n', str(e)])
            return False
        except ConnectionResetError as e:
            labml_notice(
                [f'Connection reset by LabML App server {self.url}\n',
                 str(e)])
            return False

        if packet.callback is not None:
            packet.callback(run_url)

        return True
Ejemplo n.º 12
0
    def __init__(self, *,
                 uuid: str,
                 name: Optional[str],
                 python_file: Optional[str],
                 comment: Optional[str],
                 writers: Set[str],
                 ignore_callers: Set[str],
                 tags: Optional[Set[str]],
                 is_evaluate: bool):
        """Create an experiment run, resolving defaults and capturing git state.

        uuid: unique identifier for this run.
        name: experiment name; derived from the python file stem (or
            'Notebook Experiment' in a notebook) when None.
        python_file: entry-point file; inferred from the call stack (or
            'notebook.ipynb' in a notebook) when None.
        comment: run comment; may be overridden by the global params singleton.
        writers: names of output writers to use.
        ignore_callers: frames to skip when inferring the caller's file.
        tags: run tags; defaults to the underscore-split parts of the name.
        is_evaluate: whether this run is an evaluation (no training).
        """

        # Resolve python_file / name differently for notebooks vs scripts.
        if is_ipynb():
            lab_singleton().set_path(os.getcwd())
            if python_file is None:
                python_file = 'notebook.ipynb'
            if name is None:
                name = 'Notebook Experiment'
        else:
            if python_file is None:
                python_file = get_caller_file(ignore_callers)

            lab_singleton().set_path(python_file)

            if name is None:
                file_path = pathlib.PurePath(python_file)
                name = file_path.stem

        if comment is None:
            comment = ''
        # A globally-set comment takes precedence over the argument.
        if global_params_singleton().comment is not None:
            comment = global_params_singleton().comment

        self.experiment_path = lab_singleton().experiments / name

        self.check_repo_dirty = lab_singleton().check_repo_dirty

        self.configs_processor = None

        # Default tags come from the underscore-separated name parts.
        if tags is None:
            tags = set(name.split('_'))

        self.run = Run.create(
            uuid=uuid,
            experiment_path=self.experiment_path,
            python_file=python_file,
            trial_time=time.localtime(),
            name=name,
            comment=comment,
            tags=list(tags))

        # Snapshot the git state (remotes, commit, dirtiness, diff) so the
        # run is reproducible; fall back to placeholders outside a repo.
        try:
            repo = git.Repo(lab_singleton().path)

            self.run.repo_remotes = list(repo.remote().urls)
            self.run.commit = repo.head.commit.hexsha
            self.run.commit_message = repo.head.commit.message.strip()
            self.run.is_dirty = repo.is_dirty()
            self.run.diff = repo.git.diff()
        except git.InvalidGitRepositoryError:
            # Hosted notebook environments routinely lack a repo; stay quiet there.
            if not is_colab() and not is_kaggle():
                labml_notice(["Not a valid git repository: ",
                              (lab_singleton().path, Text.value)])
            self.run.commit = 'unknown'
            self.run.commit_message = ''
            self.run.is_dirty = True
            self.run.diff = ''

        self.checkpoint_saver = CheckpointSaver(self.run.checkpoint_path)
        self.is_evaluate = is_evaluate
        self.web_api = None
        self.writers = writers
        self.is_started = False
        # Distributed defaults: rank 0, world size -1 meaning "not distributed".
        self.distributed_rank = 0
        self.distributed_world_size = -1