Ejemplo n.º 1
0
 def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
     """Forward every non-string metric to the experiment client.

     Args:
         metrics: Mapping from metric name to value. Entries whose value is a
             ``str`` are skipped and reported through a warning.
         step: Optional step index, passed through unchanged to ``log_metric``.
     """
     # A single millisecond timestamp is shared by all metrics of this call.
     now_ms = int(time() * 1000)
     for name, value in metrics.items():
         if not isinstance(value, str):
             self.experiment.log_metric(self.run_id, name, value, now_ms, step)
         else:
             log.warning(f'Discarding metric with string value {name}={value}.')
Ejemplo n.º 2
0
    def experiment(self) -> MlflowClient:
        r"""
        Return the underlying MLflow client, ensuring an experiment and a run exist.

        The experiment is looked up by name on every access and created when
        the lookup returns nothing; a run is created only when no run id has
        been stored yet. To use MLflow features in your
        :class:`~pytorch_lightning.core.lightning.LightningModule` do the following.

        Example::

            self.logger.experiment.some_mlflow_function()

        """
        client = self._mlflow_client
        found = client.get_experiment_by_name(self._experiment_name)

        if not found:
            log.warning(
                f'Experiment with name {self._experiment_name} not found. Creating it.'
            )
            self._experiment_id = client.create_experiment(
                name=self._experiment_name)
        else:
            self._experiment_id = found.experiment_id

        # Start a run lazily: only when none has been recorded on this logger.
        if not self._run_id:
            new_run = client.create_run(
                experiment_id=self._experiment_id, tags=self.tags)
            self._run_id = new_run.info.run_id
        return client
Ejemplo n.º 3
0
    def __init__(self,
                 hparams,
                 collate_fn: Callable = utils.collate_single_fn,
                 **kwargs):
        """Validate the dataset paths carried by ``hparams`` and store them.

        Args:
            hparams: Object exposing ``train_filepath``, ``valid_filepath`` and
                ``test_filepath``. Each may be ``None``, in which case the
                matching attribute is not set here.
            collate_fn: Forwarded to the parent initializer.
            **kwargs: Not used; each received name is reported with a warning.

        Raises:
            ValueError: If a provided path does not point to an existing file.
        """
        super().__init__(hparams, collate_fn)

        # train/valid each hold a single path and share the same validation
        for split in ('train', 'valid'):
            attr_name = f'{split}_filepath'
            candidate = getattr(hparams, attr_name)
            if candidate is None:
                continue
            if not os.path.isfile(candidate):
                raise ValueError(
                    f"Argument `{attr_name}` is not a valid file")
            setattr(self, attr_name, candidate)

        # test_filepath is iterated, i.e. it is a collection of paths
        if hparams.test_filepath is not None:
            for single_test_path in hparams.test_filepath:
                if not os.path.isfile(single_test_path):
                    raise ValueError(
                        f"file `{single_test_path}` is not a valid test file")
            self.test_filepath = hparams.test_filepath

        for unused_name in kwargs:
            logger.warning(
                f'CompressedDataModule received unused parameter {unused_name}')
Ejemplo n.º 4
0
    def sig_handler(self, signum, frame):  # pragma: no-cover
        """Checkpoint and requeue the SLURM job when the signal arrives.

        Only the global-zero process acts: it saves an HPC checkpoint, runs
        ``scontrol requeue`` for the job named by ``SLURM_JOB_ID`` and closes
        the logger.

        Args:
            signum: Required by the signal-handler signature; not used.
            frame: Required by the signal-handler signature; not used.

        Raises:
            KeyError: If ``SLURM_JOB_ID`` is not defined in the environment.
        """
        if not self.trainer.is_global_zero:
            return

        # save weights
        log.info('handling SIGUSR1')
        self.trainer.checkpoint_connector.hpc_save(
            self.trainer.weights_save_path, self.trainer.logger)

        # find job id
        job_id = os.environ['SLURM_JOB_ID']
        cmd = ['scontrol', 'requeue', job_id]

        # requeue job
        log.info(f'requeing job {job_id}...')
        try:
            result = call(cmd)
        except FileNotFoundError:
            # `scontrol` may only be resolvable through a shell. Retry once
            # with a shell; a shell command must be passed as a single string.
            # Any error raised by the retry propagates to the caller.
            result = call(' '.join(str(part) for part in cmd), shell=True)

        # report the outcome of the requeue command
        if result == 0:
            log.info(f'requeued exp {job_id}')
        else:
            log.warning('requeue failed...')

        # close experiment to avoid issues
        self.trainer.logger.close()
Ejemplo n.º 5
0
def seed_everything(seed: Optional[int] = None) -> int:
    """
    Function that sets seed for pseudo-random number generators in:
    pytorch, numpy, python.random
    In addition, sets the env variable `PL_GLOBAL_SEED` which will be passed to
    spawned subprocesses (e.g. ddp_spawn backend).

    Args:
        seed: the integer value seed for global random state in Lightning.
            If `None`, will read seed from `PL_GLOBAL_SEED` env variable
            or select it randomly.

    Returns:
        The seed that was actually applied. It differs from the argument when
        the argument was missing, not convertible to ``int``, or outside the
        ``uint32`` range.
    """
    max_seed_value = np.iinfo(np.uint32).max
    min_seed_value = np.iinfo(np.uint32).min

    if seed is None:
        # Read the variable without an eager random default, so that no
        # random seed is drawn (and no warning logged) when it is defined.
        seed = os.environ.get("PL_GLOBAL_SEED")

    try:
        seed = int(seed)
    except (TypeError, ValueError):
        # Covers both "nothing provided anywhere" (None) and unparsable values.
        seed = _select_seed_randomly(min_seed_value, max_seed_value)

    if (seed > max_seed_value) or (seed < min_seed_value):
        log.warning(
            f"{seed} is not in bounds, "
            f"numpy accepts from {min_seed_value} to {max_seed_value}")
        seed = _select_seed_randomly(min_seed_value, max_seed_value)

    os.environ["PL_GLOBAL_SEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed
Ejemplo n.º 6
0
    def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
        """Log metrics (numeric values) in TRAINS experiments.
            This method will be called by Trainer.

        Args:
            metrics:
                The dictionary of the metrics.
                If the key contains "/", it will be split at the first delimiter,
                then the two parts will be logged as "title" and "series" respectively.
                Without a delimiter the key is used for both. String values are
                discarded with a warning; tensors are converted with ``.item()``.
            step: Step number at which the metrics should be recorded. Defaults to None,
                in which case the last iteration reported by the task is used.
        """
        if self._bypass or not self._trains:
            return

        # Only a missing step falls back to the last iteration; step 0 is a
        # legitimate value and must be reported as such.
        if step is None:
            step = self._trains.get_last_iteration()

        for k, v in metrics.items():
            if isinstance(v, str):
                log.warning("Discarding metric with string value {}={}".format(k, v))
                continue
            if isinstance(v, torch.Tensor):
                v = v.item()
            # "a/b/c" -> title "a", series "b/c"; "a" -> title "a", series "a"
            title, delimiter, remainder = k.partition('/')
            series = remainder if delimiter else k
            self._trains.get_logger().report_scalar(
                title=title, series=series, value=v, iteration=step)
Ejemplo n.º 7
0
    def sig_handler(self, signum, frame):  # pragma: no-cover
        """Save a checkpoint and requeue the SLURM job on the global-zero process.

        Args:
            signum: Part of the signal-handler signature; not used.
            frame: Part of the signal-handler signature; not used.
        """
        trainer = self.trainer
        if not trainer.is_global_zero:
            return

        # persist the weights before asking the scheduler to requeue
        log.info('handling SIGUSR1')
        trainer.checkpoint_connector.hpc_save(
            trainer.weights_save_path, trainer.logger)

        job_id = os.environ['SLURM_JOB_ID']
        requeue_cmd = ['scontrol', 'requeue', job_id]

        log.info(f'requeing job {job_id}...')
        try:
            exit_code = call(requeue_cmd)
        except FileNotFoundError:
            # `scontrol` could not be found without a shell context: retry
            # through a shell, which takes the command as one string. Errors
            # from this second attempt are left to propagate.
            exit_code = call(' '.join(str(part) for part in requeue_cmd), shell=True)

        if exit_code != 0:
            log.warning('requeue failed...')
        else:
            log.info(f'requeued exp {job_id}')

        # close experiment to avoid issues
        trainer.logger.close()
Ejemplo n.º 8
0
def seed_everything(seed: Optional[int] = None) -> int:
    """Seed the pseudo-random number generators of pytorch, numpy and python.random,
    and store the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value to seed with. When ``None``, not convertible to ``int``, or
            outside the ``uint32`` range, a random seed is selected instead.

    Returns:
        The seed that was actually applied.

    Note:
        NOTE(review): CPython reads ``PYTHONHASHSEED`` at interpreter start-up,
        so setting it here presumably affects only child processes - confirm
        that this is the intent.
    """
    max_seed_value = np.iinfo(np.uint32).max
    min_seed_value = np.iinfo(np.uint32).min

    try:
        if seed is None:
            seed = _select_seed_randomly(min_seed_value, max_seed_value)
        else:
            seed = int(seed)
    except (TypeError, ValueError):
        seed = _select_seed_randomly(min_seed_value, max_seed_value)

    if (seed > max_seed_value) or (seed < min_seed_value):
        # Adjacent literals instead of a backslash continuation, which used to
        # embed the source indentation into the logged message.
        log.warning(
            f"{seed} is not in bounds, "
            f"numpy accepts from {min_seed_value} to {max_seed_value}")
        seed = _select_seed_randomly(min_seed_value, max_seed_value)

    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed
Ejemplo n.º 9
0
    def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
        """Record each non-string metric on the experiment client.

        Args:
            metrics: Mapping from metric name to value; string values are
                dropped with a warning.
            step: Optional step index forwarded unchanged to ``log_metric``.
        """
        assert rank_zero_only.rank == 0, 'experiment tried to log from global_rank != 0'

        # every metric of this call carries the same millisecond timestamp
        stamp_ms = int(time() * 1000)
        for metric_name, metric_value in metrics.items():
            if not isinstance(metric_value, str):
                self.experiment.log_metric(self.run_id, metric_name, metric_value, stamp_ms, step)
            else:
                log.warning(f'Discarding metric with string value {metric_name}={metric_value}.')
Ejemplo n.º 10
0
def load_hparams_from_tags_csv(tags_csv: str) -> Namespace:
    """Load hyperparameters stored as ``key,value`` rows in a CSV file.

    The first row is treated as a header and skipped. Every following row
    contributes ``row[0]`` as the attribute name and ``convert(row[1])`` as
    its value. Rows with fewer than two columns (blank lines, for example)
    are ignored.

    Args:
        tags_csv: Path to the CSV file.

    Returns:
        A ``Namespace`` holding the parsed values; an empty ``Namespace``
        (after a warning) when the file does not exist.
    """
    if not os.path.isfile(tags_csv):
        log.warning(f'Missing Tags: {tags_csv}.')
        return Namespace()

    # newline='' lets the csv module handle line endings itself
    with open(tags_csv, newline='') as f:
        csv_reader = csv.reader(f, delimiter=',')
        next(csv_reader, None)  # skip the header row, if there is one
        tags = {row[0]: convert(row[1]) for row in csv_reader if len(row) >= 2}
    return Namespace(**tags)
Ejemplo n.º 11
0
 def _del_model(self, filepath):
     """Delete the checkpoint at ``filepath`` if it exists.

     Removal goes through ``gfile`` first. When that raises ``AttributeError``
     (``remove`` is not implemented in gfile's compatibility mode), local
     paths are removed with ``os.remove`` and remote paths are left in place
     with a warning.
     """
     if not gfile.exists(filepath):
         return

     try:
         gfile.remove(filepath)
     except AttributeError:
         if not is_remote_path(filepath):
             os.remove(filepath)
             return
         log.warning(
             "Unable to remove stale checkpoints due to running gfile in compatibility mode."
             " Please install tensorflow to run gfile in full mode"
             " if writing checkpoints to remote locations")
    def add_model_specific_args(parser):
        """Register the optimisation-related command line options on ``parser``.

        After registration the known arguments are parsed once, so that a
        learning rate above 1 can be reported with a warning.

        Returns:
            The same ``parser`` object, with the options added.
        """
        # (flag, add_argument keyword arguments), registered in this order
        option_table = (
            ('--learning_rate', {'type': float, 'default': 1e-4}),
            ('--max_sequence_length', {'type': int, 'default': 128}),
            ('--weight_decay', {'type': float, 'default': 0.0}),
            ('--adam_epsilon', {'type': float, 'default': 1e-8}),
            ('--adam_betas', {'nargs': 2, 'type': float, 'default': [0.9, 0.999]}),
            ('--max_grad_norm', {'type': float, 'default': 1e-8}),
            ('--warmup_steps', {'type': int, 'default': 0}),
        )
        for flag, spec in option_table:
            parser.add_argument(flag, **spec)

        parsed, _unknown = parser.parse_known_args()
        if parsed.learning_rate > 1:
            logger.warning(f"You specified a huge learning rate! Learning rate: {parsed.learning_rate}")

        return parser
Ejemplo n.º 13
0
    def run_id(self):
        """Return the id of the current run, creating the run on first use.

        When no run id is stored yet, the experiment is looked up by name
        (and created if the lookup returns nothing), a run is created in it,
        and the new run id is stored for later calls.
        """
        if self._run_id is None:
            client = self._mlflow_client
            found = client.get_experiment_by_name(self.experiment_name)

            if not found:
                log.warning(f'Experiment with name {self.experiment_name} not found. Creating it.')
                self._expt_id = client.create_experiment(name=self.experiment_name)
            else:
                self._expt_id = found.experiment_id

            new_run = client.create_run(experiment_id=self._expt_id, tags=self.tags)
            self._run_id = new_run.info.run_id
        return self._run_id
    def determine_ddp_node_rank(self):
        """Return the rank of this node as an ``int``.

        Under SLURM task management the value of ``SLURM_NODEID`` is used.
        Otherwise ``NODE_RANK`` and ``GROUP_RANK`` are consulted; when neither
        is defined the rank defaults to 0. When both are defined, the last
        entry of the lookup order (``GROUP_RANK``) is used and a warning is
        logged.

        Raises:
            KeyError: Under SLURM management when ``SLURM_NODEID`` is missing.
            ValueError: If the selected variable does not hold an integer.
        """
        if self.trainer.is_slurm_managing_tasks:
            return int(os.environ['SLURM_NODEID'])

        # torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK.
        # otherwise use given node rank or default to node rank 0
        env_vars = ['NODE_RANK', 'GROUP_RANK']
        node_ids = [(k, os.environ[k]) for k in env_vars if k in os.environ]
        if not node_ids:
            return 0

        # Behaviour kept from the original `pop()`: the LAST defined variable wins.
        # NOTE(review): the old message claimed "the first one"; confirm which
        # precedence is actually wanted.
        k, rank = node_ids[-1]
        if len(node_ids) > 1:
            log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the last one.")
        rank_zero_info(f"Using environment variable {k} for node rank ({rank}).")
        return int(rank)
Ejemplo n.º 15
0
    def suggestion(self):
        """ Propose an initial learning rate: the one recorded at the point
        where the loss curve has its steepest negative gradient.

        The index of that point is stored in ``self._optimal_idx``.

        Returns:
            lr: suggested initial learning rate to use, or ``None`` when the
            computation fails (``self._optimal_idx`` is then reset to ``None``).

        """
        try:
            losses = np.array(self.results["loss"])
            steepest = np.gradient(losses).argmin()
            self._optimal_idx = steepest
            return self.results["lr"][steepest]
        except Exception:
            log.warning('Failed to compute suggesting for `lr`.'
                        ' There might not be enough points.')
            self._optimal_idx = None
        return None
Ejemplo n.º 16
0
    def _get_next_version(self):
        """Return the next free version number under ``save_dir/name``.

        Sub-directories named ``version_<n>`` are scanned and the result is
        one more than the largest ``<n>``. Directories whose suffix is not an
        integer (``version_best``, for example) are ignored instead of
        aborting the scan.

        Returns:
            0 when the folder is missing (a warning is logged) or contains no
            numbered version directory, otherwise ``max(version) + 1``.
        """
        root_dir = os.path.join(self.save_dir, self.name)

        if not os.path.isdir(root_dir):
            log.warning('Missing logger folder: %s', root_dir)
            return 0

        existing_versions = []
        for d in os.listdir(root_dir):
            if not (d.startswith("version_") and os.path.isdir(os.path.join(root_dir, d))):
                continue
            try:
                existing_versions.append(int(d.split("_")[1]))
            except ValueError:
                # not a numbered version directory; leave it out of the count
                continue

        if not existing_versions:
            return 0

        return max(existing_versions) + 1
Ejemplo n.º 17
0
    def _get_next_version(self):
        """Return the next free version number under ``save_dir/name``.

        The listing and directory checks go through ``self._fs``. Entries
        whose base name is ``version_<n>`` are scanned and the result is one
        more than the largest ``<n>``. Entries whose suffix is not an integer
        (``version_best``, for example) are ignored instead of aborting the
        scan.

        Returns:
            0 when the folder is missing (a warning is logged) or contains no
            numbered version directory, otherwise ``max(version) + 1``.
        """
        root_dir = os.path.join(self.save_dir, self.name)

        if not self._fs.isdir(root_dir):
            log.warning('Missing logger folder: %s', root_dir)
            return 0

        existing_versions = []
        for d in self._fs.ls(root_dir):
            bn = os.path.basename(d)
            if self._fs.isdir(d) and bn.startswith("version_"):
                # drop any '/' left in the suffix before parsing it
                dir_ver = bn.split("_")[1].replace('/', '')
                try:
                    existing_versions.append(int(dir_ver))
                except ValueError:
                    # not a numbered version directory; leave it out of the count
                    continue
        if not existing_versions:
            return 0

        return max(existing_versions) + 1
Ejemplo n.º 18
0
    def log_metrics(self,
                    metrics: Dict[str, float],
                    step: Optional[int] = None) -> None:
        """Log each non-string metric to the experiment under a sanitised name.

        Characters outside ``a-z A-Z 0-9 _ / . -`` and space are stripped from
        the metric name; a warning is emitted whenever a name is altered.

        Args:
            metrics: Mapping from metric name to value; string values are
                discarded with a warning.
            step: Optional step index forwarded unchanged to ``log_metric``.
        """
        assert rank_zero_only.rank == 0, 'experiment tried to log from global_rank != 0'

        timestamp_ms = int(time() * 1000)
        for k, v in metrics.items():
            if isinstance(v, str):
                log.warning(f'Discarding metric with string value {k}={v}.')
                continue

            new_k = re.sub("[^a-zA-Z0-9_/. -]+", "", k)
            if k != new_k:
                # One message string: the previous parenthesised, comma-separated
                # form handed a tuple to warnings.warn.
                warnings.warn(
                    "MLFlow only allows '_', '/', '.' and ' ' special characters in metric name.\n"
                    f"Replacing {k} with {new_k}.")
                k = new_k

            self.experiment.log_metric(self.run_id, k, v, timestamp_ms, step)
Ejemplo n.º 19
0
    def sig_handler(self, signum, frame):  # pragma: no-cover
        """Checkpoint and requeue the SLURM job; only process rank 0 acts.

        Args:
            signum: Part of the signal-handler signature; not used.
            frame: Part of the signal-handler signature; not used.
        """
        if self.proc_rank != 0:
            return

        # persist the weights first
        log.info('handling SIGUSR1')
        self.hpc_save(self.weights_save_path, self.logger)

        # the job to requeue is the one named in the environment
        job_id = os.environ['SLURM_JOB_ID']
        requeue_cmd = f'scontrol requeue {job_id}'

        log.info(f'requeing job {job_id}...')
        exit_code = call(requeue_cmd, shell=True)

        if exit_code != 0:
            log.warning('requeue failed...')
        else:
            log.info(f'requeued exp {job_id}')

        # close experiment to avoid issues
        self.logger.close()
    def __init__(self, *args, start_from_step=None):
        """
        If `start_from_step` is provided, this dataset will return data
        relative to the `start_from_step`+1 effective step. This is not a problem
        since the training algorithm does not know in advance the total dataset length.
        This applies only to the first epoch; from the following epochs on, all the data are provided.
        Do not provide a `start_from_step` higher than the number of elements in this
        dataset or higher than the total number of max_steps.

        Args:
            *args: Forwarded unchanged to the parent initializer.
            start_from_step: Optional integer step to resume from. It is
                multiplied by ``batch_size * accumulate_grad_batches *
                total_devices`` and the product is stored in
                ``self.start_from_step``; ``None`` leaves that attribute ``None``.
        """
        super().__init__(*args)
        self.start_from_step = None
        if start_from_step is not None:
            assert isinstance(start_from_step, int), (
                f"`start_from` must be integer, found {start_from_step}")

            total_devices = utils.get_total_devices(trainer=self.trainer)
            effective_batch_size = self.hparams.batch_size * self.hparams.accumulate_grad_batches * total_devices
            self.start_from_step = effective_batch_size * start_from_step

            logger.warning(
                f"IterableDataset starting from step {start_from_step}. If this is the correct "
                f"behaviour, please ignore this warning")
Ejemplo n.º 21
0
    def max_steps_anyway(self) -> int:
        """ Compute total number of steps if not specified. They are required by eventual schedulers.

        Returns:
            ``hparams.max_steps`` when it is set. Otherwise
            ``max_epochs * ceil((ceil(dataset_len / batch_size) // total_devices)
            / accumulate_grad_batches)``. ``None`` is returned (after a
            warning) when the trainer has no ``datamodule`` attribute or the
            training dataset exposes neither ``__len__`` nor ``length``.
        """
        # if already defined, skip
        if self.hparams.max_steps is not None:
            return self.hparams.max_steps

        if not hasattr(self.trainer, 'datamodule'):
            logger.warning(
                "You tried to fix max_steps but didn't provide a datamodule to "
                "the trainer.fit function, returning max_steps=None")
            return None

        # if cannot retrieve len of the dataset, skip
        # this can happen with iterabledatasets
        train_dataset = self.trainer.datamodule.train_dataset
        if not (hasattr(train_dataset, '__len__') or hasattr(train_dataset, 'length')):
            logger.warning(
                "Cannot infer dataset length from datamodule.train_dataset, returning max_steps=None"
            )
            return None

        try:
            dataset_len = len(train_dataset)
        except Exception:
            # Fall back to the explicit attribute when len() is unusable.
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            dataset_len = train_dataset.length

        # number of devices that share the batches of one epoch
        if self.trainer.on_gpu:
            total_devices = self.trainer.num_nodes * self.trainer.num_processes
        elif self.trainer.on_tpu:
            total_devices = len(
                self.trainer.tpu_cores) * self.trainer.num_nodes
        elif self.trainer.distributed_backend == 'ddp_cpu':
            total_devices = self.trainer.num_processes
        else:
            total_devices = 1

        num_training_batches = math.ceil(dataset_len / self.hparams.batch_size)
        training_batches_per_epoch = num_training_batches // total_devices
        steps_per_epoch = math.ceil(training_batches_per_epoch /
                                    self.hparams.accumulate_grad_batches)
        steps = self.hparams.max_epochs * steps_per_epoch

        logger.warning(
            f"Automatically computed max_steps={steps}. If it appears to be OK, ignore this warning"
        )

        return steps
Ejemplo n.º 22
0
    def ddp_train(self, process_idx, model):
        """
        Run the training routine for one distributed worker.

        In order, this method: resolves the node rank from the environment,
        disables the progress bar on every worker except node 0 / process 0,
        derives the process rank and world size, initialises the DDP
        connection through the model, creates optimizers and schedulers,
        moves the model to its GPU when running on GPU, optionally applies
        the apex AMP wrapper, wraps the model through ``configure_ddp``, runs
        the pretrain routine and finally saves the spawn weights.

        :param process_idx: index of this process; also used as the CUDA
            device index when ``self.on_gpu`` is set
        :param model: the model to train
        :return: None
        """
        # node rank using relative slurm id if under slurm management
        # otherwise use given node rank or default to node rank 0
        try:
            node_id = os.environ[
                'SLURM_NODEID'] if self.is_slurm_managing_tasks else os.environ[
                    'NODE_RANK']
            self.node_rank = int(node_id)
        except KeyError:
            log.warning(
                "SLURM_NODEID or NODE_RANK environment variable is not defined. Set as 0."
            )
            self.node_rank = 0

        # show progressbar only on progress_rank 0
        if (self.node_rank != 0 or
                process_idx != 0) and self.progress_bar_callback is not None:
            self.progress_bar_callback.disable()

        # determine which process we are and world size
        # NOTE(review): when neither use_ddp nor use_ddp2 is set, proc_rank and
        # world_size are not assigned here - confirm they are set elsewhere.
        if self.use_ddp:
            self.proc_rank = self.node_rank * self.num_processes + process_idx
            self.world_size = self.num_nodes * self.num_processes

        elif self.use_ddp2:
            self.proc_rank = self.node_rank
            self.world_size = self.num_nodes

        # set warning rank
        rank_zero_only.rank = self.proc_rank

        # set up server using proc 0's ip address
        # try to init for 20 times at max in case ports are taken
        # where to store ip_table
        model.trainer = self
        model.init_ddp_connection(self.proc_rank, self.world_size,
                                  self.is_slurm_managing_tasks)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
            model)

        # MODEL
        # copy model to each gpu
        if self.on_gpu:
            self.root_gpu = process_idx
            torch.cuda.set_device(self.root_gpu)
            model.cuda(self.root_gpu)

        # set model properties before going into wrapper
        self.copy_trainer_model_properties(model)

        # AMP
        # run through amp wrapper before going to distributed DP
        # TODO: remove in v0.8.0
        if self.use_amp and not self.use_native_amp:
            model, optimizers = model.configure_apex(amp, model,
                                                     self.optimizers,
                                                     self.amp_level)
            self.optimizers = optimizers

        # DDP2 uses all GPUs on the machine
        if self.distributed_backend == 'ddp':
            device_ids = [self.root_gpu]
        elif self.use_ddp2:
            device_ids = self.data_parallel_device_ids
        else:  # includes ddp_cpu
            device_ids = None

        # allow user to configure ddp
        model = model.configure_ddp(model, device_ids)

        # continue training routine
        self.run_pretrain_routine(model)

        # when ddp ends, we save the model
        self.save_spawn_weights(model)
Ejemplo n.º 23
0
def _select_seed_randomly(min_seed_value: int = 0,
                          max_seed_value: int = 255) -> int:
    """Draw a seed from ``[min_seed_value, max_seed_value]`` (both inclusive).

    The chosen value is reported through a warning before it is returned.
    """
    chosen = random.randint(min_seed_value, max_seed_value)
    log.warning(f"No correct seed found, seed set to {chosen}")
    return chosen
Ejemplo n.º 24
0
    def get_valid_samples_and_source_views_paths(self, img_paths, source_views_indexes):
        """
        Keep only the samples for which every requested source view is available.

        A sample is dropped (with a warning) when a requested source view
        would fall outside its sequence, or when the file of a requested
        source view does not exist.

        Parameters
        ----------
        img_paths: list of str
            A list of absolute paths to the data samples.
            Each element of img_paths is of the form:
            path_to_dataset_root_dir/2011_09_26/2011_09_26_drive_0048_sync/image_02/data/0000000085.png
        source_views_indexes: sequence of int
            Offsets of the source views relative to the target frame.
            Must be in ascending order: only the first and last entries are
            used for the bounds check.

        Returns
        -------
        list of tuple
            A curated list of absolute paths to the data samples ensured to have source views.
            each element of the list is a tuple (target_img_path, [source_view1_img_path, ..., source_viewN_img_path])
        """

        terminal_logger.info(f'Checking if the files given in the split file {self.split_name} '
                             f'exists and have source views...')

        curated = []

        for raw_path in img_paths:

            # e.g. path_to_dataset_root_dir/2011_09_26/2011_09_26_drive_0048_sync/image_02/data/0000000085.png
            target = Path(raw_path)
            assert target.exists(), target

            target_idx = self.get_frame_idx_from_image_path(target)  # e.g. 85
            seq_name = target.parents[2].name  # e.g. 2011_09_26_drive_0048_sync
            seq_len = self.sequence_lengths[seq_name]

            # bounds check: the smallest and largest offsets are enough,
            # because the offsets are in ascending order
            lowest = target_idx + source_views_indexes[0]
            highest = target_idx + source_views_indexes[-1]
            if not (lowest >= 0 and highest < seq_len):
                terminal_logger.warning('One or more source views from the ones requested is out of bound.\n'
                                        f'\tIndexes requested: {source_views_indexes} -> '
                                        f'Ignoring frame of idx {target_idx} from sequence {seq_name}.\n'
                                        f'\tThe sequence {seq_name} is of length {seq_len}')
                continue

            # existence check: every missing view is reported, not just the first
            view_paths = self.get_source_view_paths_from_img_path_and_indexes(target, source_views_indexes)
            absent_offsets = [offset
                              for offset, view_path in zip(source_views_indexes, view_paths)
                              if not view_path.exists()]
            for offset in absent_offsets:
                terminal_logger.warning(f'Frame of idx {target_idx} from sequence {seq_name} have the '
                                        f'source view of idx {offset} missing from the ones requested.\n'
                                        f'\tIndexes requested: {source_views_indexes} -> '
                                        f'Ignoring frame of idx {target_idx} from sequence {seq_name}.')
            if absent_offsets:
                continue

            curated.append((target, view_paths))

        terminal_logger.info('Image list curated and source-views registered.')
        terminal_logger.info(f'Original list: {len(img_paths)} files')
        terminal_logger.info(f'Curated list: {len(curated)} files')

        return curated