Example #1
 def fn(hvd, tf, keras):
     gpus = tf.config.experimental.list_physical_devices('GPU')
     for gpu in gpus:
         tf.config.experimental.set_memory_growth(gpu, True)
     if gpus:
         tf.config.experimental.set_visible_devices(
             gpus[_get_assigned_gpu_or_default(default=hvd.local_rank())], 'GPU')
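
All of the examples on this page call _get_assigned_gpu_or_default(default=hvd.local_rank()) to decide which GPU a worker should pin. A minimal sketch of what such a helper can look like, assuming Spark 3.x GPU-aware scheduling exposed through pyspark's TaskContext (names and logic here are illustrative, not Horovod's actual implementation):

    from pyspark import TaskContext

    def _get_assigned_gpu_or_default(default):
        """Return the GPU index Spark assigned to this task, or `default`
        (typically hvd.local_rank()) when no GPU resources were scheduled."""
        context = TaskContext.get()
        if context is not None:
            resources = context.resources()
            if 'gpu' in resources and resources['gpu'].addresses:
                # Spark reports GPU addresses as strings, e.g. ['0'].
                return int(resources['gpu'].addresses[0])
        return default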
Example #2
 def fn(hvd, tf, keras):
     config = tf.ConfigProto()
     config.gpu_options.allow_growth = True
     config.gpu_options.visible_device_list = \
         str(_get_assigned_gpu_or_default(default=hvd.local_rank()))
     keras.backend.set_session(tf.Session(config=config))
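
Example #1 uses the TF2-style tf.config API, while Example #2 is the legacy TF1.x path that pins the GPU through a tf.ConfigProto session. A hypothetical sketch of how such an fn would be invoked inside a Horovod worker, shown only to illustrate the calling convention (not taken from the source):

    import tensorflow as tf
    from tensorflow import keras
    import horovod.tensorflow.keras as hvd

    hvd.init()           # start Horovod before touching any GPU state
    fn(hvd, tf, keras)   # pin this worker to its assigned (or local-rank) GPU
    # ... build and compile the Keras model only after the GPU has been pinned ...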
Example #3
    def train(serialized_model, optimizer_cls, model_opt_state_serialized,
              train_rows, val_rows, avg_row_size):
        from petastorm import TransformSpec, make_reader, make_batch_reader
        from petastorm.pytorch import BatchedDataLoader, InMemBatchedDataLoader
        import torch
        import horovod.torch as hvd

        # Deserializing objects
        model_opt_state = torch.load(model_opt_state_serialized)
        model = deserialize(serialized_model)

        if loss_fns_pre_train:
            loss_fns = loss_fns_pre_train
        if loss_constructors:
            local_vars = locals()
            loss_fns = [
                loss_constructor(**local_vars)
                for loss_constructor in loss_constructors
            ]

        # Horovod: initialize library.
        hvd.init()

        if not user_shuffle_buffer_size:
            shuffle_buffer_size = \
                calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
        else:
            shuffle_buffer_size = user_shuffle_buffer_size

        cuda_available = torch.cuda.is_available()
        if cuda_available:
            # Horovod: pin GPU to local rank or the assigned GPU from spark.
            torch.cuda.set_device(
                _get_assigned_gpu_or_default(default=hvd.local_rank()))
            # Move model to GPU.
            model.cuda()

        # The optimizer object needs to be re-instantiated. Internally it uses memory
        # addresses of objects as their identity, so it cannot simply be serialized and
        # deserialized: the deserialized optimizer would refer to parameters by their old
        # memory addresses, which no longer match the reconstructed model's parameters,
        # and that causes problems.
        # Learning rate is a required parameter for the SGD optimizer. It will be
        # overridden by load_state_dict.
        optimizer = optimizer_cls(model.parameters(), lr=1)
        optimizer_state = model_opt_state['optimizer']

        if last_checkpoint_state is not None:
            model.load_state_dict(last_checkpoint_state['model'])
            optimizer.load_state_dict(last_checkpoint_state['optimizer'])
        else:
            # scale the learning rate with the number of horovod workers
            for i in range(len(optimizer_state['param_groups'])):
                optimizer_state['param_groups'][i]['lr'] = \
                    optimizer_state['param_groups'][i]['lr'] * hvd.size()

            optimizer.load_state_dict(optimizer_state)

        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

        for group in optimizer.param_groups:
            for p in group['params']:
                if id(p) not in optimizer.state_dict()['state']:
                    p.grad = p.data.new(p.size()).zero_()
        optimizer.step()
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        dist_optimizer_args = dict(optimizer=optimizer,
                                   named_parameters=model.named_parameters())
        if gradient_compression:
            # Pass the compression arg only if it is specified by the user.
            dist_optimizer_args['compression'] = gradient_compression
        # Horovod: wrap optimizer with DistributedOptimizer.
        optimizer = hvd.DistributedOptimizer(**dist_optimizer_args)

        # get_optimizer_with_unscaled_lr (used below in save_checkpoint) takes the current
        # optimizer and constructs a new one with the same state, except that the learning
        # rate is scaled back down by the number of Horovod workers. This matters for
        # retraining: the user may retrain the model with a different number of workers,
        # and we need the raw (unscaled) learning rate to re-scale it for the new worker count.

        transform_spec = None
        if transformation:
            transform_spec = TransformSpec(transformation)

        schema_fields = feature_columns + label_columns
        if sample_weight_col:
            schema_fields.append(sample_weight_col)

        if train_steps_per_epoch is None:
            steps_per_epoch = int(
                math.floor(float(train_rows) / batch_size / hvd.size()))
        else:
            steps_per_epoch = train_steps_per_epoch

        with remote_store.get_local_output_dir() as run_output_dir:
            logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)
            log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None
            ckpt_file = os.path.join(run_output_dir,
                                     remote_store.checkpoint_filename)

            def save_checkpoint():
                model.cpu()
                optimizer_with_scaled_down_lr = \
                    get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls, model)
                state = {
                    'model': model.state_dict(),
                    'optimizer': optimizer_with_scaled_down_lr.state_dict(),
                }
                torch.save(state, ckpt_file)
                if cuda_available:
                    model.cuda()

            # In general, make_batch_reader is faster than make_reader for reading the dataset.
            # However, we found out that make_reader performs data transformations much faster than
            # make_batch_reader with parallel worker processes. Therefore, the default reader
            # we choose is make_batch_reader unless there are data transformations.
            reader_factory = None
            reader_factory_kwargs = dict()
            if transform_spec:
                reader_factory = make_reader
                reader_factory_kwargs['pyarrow_serialize'] = True
            else:
                reader_factory = make_batch_reader

            # Petastorm: read data from the store with the correct shard for this rank.
            # Setting num_epochs=None creates an infinite iterator, which lets ranks
            # perform training and validation even when they hold unequal numbers of samples.
            with reader_factory(remote_store.train_data_path,
                                num_epochs=None,
                                cur_shard=hvd.rank(),
                                reader_pool_type=reader_pool_type,
                                workers_count=train_reader_worker_count,
                                shard_count=hvd.size(),
                                hdfs_driver=PETASTORM_HDFS_DRIVER,
                                schema_fields=schema_fields,
                                transform_spec=transform_spec,
                                **reader_factory_kwargs) as train_reader:
                with reader_factory(remote_store.val_data_path,
                                    num_epochs=None,
                                    cur_shard=hvd.rank(),
                                    reader_pool_type=reader_pool_type,
                                    workers_count=val_reader_worker_count,
                                    shard_count=hvd.size(),
                                    hdfs_driver=PETASTORM_HDFS_DRIVER,
                                    schema_fields=schema_fields,
                                    transform_spec=transform_spec,
                                    **reader_factory_kwargs) \
                    if should_validate else empty_batch_reader() as val_reader:

                    if inmemory_cache_all:
                        # Petastorm introduced InMemBatchedDataLoader class in v0.11.0
                        train_loader = InMemBatchedDataLoader(
                            train_reader,
                            batch_size=batch_size,
                            num_epochs=epochs,
                            rows_capacity=steps_per_epoch * batch_size,
                            shuffle=True)
                    else:
                        train_loader = BatchedDataLoader(
                            train_reader,
                            batch_size=batch_size,
                            shuffling_queue_capacity=shuffle_buffer_size)
                    train_loader_iter = iter(train_loader)

                    def prepare_batch(row):
                        inputs = [
                            prepare_np_data(row[col].float(), col,
                                            metadata).reshape(shape) for col,
                            shape in zip(feature_columns, input_shapes)
                        ]
                        labels = [
                            prepare_np_data(row[col].float(), col, metadata)
                            for col in label_columns
                        ]

                        sample_weights = row.get(sample_weight_col, None)
                        if sample_weights is not None:
                            sample_weights = sample_weights.float()
                        if cuda_available:
                            inputs = [input.cuda() for input in inputs]
                            labels = [label.cuda() for label in labels]
                            if sample_weights is not None:
                                sample_weights = sample_weights.cuda()
                        return inputs, labels, sample_weights

                    def transform_outputs(outputs, labels):
                        if not isinstance(outputs, tuple) and not isinstance(
                                outputs, list):
                            outputs = [outputs]

                        # reshape labels to match the output shape of the model
                        if hasattr(outputs[0], 'shape'):
                            if label_shapes:
                                labels = [
                                    label.reshape(label_shape)
                                    for label, label_shape in zip(
                                        labels, label_shapes)
                                ]
                            else:
                                # If label_shapes parameter is not provided, reshape the label
                                # columns data to match the shape of the model output
                                labels = [
                                    label.reshape(output.shape)
                                    if output.shape.numel()
                                    == label.shape.numel() else label
                                    for label, output in zip(labels, outputs)
                                ]

                        return outputs, labels

                    def aggregate_metrics(stage, epoch, loss,
                                          metric_value_groups):
                        all_metric_groups_values = get_metric_avgs(
                            metric_value_groups)
                        if remote_store.saving_runs:
                            write_metrics_summary(stage, epoch, loss,
                                                  all_metric_groups_values,
                                                  log_writer)
                        return {
                            loss.name: loss.avg.item(),
                            'all_metrics': all_metric_groups_values
                        }

                    def loss_fn(outputs, labels, sample_weights):
                        loss = calculate_loss(outputs, labels, loss_weights,
                                              loss_fns, sample_weights)
                        return loss

                    def print_metrics(batch_idx, loss, metric_value_groups,
                                      phase):
                        if user_verbose > 0 and hvd.rank() == 0 and \
                                batch_idx % METRIC_PRINT_FREQUENCY == 0:
                            print(
                                "{phase}\tepoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}"
                                .format(phase=phase,
                                        epoch=epoch,
                                        batch_idx=batch_idx,
                                        metrics=aggregate_metrics(
                                            phase, epoch, loss,
                                            metric_value_groups)))

                    def _train(epoch):
                        model.train()
                        train_loss = metric_cls('loss', hvd)
                        metric_value_groups = construct_metric_value_holders(
                            metric_cls, metric_fn_groups, label_columns, hvd)

                        # iterate on one epoch
                        for batch_idx in range(steps_per_epoch):
                            row = next(train_loader_iter)
                            inputs, labels, sample_weights = prepare_batch(row)
                            outputs, loss = train_minibatch(
                                model, optimizer, transform_outputs, loss_fn,
                                inputs, labels, sample_weights)
                            update_metrics(metric_value_groups, outputs,
                                           labels)
                            train_loss.update(loss)
                            print_metrics(batch_idx, train_loss,
                                          metric_value_groups, 'train')

                        return aggregate_metrics('train', epoch, train_loss,
                                                 metric_value_groups)

                    if should_validate:
                        if validation_steps_per_epoch is None:
                            validation_steps = int(
                                math.ceil(
                                    float(val_rows) / val_batch_size /
                                    hvd.size()))
                        else:
                            validation_steps = validation_steps_per_epoch

                        if inmemory_cache_all:
                            # Petastorm introduced InMemBatchedDataLoader class in v0.11.0
                            val_loader = InMemBatchedDataLoader(
                                val_reader,
                                batch_size=val_batch_size,
                                num_epochs=epochs,
                                rows_capacity=validation_steps *
                                val_batch_size,
                                shuffle=False)
                        else:
                            val_loader = BatchedDataLoader(
                                val_reader,
                                batch_size=val_batch_size,
                                shuffling_queue_capacity=0)
                        val_loader_iter = iter(val_loader)

                        def _validate(epoch):
                            model.eval()
                            val_loss = metric_cls('loss', hvd)

                            metric_value_groups = construct_metric_value_holders(
                                metric_cls, metric_fn_groups, label_columns,
                                hvd)

                            # iterate on one epoch
                            for batch_idx in range(validation_steps):
                                row = next(val_loader_iter)
                                inputs, labels, sample_weights = prepare_batch(
                                    row)

                                outputs = model(*inputs)
                                outputs, labels = transform_outputs(
                                    outputs, labels)

                                loss = calculate_loss(outputs, labels,
                                                      loss_weights, loss_fns,
                                                      sample_weights)
                                val_loss.update(loss)
                                update_metrics(metric_value_groups, outputs,
                                               labels)
                                print_metrics(batch_idx, val_loss,
                                              metric_value_groups, 'val')
                            return aggregate_metrics('val', epoch, val_loss,
                                                     metric_value_groups)

                    history = []
                    for epoch in range(epochs):
                        epoch_metrics = {
                            'epoch': epoch,
                            'train': _train(epoch)
                        }

                        if should_validate:
                            epoch_metrics['validation'] = _validate(epoch)

                        if user_verbose > 0:
                            pdt_dt = datetime.now(timezone.utc)
                            pdt_time_str = pdt_dt.strftime(
                                "%Y-%b-%d %H:%M:%S UTC")
                            print(pdt_time_str, epoch_metrics)

                        history.append(epoch_metrics)
                        if hvd.rank() == 0:
                            # Save model after every epoch
                            save_checkpoint()
                            if remote_store.saving_runs:
                                remote_store.sync(run_output_dir)

            if hvd.rank() == 0:
                best_checkpoint = torch.load(ckpt_file)
                serialized_checkpoint = io.BytesIO()
                torch.save(best_checkpoint, serialized_checkpoint)
                serialized_checkpoint.seek(0)
                return history, serialized_checkpoint
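
The save_checkpoint closure in Example #3 relies on a get_optimizer_with_unscaled_lr helper so that checkpoints store the per-worker (unscaled) learning rate. A minimal sketch of such a helper, assuming it only needs to divide each parameter group's learning rate by hvd.size() (illustrative, not the estimator's actual implementation):

    import copy

    def get_optimizer_with_unscaled_lr(hvd, current_optimizer, optimizer_cls, model):
        # Copy the optimizer state and undo the hvd.size() learning-rate scaling
        # that was applied before training, so a later run can re-scale it for a
        # different number of workers.
        optimizer_state = copy.deepcopy(current_optimizer.state_dict())
        for group in optimizer_state['param_groups']:
            group['lr'] = group['lr'] / hvd.size()
        # lr=1 is a placeholder; load_state_dict overrides it.
        unscaled_optimizer = optimizer_cls(model.parameters(), lr=1)
        unscaled_optimizer.load_state_dict(optimizer_state)
        return unscaled_optimizer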
Example #4
    def train(serialized_model):
        import horovod.torch as hvd

        if random_seed is not None:
            pl.utilities.seed.seed_everything(seed=random_seed)

        # Horovod: initialize library.
        hvd.init()

        if verbose:
            import horovod as _horovod
            print(
                f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}"
            )

        _checkpoint_callback = None
        require_checkpoint = False

        with remote_store.get_local_output_dir() as run_output_dir:
            logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)
            os.makedirs(logs_path, exist_ok=True)
            print(f"Made directory {logs_path} for horovod rank {hvd.rank()}")
            ckpt_dir = run_output_dir
            ckpt_filename = remote_store.checkpoint_filename

            if logger is None:
                # Use default logger if no logger is supplied
                train_logger = TensorBoardLogger(logs_path)
                print(f"Setup logger: Using TensorBoardLogger: {train_logger}")

            elif isinstance(logger, CometLogger):
                if logger._experiment_key:
                    # use logger passed in.
                    train_logger = logger
                    train_logger._save_dir = logs_path
                    print(
                        f"Setup logger: change save_dir of the logger to {logs_path}"
                    )

                elif logger_experiment_key:
                    # Resume logger experiment with new log path if key passed correctly from CPU.
                    train_logger = CometLogger(
                        save_dir=logs_path,
                        api_key=logger.api_key,
                        experiment_key=logger_experiment_key,
                    )

                    print(
                        f"Setup logger: Resume comet logger: {vars(train_logger)}"
                    )

                else:
                    print(
                        f"Failed to setup or resume comet logger. origin logger: {vars(logger)}"
                    )

            else:
                # use logger passed in.
                train_logger = logger
                train_logger.save_dir = logs_path
                print(
                    f"Setup logger: Using logger passed from estimator: {train_logger}"
                )

            # Lightning requires adding checkpoint callbacks on all ranks;
            # otherwise training hangs.
            for cb in callbacks:
                if isinstance(cb, ModelCheckpoint):
                    cb.dirpath = ckpt_dir
                    cb.filename = ckpt_filename
                    _checkpoint_callback = cb
                    require_checkpoint = True
                    break
            if not _checkpoint_callback:
                # By default 'monitor'=None which saves a checkpoint only for the last epoch.
                _checkpoint_callback = ModelCheckpoint(dirpath=ckpt_dir,
                                                       filename=ckpt_filename,
                                                       verbose=True)
                callbacks.append(_checkpoint_callback)

            if remote_store.saving_runs and hvd.rank() == 0:
                # Horovod: sync checkpoint and logging files only on rank 0 to
                # prevent other ranks from corrupting them.
                class _SyncCallback(Callback):
                    def on_epoch_end(self, trainer: "pl.Trainer",
                                     pl_module: "pl.LightningModule") -> None:
                        remote_store.sync(run_output_dir)

                callbacks.append(_SyncCallback())

            model = deserialize(serialized_model)

            _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
                int(math.floor(float(train_rows) / batch_size / hvd.size()))

            _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
                int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

            shuffle_size = calculate_shuffle_buffer_size()
            if verbose:
                print(
                    f"Training data of rank[{hvd.local_rank()}]: Epochs: {epochs}, "
                    f"Shuffle_size: {shuffle_size}, Random seed: {random_seed}\n"
                    f"Train rows: {train_rows}, Train batch size: {batch_size}, Train_steps_per_epoch: {_train_steps_per_epoch}\n"
                    f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {_val_steps_per_epoch}\n"
                    f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n"
                )

            cuda_available = torch.cuda.is_available()
            # We need to check that all ranks have the same device type for training.
            # Horovod doesn't support heterogeneous allreduce for gradients.
            cuda_avail_list = hvd.allgather_object(cuda_available,
                                                   name='device type')
            if cuda_avail_list.count(cuda_available) != hvd.size():
                raise RuntimeError("All ranks don't have same device type!")

            if cuda_available:
                # Horovod: pin GPU to local rank or the assigned GPU from spark.
                torch.cuda.set_device(
                    _get_assigned_gpu_or_default(default=hvd.local_rank()))
                # Move model to GPU.
                model.cuda()

            _num_gpus = num_gpus
            if _num_gpus is None:
                _num_gpus = 1 if cuda_available else 0

            # Refresh the progress bar once per epoch; detailed loss and metrics are available
            # in the logger, so there is no need to print them here. The user can still
            # override this via trainer_args.
            progress_bar_refresh_rate = _train_steps_per_epoch

            kwargs = {
                'accelerator': 'horovod',
                'gpus': _num_gpus,
                'callbacks': callbacks,
                'max_epochs': epochs,
                'logger': train_logger,
                'log_every_n_steps': log_every_n_steps,
                'num_sanity_val_steps': 0,
                'reload_dataloaders_every_epoch': False,
                'progress_bar_refresh_rate': progress_bar_refresh_rate,
                'terminate_on_nan': terminate_on_nan,
                'profiler': profiler
            }
            if trainer_args:
                kwargs.update(trainer_args)

            if verbose and hvd.rank() == 0:
                print("Creating trainer with: \n ", kwargs)

            trainer = Trainer(**kwargs)

            if profiler != 'simple' and trainer.profiler:
                print(
                    f"Set profiler's logs_path for {hvd.rank()} to {logs_path}"
                )
                trainer.profiler.dirpath = logs_path
                # filename where the profiler results will be saved instead of
                # printing to stdout. The .txt extension will be used automatically.
                trainer.profiler.filename = "profile"

            if verbose and hvd.rank() == 0:
                print(f"pytorch_lightning version={pl.__version__}")

            data_module_kwargs = {
                'train_dir': remote_store.train_data_path,
                'val_dir': remote_store.val_data_path,
                'num_train_epochs': epochs,
                'has_val': should_validate is not None,
                'train_batch_size': batch_size,
                'val_batch_size': val_batch_size,
                'shuffle_size': shuffle_size,
                'num_reader_epochs': loader_num_epochs,
                'reader_pool_type': reader_pool_type,
                'reader_worker_count': train_reader_worker_count,
                'transform_spec': transformation,
                'inmemory_cache_all': inmemory_cache_all,
                'cur_shard': hvd.rank(),
                'shard_count': hvd.size(),
                'schema_fields': schema_fields,
                'storage_options': storage_options,
                'steps_per_epoch_train': _train_steps_per_epoch,
                'steps_per_epoch_val': _val_steps_per_epoch,
                'verbose': verbose,
                'debug_data_loader': debug_data_loader,
                'train_async_data_loader_queue_size': train_async_data_loader_queue_size,
                'val_async_data_loader_queue_size': val_async_data_loader_queue_size,
            }
            if debug_data_loader and hvd.rank() == 0:
                print(
                    f"Creating data module with args:\n {data_module_kwargs}")

            dataset = data_module(**data_module_kwargs)

            trainer.fit(model, dataset)

            if hvd.rank() == 0:
                if remote_store.saving_runs and trainer.profiler:
                    # One more file sync to push profiler result.
                    remote_store.sync(logs_path)

                # rank 0 overwrites model with best checkpoint and returns.
                if require_checkpoint:
                    if verbose:
                        print("load from checkpoint best model path:",
                              _checkpoint_callback.best_model_path)
                    best_model = model.load_from_checkpoint(
                        _checkpoint_callback.best_model_path)
                else:
                    best_model = model
                serialized_checkpoint = io.BytesIO()
                module = best_model if not is_legacy else best_model._model

                output = {
                    'model': module.state_dict(),
                    'logged_metrics': trainer.logged_metrics
                }

                torch.save(output, serialized_checkpoint)
                serialized_checkpoint.seek(0)

                return serialized_checkpoint
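
The io.BytesIO buffer returned by train() behaves like an ordinary checkpoint file. A short, assumed usage sketch of how the driver side could read it back (variable names are illustrative):

    import torch

    # serialized_checkpoint is the io.BytesIO returned by rank 0's train().
    serialized_checkpoint.seek(0)  # rewind before reading
    checkpoint = torch.load(serialized_checkpoint)
    model_state = checkpoint['model']              # state_dict of the best model
    logged_metrics = checkpoint['logged_metrics']  # metrics recorded by the Trainer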
Example #5
    def train(serialized_model):
        import horovod.torch as hvd
        # Horovod: initialize library.
        hvd.init()

        with tempfile.TemporaryDirectory(
        ) as last_ckpt_dir, remote_store.get_local_output_dir(
        ) as run_output_dir:
            last_ckpt_file = os.path.join(last_ckpt_dir, 'last.ckpt')
            if ckpt_bytes:
                with open(last_ckpt_file, 'wb') as f:
                    f.write(ckpt_bytes)

            # TODO: Pass the logger from estimator constructor
            logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)

            # Use default logger if no logger is supplied
            train_logger = logger
            if train_logger is None:
                train_logger = TensorBoardLogger(logs_path)

            # TODO: find a way to use the ckpt_path created from the remote store while ingesting all other parameters from the estimator config
            # ckpt_path = os.path.join(run_output_dir, remote_store.checkpoint_filename)
            # os.makedirs(ckpt_path, exist_ok=True)
            # model_checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path)
            # callbacks.append(model_checkpoint_callback)

            is_model_checkpoint_callback_exist = False
            if callbacks is not None:
                for cb in callbacks:
                    if isinstance(cb, ModelCheckpoint):
                        is_model_checkpoint_callback_exist = True
                        break

            model = deserialize(serialized_model)

            _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
                int(math.floor(float(train_rows) / batch_size / hvd.size()))

            _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
                int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

            print(
                f"Training data of rank[{hvd.local_rank()}]: train_rows:{train_rows}, batch_size:{batch_size}, _train_steps_per_epoch:{_train_steps_per_epoch}."
            )
            print(
                f"Validation data of rank[{hvd.local_rank()}]: val_rows:{val_rows}, val_batch_size:{val_batch_size}, _val_steps_per_epoch:{_val_steps_per_epoch}, should_validate:{should_validate}"
            )

            cuda_available = torch.cuda.is_available()
            # We need to check that all ranks have the same device type for training.
            # Horovod doesn't support heterogeneous allreduce for gradients.
            cuda_avail_list = hvd.allgather_object(cuda_available,
                                                   name='device type')
            if cuda_avail_list.count(cuda_available) != hvd.size():
                raise RuntimeError("All ranks don't have same device type!")

            if cuda_available:
                # Horovod: pin GPU to local rank or the assigned GPU from spark.
                torch.cuda.set_device(
                    _get_assigned_gpu_or_default(default=hvd.local_rank()))
                # Move model to GPU.
                model.cuda()

            _num_gpus = num_gpus
            if _num_gpus is None:
                _num_gpus = 1 if cuda_available else 0

            kwargs = {
                'accelerator': 'horovod',
                'gpus': _num_gpus,
                'callbacks': callbacks,
                'max_epochs': epochs,
                'logger': train_logger,
                'log_every_n_steps': log_every_n_steps,
                'resume_from_checkpoint':
                (last_ckpt_file if ckpt_bytes else None),
                'checkpoint_callback': is_model_checkpoint_callback_exist,
                'num_sanity_val_steps': 0,
                'reload_dataloaders_every_epoch': False,
                'progress_bar_refresh_rate': _train_steps_per_epoch // 10
            }
            print("Creating trainer with: \n ", kwargs)
            trainer = Trainer(**kwargs)

            print(f"pytorch_lightning version={pl.__version__}")

            # print row group
            # pq.ParquetFile(remote_store.train_data_path)
            # for rowgroup in range(pq_file.metadata.num_row_groups):
            #     row_group = pq_file.metadata.row_group(rowgroup)
            #     print(row_group)

            with set_data_loader(model, remote_store.train_data_path, 'train_dataloader',
                                 train_reader_worker_count, reader_pool_type, calculate_shuffle_buffer_size(),
                                 name="train_dataloader",
                                 limit_step_per_epoch=_train_steps_per_epoch), \
                    set_data_loader(model, remote_store.val_data_path, 'val_dataloader',
                                    val_reader_worker_count, reader_pool_type, 0,
                                    should_validate, name="val_dataloader",
                                    limit_step_per_epoch=_val_steps_per_epoch):

                trainer.fit(model)

            serialized_checkpoint = io.BytesIO()
            module = model if not is_legacy else model._model

            # TODO: find a way to pass trainer.logged_metrics out.
            output = {'model': module.state_dict()}

            torch.save(output, serialized_checkpoint)
            serialized_checkpoint.seek(0)
            return serialized_checkpoint
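
Examples #5 and #6 attach their dataloaders through context managers (set_data_loader / make_petastorm_reader) so that the Petastorm readers stay open for the entire trainer.fit() call. A rough sketch of that pattern, with an assumed signature and names made up for illustration (this is not the estimator's actual helper):

    from contextlib import contextmanager
    from petastorm import make_batch_reader
    from petastorm.pytorch import BatchedDataLoader

    @contextmanager
    def petastorm_data_loader(model, data_path, loader_attr, batch_size,
                              shuffling_queue_capacity=0, enabled=True):
        # When validation is disabled (or no path is given), act as a no-op.
        if not enabled or not data_path:
            yield
            return
        # num_epochs=None keeps the reader open indefinitely; the Trainer decides
        # how many steps to draw per epoch.
        with make_batch_reader(data_path, num_epochs=None) as reader:
            loader = BatchedDataLoader(
                reader,
                batch_size=batch_size,
                shuffling_queue_capacity=shuffling_queue_capacity)
            # Expose the loader to Lightning as, e.g., model.train_dataloader.
            setattr(model, loader_attr, lambda: loader)
            try:
                yield
            finally:
                delattr(model, loader_attr)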
Example #6
    def train(serialized_model):
        import horovod.torch as hvd
        # Horovod: initialize library.
        hvd.init()

        with tempfile.TemporaryDirectory(
        ) as last_ckpt_dir, remote_store.get_local_output_dir(
        ) as run_output_dir:
            last_ckpt_file = os.path.join(last_ckpt_dir, 'last.ckpt')
            if ckpt_bytes:
                with open(last_ckpt_file, 'wb') as f:
                    f.write(ckpt_bytes)

            # TODO: Pass the logger from estimator constructor
            logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)
            logger = TensorBoardLogger(logs_path)

            # TODO: find a way to use the ckpt_path created from the remote store while ingesting all other parameters from the estimator config
            # ckpt_path = os.path.join(run_output_dir, remote_store.checkpoint_filename)
            # os.makedirs(ckpt_path, exist_ok=True)
            # model_checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path)
            # callbacks.append(model_checkpoint_callback)

            is_model_checkpoint_callback_exist = False
            if callbacks is not None:
                for cb in callbacks:
                    if isinstance(cb, ModelCheckpoint):
                        is_model_checkpoint_callback_exist = True
                        break

            model = deserialize(serialized_model)

            _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else 1.0
            _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else 1.0

            cuda_available = torch.cuda.is_available()
            if cuda_available:
                # Horovod: pin GPU to local rank or the assigned GPU from spark.
                torch.cuda.set_device(
                    _get_assigned_gpu_or_default(default=hvd.local_rank()))
                # Move model to GPU.
                model.cuda()

            _num_gpus = num_gpus
            if _num_gpus is None:
                _num_gpus = 1 if cuda_available else 0

            kwargs = {
                'accelerator': 'horovod',
                'gpus': _num_gpus,
                'callbacks': callbacks,
                'max_epochs': epochs,
                'limit_train_batches': _train_steps_per_epoch,
                'limit_val_batches': _val_steps_per_epoch,
                'logger': logger,
                'resume_from_checkpoint':
                (last_ckpt_file if ckpt_bytes else None),
                'checkpoint_callback': is_model_checkpoint_callback_exist,
                'num_sanity_val_steps': 0,
                'reload_dataloaders_every_epoch': False
            }
            print("Creating trainer with: \n ", kwargs)
            trainer = Trainer(**kwargs)

            print(f"pytorch_lightning version={pl.__version__}")

            # print row group
            # pq.ParquetFile(remote_store.train_data_path)
            # for rowgroup in range(pq_file.metadata.num_row_groups):
            #     row_group = pq_file.metadata.row_group(rowgroup)
            #     print(row_group)

            with make_petastorm_reader(model, remote_store.train_data_path, 'train_dataloader',
                                       train_reader_worker_count, reader_pool_type), \
                    make_petastorm_reader(model, remote_store.val_data_path, 'val_dataloader',
                                          val_reader_worker_count, reader_pool_type, should_validate):

                trainer.fit(model)

            serialized_checkpoint = io.BytesIO()
            module = model if not is_legacy else model._model

            # TODO: find a way to pass trainer.logged_metrics out.
            output = {'model': module.state_dict()}

            torch.save(output, serialized_checkpoint)
            serialized_checkpoint.seek(0)
            return serialized_checkpoint
Example #7
    def train(serialized_model):
        import horovod.torch as hvd
        # Horovod: initialize library.
        hvd.init()

        with tempfile.TemporaryDirectory(
        ) as last_ckpt_dir, remote_store.get_local_output_dir(
        ) as run_output_dir:
            last_ckpt_file = os.path.join(last_ckpt_dir, 'last.ckpt')
            if ckpt_bytes:
                with open(last_ckpt_file, 'wb') as f:
                    f.write(ckpt_bytes)

            # TODO: Pass the logger from estimator constructor
            logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)

            # Use default logger if no logger is supplied
            train_logger = logger

            if train_logger is None:
                train_logger = TensorBoardLogger(logs_path)
            elif isinstance(train_logger,
                            CometLogger) and train_logger._save_dir is None:
                # Setting the CometLogger's save_dir allows us to sync checkpoints and profiler output
                train_logger._save_dir = logs_path

            # TODO: find a way to use the ckpt_path created from the remote store while ingesting all other parameters from the estimator config
            # ckpt_path = os.path.join(run_output_dir, remote_store.checkpoint_filename)
            # os.makedirs(ckpt_path, exist_ok=True)
            # model_checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path)
            # callbacks.append(model_checkpoint_callback)

            is_model_checkpoint_callback_exist = False
            for cb in callbacks:
                if isinstance(cb, ModelCheckpoint):
                    is_model_checkpoint_callback_exist = True
                    break

            if remote_store.saving_runs and hvd.rank() == 0:

                class _SyncCallback(Callback):
                    def on_epoch_end(self, trainer: "pl.Trainer",
                                     pl_module: "pl.LightningModule") -> None:
                        print("Syncing to remote_store.")
                        remote_store.sync(logs_path)

                callbacks.append(_SyncCallback())

            model = deserialize(serialized_model)

            _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
                int(math.floor(float(train_rows) / batch_size / hvd.size()))

            _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
                int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

            print(
                f"Training data of rank[{hvd.local_rank()}]: train_rows:{train_rows}, batch_size:{batch_size}, _train_steps_per_epoch:{_train_steps_per_epoch}."
            )

            cuda_available = torch.cuda.is_available()
            # We need to check that all ranks have the same device type for training.
            # Horovod doesn't support heterogeneous allreduce for gradients.
            cuda_avail_list = hvd.allgather_object(cuda_available,
                                                   name='device type')
            if cuda_avail_list.count(cuda_available) != hvd.size():
                raise RuntimeError("All ranks don't have same device type!")

            if cuda_available:
                # Horovod: pin GPU to local rank or the assigned GPU from spark.
                torch.cuda.set_device(
                    _get_assigned_gpu_or_default(default=hvd.local_rank()))
                # Move model to GPU.
                model.cuda()

            _num_gpus = num_gpus
            if _num_gpus is None:
                _num_gpus = 1 if cuda_available else 0

            kwargs = {
                'accelerator': 'horovod',
                'gpus': _num_gpus,
                'callbacks': callbacks,
                'max_epochs': epochs,
                'logger': train_logger,
                'log_every_n_steps': log_every_n_steps,
                'resume_from_checkpoint':
                (last_ckpt_file if ckpt_bytes else None),
                'checkpoint_callback': is_model_checkpoint_callback_exist,
                'num_sanity_val_steps': 0,
                'reload_dataloaders_every_epoch': False,
                'progress_bar_refresh_rate': _train_steps_per_epoch // 10,
                'terminate_on_nan': terminate_on_nan,
                'profiler': estimator.getProfiler()
            }
            print("Creating trainer with: \n ", kwargs)
            trainer = Trainer(**kwargs)

            print(f"pytorch_lightning version={pl.__version__}")

            dataset = data_module(
                train_dir=remote_store.train_data_path,
                val_dir=remote_store.val_data_path,
                num_train_epochs=epochs,
                has_val=should_validate is not None,
                train_batch_size=batch_size,
                val_batch_size=val_batch_size,
                shuffle_size=calculate_shuffle_buffer_size(),
                num_reader_epochs=loader_num_epochs,
                reader_pool_type=reader_pool_type,
                reader_worker_count=train_reader_worker_count,
                transform_spec=transformation,
                inmemory_cache_all=inmemory_cache_all,
                cur_shard=hvd.rank(),
                shard_count=hvd.size(),
                schema_fields=schema_fields,
                storage_options=storage_options,
                steps_per_epoch_train=_train_steps_per_epoch,
                steps_per_epoch_val=_val_steps_per_epoch,
                verbose=verbose)
            trainer.fit(model, dataset)

            serialized_checkpoint = io.BytesIO()
            module = model if not is_legacy else model._model

            # TODO: find a way to pass trainer.logged_metrics out.
            output = {'model': module.state_dict()}

            torch.save(output, serialized_checkpoint)
            serialized_checkpoint.seek(0)
            return serialized_checkpoint
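
All of these train() functions are meant to execute remotely on Spark executors. A hypothetical sketch of dispatching such a function with horovod.spark.run, assuming two workers and a serialized_model already produced on the driver (the estimators wire this up internally; this is only an illustration):

    import horovod.spark

    # Launch one copy of train() per Horovod process. run() returns a list with
    # each rank's return value; only rank 0 returns the serialized checkpoint.
    results = horovod.spark.run(train, args=(serialized_model,), num_proc=2)
    serialized_checkpoint = next(r for r in results if r is not None)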