def _fit_on_prepared_data(self, backend, train_rows, val_rows, metadata,
                          avg_row_size, dataset_idx=None):
    self._check_params(metadata)

    run_id = self.getRunId()
    if run_id is None:
        run_id = 'pytorch_' + str(int(time.time()))

    last_checkpoint_state = None
    if self._has_checkpoint(run_id):
        last_checkpoint_state = self._load_checkpoint(run_id)

    # Model parameters
    model_pre_train = self.getModel()
    model_state = model_pre_train.state_dict()
    serialized_model = serialize_fn()(model_pre_train)

    # Optimizer parameters
    optimizer = self._get_optimizer()
    optimizer_cls = optimizer.__class__
    optimizer_state = optimizer.state_dict()

    # Combine model and optimizer state
    model_opt_state = {'model': model_state, 'optimizer': optimizer_state} \
        if last_checkpoint_state is None else last_checkpoint_state
    model_opt_state_serialized = save_into_bio(model_opt_state, torch.save)

    trainer = remote.RemoteTrainer(self, metadata, last_checkpoint_state, run_id,
                                   dataset_idx)
    handle = backend.run(trainer,
                         args=(serialized_model, optimizer_cls,
                               model_opt_state_serialized, train_rows, val_rows,
                               avg_row_size),
                         env={})
    return self._create_model(handle, run_id, metadata)
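
# --- Illustrative sketch (not part of the estimator) ---
# The fit path above packs the model and optimizer state_dicts into an in-memory
# buffer with torch.save before shipping them to the remote trainer. save_into_bio
# is assumed to behave like the standalone helper below; this is a hedged sketch
# for clarity, not Horovod's implementation.
import io

import torch
import torch.nn as nn


def _save_into_bio_sketch(obj, save_obj_fn=torch.save):
    # Serialize an arbitrary object into a seekable in-memory buffer.
    bio = io.BytesIO()
    save_obj_fn(obj, bio)
    bio.seek(0)
    return bio


_sketch_model = nn.Linear(4, 2)  # stand-in model for the demo
_sketch_opt = torch.optim.SGD(_sketch_model.parameters(), lr=0.1)
_sketch_state = {'model': _sketch_model.state_dict(),
                 'optimizer': _sketch_opt.state_dict()}

_buf = _save_into_bio_sketch(_sketch_state)
_restored = torch.load(_buf)  # same structure the remote train() function unpacks
assert set(_restored.keys()) == {'model', 'optimizer'}
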

def _torch_param_serialize(param_name, param_val):
    if param_val is None:
        return None

    if param_name in [EstimatorParams.backend.name, EstimatorParams.store.name]:
        # We do not serialize backend and store. These params have to be regenerated
        # for each run of the pipeline.
        return None
    elif param_name == EstimatorParams.model.name:
        serialize = serialize_fn()
        return serialize(param_val)

    return codec.dumps_base64(param_val)
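
# --- Illustrative sketch (not part of the estimator) ---
# For parameters that are neither the model nor the backend/store, the serializer
# above falls back to codec.dumps_base64. Assuming that helper amounts to "pickle,
# then base64-encode so the value can be stored as a plain string", a minimal
# standard-library equivalent looks like this (sketch only; the real codec may use
# cloudpickle instead of pickle).
import base64
import pickle


def _dumps_base64_sketch(obj):
    return base64.b64encode(pickle.dumps(obj)).decode('ascii')


def _loads_base64_sketch(encoded):
    return pickle.loads(base64.b64decode(encoded))


_encoded = _dumps_base64_sketch({'batch_size': 32, 'epochs': 5})
assert _loads_base64_sketch(_encoded) == {'batch_size': 32, 'epochs': 5}
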

def _transform(self, df):
    model_pre_predict = self.getModel()
    model_pre_predict.eval()

    deserialize = deserialize_fn()
    serialize = serialize_fn()
    serialized_model = serialize(model_pre_predict)

    input_shapes = self.getInputShapes()
    label_cols = self.getLabelColumns()
    output_cols = self.getOutputCols()
    feature_cols = self.getFeatureColumns()
    metadata = self._get_metadata()

    def predict(rows):
        from pyspark import Row
        from pyspark.ml.linalg import DenseVector, SparseVector

        model = deserialize(serialized_model)

        # Perform predictions.
        for row in rows:
            fields = row.asDict().copy()

            # Note: if the col is SparseVector, torch.tensor(col) correctly converts
            # it to a dense torch tensor.
            data = [
                torch.tensor([row[col]]).reshape(shape)
                for col, shape in zip(feature_cols, input_shapes)
            ]

            with torch.no_grad():
                preds = model(*data)

            if not isinstance(preds, list) and not isinstance(preds, tuple):
                preds = [preds]

            for label_col, output_col, pred in zip(label_cols, output_cols, preds):
                meta = metadata[label_col]
                col_type = meta['spark_data_type']
                # dtype for dense and spark tensor is always np.float64
                if col_type == DenseVector:
                    shape = np.prod(pred.shape)
                    flattened_pred = pred.reshape(shape, )
                    field = DenseVector(flattened_pred)
                elif col_type == SparseVector:
                    shape = meta['shape']
                    flattened_pred = pred.reshape(shape, )
                    nonzero_indices = flattened_pred.nonzero()[0]
                    field = SparseVector(shape, nonzero_indices,
                                         flattened_pred[nonzero_indices])
                elif pred.shape.numel() == 1:
                    # If the column is a scalar type: int, float, etc.
                    value = pred.item()
                    python_type = util.spark_scalar_to_python_type(col_type)
                    if issubclass(python_type, numbers.Integral):
                        value = round(value)
                    field = python_type(value)
                else:
                    field = DenseVector(pred.reshape(-1))

                fields[output_col] = field

            yield Row(**fields)

    spark0 = SparkSession._instantiatedSession

    # Run predictions on a limited DF to infer the schema of the final DF.
    limited_pred_rdd = df.limit(100000).rdd.mapPartitions(predict)
    limited_pred_df = spark0.createDataFrame(limited_pred_rdd, samplingRatio=1)
    final_output_schema = limited_pred_df.schema

    # Spark has to infer whether a field is nullable or not from a limited number of
    # samples. It does not always get it right. We copy the nullable boolean for the
    # fields from the original dataframe to the final DF schema.
    nullables = {field.name: field.nullable for field in df.schema.fields}
    for field in final_output_schema.fields:
        if field.name in nullables:
            field.nullable = nullables[field.name]

    pred_rdd = df.rdd.mapPartitions(predict)

    # Use the schema from the previous section to construct the final DF with predictions.
    return spark0.createDataFrame(pred_rdd, schema=final_output_schema)

def _transform(self, df):
    import copy
    from pyspark.sql.types import StructField, StructType
    from pyspark.ml.linalg import VectorUDT

    model_pre_predict = self.getModel()
    model_pre_predict.eval()

    deserialize = deserialize_fn()
    serialize = serialize_fn()
    serialized_model = serialize(model_pre_predict)

    input_shapes = self.getInputShapes()
    label_cols = self.getLabelColumns()
    output_cols = self.getOutputCols()
    feature_cols = self.getFeatureColumns()
    metadata = self._get_metadata()

    final_output_cols = util.get_output_cols(df.schema, output_cols)

    def predict(rows):
        from pyspark import Row
        from pyspark.ml.linalg import DenseVector, SparseVector

        model = deserialize(serialized_model)

        # Perform predictions.
        for row in rows:
            fields = row.asDict().copy()

            # Note: if the col is SparseVector, torch.tensor(col) correctly converts
            # it to a dense torch tensor.
            data = [
                torch.tensor([row[col]]).reshape(shape)
                for col, shape in zip(feature_cols, input_shapes)
            ]

            with torch.no_grad():
                preds = model(*data)

            if not isinstance(preds, list) and not isinstance(preds, tuple):
                preds = [preds]

            for label_col, output_col, pred in zip(label_cols, output_cols, preds):
                meta = metadata[label_col]
                col_type = meta['spark_data_type']
                # dtype for dense and spark tensor is always np.float64
                if col_type == DenseVector:
                    shape = np.prod(pred.shape)
                    flattened_pred = pred.reshape(shape, )
                    field = DenseVector(flattened_pred)
                elif col_type == SparseVector:
                    shape = meta['shape']
                    flattened_pred = pred.reshape(shape, )
                    nonzero_indices = flattened_pred.nonzero()[0]
                    field = SparseVector(shape, nonzero_indices,
                                         flattened_pred[nonzero_indices])
                elif pred.shape.numel() == 1:
                    # If the column is a scalar type: int, float, etc.
                    value = pred.item()
                    python_type = util.spark_scalar_to_python_type(col_type)
                    if issubclass(python_type, numbers.Integral):
                        value = round(value)
                    field = python_type(value)
                else:
                    field = DenseVector(pred.reshape(-1))

                fields[output_col] = field

            values = [fields[col] for col in final_output_cols]
            yield Row(*values)

    spark0 = SparkSession._instantiatedSession

    final_output_fields = []

    # Copy input schema.
    for field in df.schema.fields:
        final_output_fields.append(copy.deepcopy(field))

    # Append output schema.
    override_fields = df.limit(1).rdd.mapPartitions(predict).toDF() \
        .schema.fields[-len(output_cols):]
    for name, override, label in zip(output_cols, override_fields, label_cols):
        # Default data type is the label type.
        data_type = metadata[label]['spark_data_type']()
        if type(override.dataType) == VectorUDT:
            # Override output to vector. This is mainly for torch's classification loss
            # where the label is a scalar but the model output is a vector.
            data_type = VectorUDT()
        final_output_fields.append(
            StructField(name=name, dataType=data_type, nullable=True))

    final_output_schema = StructType(final_output_fields)

    pred_rdd = df.rdd.mapPartitions(predict)

    # Use the schema from the previous section to construct the final DF with predictions.
    return spark0.createDataFrame(pred_rdd, schema=final_output_schema)

def _transform(self, df):
    model_pre_predict = self.getModel()
    model_pre_predict.eval()

    deserialize = deserialize_fn()
    serialize = serialize_fn()
    serialized_model = serialize(model_pre_predict)

    input_shapes = self.getInputShapes()
    label_cols = self.getLabelColumns()
    output_cols = self.getOutputCols()
    feature_cols = self.getFeatureColumns()
    metadata = self._get_metadata()

    def predict(rows):
        from pyspark import Row
        from pyspark.ml.linalg import DenseVector, SparseVector

        model = deserialize(serialized_model)

        # Perform predictions.
        for row in rows:
            fields = row.asDict().copy()

            # Note: if the col is SparseVector, torch.tensor(col) correctly converts
            # it to a dense torch tensor.
            data = [
                torch.tensor([row[col]]).reshape(shape)
                for col, shape in zip(feature_cols, input_shapes)
            ]

            with torch.no_grad():
                preds = model(*data)

            if not isinstance(preds, list) and not isinstance(preds, tuple):
                preds = [preds]

            for label_col, output_col, pred in zip(label_cols, output_cols, preds):
                meta = metadata[label_col]
                col_type = meta['spark_data_type']
                # dtype for dense and spark tensor is always np.float64
                if col_type == DenseVector:
                    shape = np.prod(pred.shape)
                    flattened_pred = pred.reshape(shape, )
                    field = DenseVector(flattened_pred)
                elif col_type == SparseVector:
                    shape = meta['shape']
                    flattened_pred = pred.reshape(shape, )
                    nonzero_indices = flattened_pred.nonzero()[0]
                    field = SparseVector(shape, nonzero_indices,
                                         flattened_pred[nonzero_indices])
                elif pred.shape.numel() == 1:
                    # If the column is a scalar type: int, float, etc.
                    value = pred.item()
                    python_type = util.spark_scalar_to_python_type(col_type)
                    if issubclass(python_type, numbers.Integral):
                        value = round(value)
                    field = python_type(value)
                else:
                    field = DenseVector(pred.reshape(-1))

                fields[output_col] = field

            yield Row(**fields)

    return df.rdd.mapPartitions(predict).toDF()
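
# --- Illustrative sketch (not part of the transformer) ---
# The predict() generators above all share the same output-conversion rules:
# vector label columns become DenseVector/SparseVector, scalar label columns become
# plain Python scalars (rounded for integral types), and anything else falls back
# to a flattened DenseVector. The helper below restates those rules in isolation;
# its name and the plain Python col_type argument are assumptions for illustration,
# not the transformer's API.
import numbers

import numpy as np
import torch
from pyspark.ml.linalg import DenseVector, SparseVector


def _convert_prediction_sketch(pred, col_type):
    # pred: torch tensor produced by the model for a single row
    # col_type: DenseVector, SparseVector, or a Python scalar type (int, float, ...)
    if col_type == DenseVector:
        return DenseVector(pred.reshape(-1).numpy())
    if col_type == SparseVector:
        flat = pred.reshape(-1).numpy()
        nonzero = np.nonzero(flat)[0]
        return SparseVector(len(flat), nonzero, flat[nonzero])
    if pred.numel() == 1:
        value = pred.item()
        if issubclass(col_type, numbers.Integral):
            value = int(round(value))
        return col_type(value)
    return DenseVector(pred.reshape(-1).numpy())


assert _convert_prediction_sketch(torch.tensor([2.6]), int) == 3
print(_convert_prediction_sketch(torch.tensor([[0.2, 0.0, 0.8]]), SparseVector))
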

def RemoteTrainer(estimator, metadata, last_checkpoint_state, run_id, dataset_idx):
    # Estimator parameters
    loss_fns_pre_train = estimator.getLoss()
    loss_constructors = estimator.getLossConstructors()
    gradient_compression = estimator.getGradientCompression()
    input_shapes = estimator.getInputShapes()
    feature_columns = estimator.getFeatureCols()
    label_columns = estimator.getLabelCols()
    should_validate = estimator.getValidation()
    batch_size = estimator.getBatchSize()
    epochs = estimator.getEpochs()
    train_steps_per_epoch = estimator.getTrainStepsPerEpoch()
    validation_steps_per_epoch = estimator.getValidationStepsPerEpoch()
    sample_weight_col = estimator.getSampleWeightCol()
    metric_fn_groups = estimator.getMetrics()
    user_shuffle_buffer_size = estimator.getShufflingBufferSize()
    user_verbose = estimator.getVerbose()
    train_minibatch_fn = estimator.getTrainMinibatchFn()
    train_minibatch = train_minibatch_fn if train_minibatch_fn else _train_minibatch_fn()

    # If loss weights are not provided, weight all labels equally.
    label_columns = estimator.getLabelCols()
    loss_weights = estimator.getLossWeights()
    if not loss_weights:
        num_labels = len(label_columns)
        loss_weights = [float(1) / num_labels for _ in range(num_labels)]

    # Utility functions
    serialize = serialize_fn()
    deserialize = deserialize_fn()
    get_optimizer_with_unscaled_lr = _get_optimizer_with_unscaled_lr_fn()
    calculate_shuffle_buffer_size = _calculate_shuffle_buffer_size_fn()
    construct_metric_value_holders = _construct_metric_value_holders_fn()
    metric_cls = _metric_cls()
    prepare_np_data = _prepare_np_data_fn()
    get_metric_avgs = _get_metric_avgs_fn()
    update_metrics = _update_metrics_fn(metric_fn_groups)
    write_metrics_summary = _write_metrics_summary_fn()
    calculate_loss = _calculate_loss_fn()

    # Storage
    store = estimator.getStore()
    remote_store = store.to_remote(run_id, dataset_idx)

    @contextlib.contextmanager
    def empty_batch_reader():
        yield None

    def train(serialized_model, optimizer_cls, model_opt_state_serialized,
              train_rows, val_rows, avg_row_size):
        from petastorm import make_batch_reader
        from petastorm.pytorch import DataLoader
        import torch
        import horovod.torch as hvd

        # Deserialize objects.
        model_opt_state = torch.load(model_opt_state_serialized)
        model = deserialize(serialized_model)

        if loss_fns_pre_train:
            loss_fns = loss_fns_pre_train
        if loss_constructors:
            local_vars = locals()
            loss_fns = [
                loss_constructor(**local_vars)
                for loss_constructor in loss_constructors
            ]

        # Horovod: initialize library.
        hvd.init()

        if not user_shuffle_buffer_size:
            shuffle_buffer_size = \
                calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
        else:
            shuffle_buffer_size = user_shuffle_buffer_size

        cuda_available = torch.cuda.is_available()
        if cuda_available:
            # Horovod: pin GPU to local rank.
            torch.cuda.set_device(hvd.local_rank())
            # Move model to GPU.
            model.cuda()

        # The optimizer object needs to be re-instantiated. Internally, it uses memory
        # addresses of objects as their identity and therefore it cannot be serialized
        # and then deserialized. The deserialized optimizer object stores the names of
        # the parameters with their old memory addresses, but in reality those differ
        # from the reconstructed deserialized object, and that creates problems.
        # Learning rate is a required parameter for the SGD optimizer. It will be
        # overridden by load_state_dict.
        optimizer = optimizer_cls(model.parameters(), lr=1)
        optimizer_state = model_opt_state['optimizer']

        if last_checkpoint_state is not None:
            model.load_state_dict(last_checkpoint_state['model'])
            optimizer.load_state_dict(last_checkpoint_state['optimizer'])
        else:
            # Scale the learning rate with the number of horovod workers.
            for i in range(len(optimizer_state['param_groups'])):
                optimizer_state['param_groups'][i]['lr'] = \
                    optimizer_state['param_groups'][i]['lr'] * hvd.size()

            optimizer.load_state_dict(optimizer_state)

        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        for group in optimizer.param_groups:
            for p in group['params']:
                if id(p) not in optimizer.state_dict()['state']:
                    p.grad = p.data.new(p.size()).zero_()
        optimizer.step()
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        dist_optimizer_args = dict(optimizer=optimizer,
                                   named_parameters=model.named_parameters())
        if gradient_compression:
            # Pass the compression arg only if it is specified by the user.
            dist_optimizer_args['compression'] = gradient_compression
        # Horovod: wrap optimizer with DistributedOptimizer.
        optimizer = hvd.DistributedOptimizer(**dist_optimizer_args)

        # get_optimizer_with_unscaled_lr (used below) takes the current optimizer and
        # constructs a new optimizer with the same state, except with the learning rate
        # scaled down by the number of horovod workers. This is important for retraining
        # the model: the user may retrain with a different number of workers, and we need
        # the raw learning rate to adjust to the new worker count.

        schema_fields = feature_columns + label_columns
        if sample_weight_col:
            schema_fields.append(sample_weight_col)

        if train_steps_per_epoch is None:
            steps_per_epoch = int(math.ceil(float(train_rows) / batch_size / hvd.size()))
        else:
            steps_per_epoch = train_steps_per_epoch

        with remote_store.get_local_output_dir() as run_output_dir:
            logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)
            log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None
            ckpt_file = os.path.join(run_output_dir, remote_store.checkpoint_filename)

            def save_checkpoint():
                model.cpu()
                state = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                torch.save(state, ckpt_file)
                if cuda_available:
                    model.cuda()

            # Petastorm: read data from the store with the correct shard for this rank.
            # Setting num_epochs=None will cause an infinite iterator and enables ranks
            # to perform training and validation with an unequal number of samples.
            with make_batch_reader(remote_store.train_data_path,
                                   num_epochs=None,
                                   cur_shard=hvd.rank(),
                                   shard_count=hvd.size(),
                                   hdfs_driver=PETASTORM_HDFS_DRIVER,
                                   schema_fields=schema_fields) as train_reader:
                with make_batch_reader(remote_store.val_data_path,
                                       num_epochs=None,
                                       cur_shard=hvd.rank(),
                                       shard_count=hvd.size(),
                                       hdfs_driver=PETASTORM_HDFS_DRIVER,
                                       schema_fields=schema_fields) \
                        if should_validate else empty_batch_reader() as val_reader:

                    train_loader = DataLoader(
                        train_reader,
                        batch_size=batch_size,
                        shuffling_queue_capacity=shuffle_buffer_size)
                    train_loader_iter = iter(train_loader)

                    def prepare_batch(row):
                        inputs = [
                            prepare_np_data(row[col].float(), col, metadata).reshape(shape)
                            for col, shape in zip(feature_columns, input_shapes)
                        ]
                        labels = [
                            prepare_np_data(row[col].float(), col, metadata)
                            for col in label_columns
                        ]

                        sample_weights = row.get(sample_weight_col, None)
                        if cuda_available:
                            inputs = [input.cuda() for input in inputs]
                            labels = [label.cuda() for label in labels]
                        return inputs, labels, sample_weights

                    def transform_outputs(outputs, labels):
                        if type(outputs) != tuple and type(outputs) != list:
                            outputs = [outputs]

                        # Reshape labels to match the output shape of the model.
                        if hasattr(outputs[0], 'shape'):
                            labels = [
                                label.reshape(output.shape)
                                if output.shape.numel() == label.shape.numel() else label
                                for label, output in zip(labels, outputs)
                            ]
                        return outputs, labels

                    def aggregate_metrics(stage, epoch, loss, metric_value_groups):
                        all_metric_groups_values = get_metric_avgs(metric_value_groups)
                        if remote_store.saving_runs:
                            write_metrics_summary(stage, epoch, loss,
                                                  all_metric_groups_values, log_writer)
                        return {
                            loss.name: loss.avg.item(),
                            'all_metrics': all_metric_groups_values
                        }

                    def loss_fn(outputs, labels, sample_weights):
                        loss = calculate_loss(outputs, labels, loss_weights, loss_fns,
                                              sample_weights)
                        return loss

                    def print_metrics(batch_idx, loss, metric_value_groups, phase):
                        if user_verbose > 0 and hvd.rank() == 0 and \
                                batch_idx % METRIC_PRINT_FREQUENCY == 0:
                            print("epoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}".format(
                                epoch=epoch,
                                batch_idx=batch_idx,
                                metrics=aggregate_metrics(phase, epoch, loss,
                                                          metric_value_groups)))

                    def _train(epoch):
                        model.train()
                        train_loss = metric_cls('loss', hvd)
                        metric_value_groups = construct_metric_value_holders(
                            metric_cls, metric_fn_groups, label_columns, hvd)

                        # Iterate over one epoch.
                        for batch_idx in range(steps_per_epoch):
                            row = next(train_loader_iter)
                            inputs, labels, sample_weights = prepare_batch(row)

                            outputs, loss = train_minibatch(model, optimizer,
                                                            transform_outputs, loss_fn,
                                                            inputs, labels, sample_weights)
                            update_metrics(metric_value_groups, outputs, labels)
                            train_loss.update(loss)
                            print_metrics(batch_idx, train_loss, metric_value_groups, 'train')

                        return aggregate_metrics('train', epoch, train_loss,
                                                 metric_value_groups)

                    if should_validate:
                        val_loader = DataLoader(val_reader, batch_size=batch_size)
                        val_loader_iter = iter(val_loader)
                        if validation_steps_per_epoch is None:
                            validation_steps = int(
                                math.ceil(float(val_rows) / batch_size / hvd.size()))
                        else:
                            validation_steps = validation_steps_per_epoch

                        def _validate(epoch):
                            model.eval()
                            val_loss = metric_cls('loss', hvd)

                            metric_value_groups = construct_metric_value_holders(
                                metric_cls, metric_fn_groups, label_columns, hvd)

                            # Iterate over one epoch.
                            for batch_idx in range(validation_steps):
                                row = next(val_loader_iter)
                                inputs, labels, sample_weights = prepare_batch(row)

                                outputs = model(*inputs)
                                outputs, labels = transform_outputs(outputs, labels)

                                loss = calculate_loss(outputs, labels, loss_weights,
                                                      loss_fns, sample_weights)
                                val_loss.update(loss)
                                update_metrics(metric_value_groups, outputs, labels)
                                print_metrics(batch_idx, val_loss, metric_value_groups, 'val')

                            return aggregate_metrics('val', epoch, val_loss,
                                                     metric_value_groups)

                    history = []
                    for epoch in range(epochs):
                        epoch_metrics = {
                            'epoch': epoch,
                            'train': _train(epoch)
                        }
                        if should_validate:
                            epoch_metrics['validation'] = _validate(epoch)
                        if user_verbose > 0:
                            print(epoch_metrics)
                        history.append(epoch_metrics)
                        if hvd.rank() == 0:
                            # Save the model after every epoch.
                            save_checkpoint()
                            if remote_store.saving_runs:
                                remote_store.sync(run_output_dir)

                    if hvd.rank() == 0:
                        best_checkpoint = torch.load(ckpt_file)
                        model.load_state_dict(best_checkpoint['model'])
                        optimizer.load_state_dict(best_checkpoint['optimizer'])

                        # Need to move the model to cpu before serialization. Otherwise,
                        # deserialization will fail if the machine on which the
                        # deserialization happens does not have a gpu.
                        model.cpu()
                        optimizer_with_unscaled_lr = \
                            get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls,
                                                           model)
                        bio_opt = io.BytesIO()
                        torch.save(optimizer_with_unscaled_lr, bio_opt)
                        bio_opt.seek(0)
                        return history, serialize(model), serialize(bio_opt)

    return train
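
# --- Illustrative sketch (not part of the trainer) ---
# The trainer above scales the restored learning rate up by hvd.size() before
# training and hands back an optimizer with the learning rate scaled down again
# (via get_optimizer_with_unscaled_lr), so that a later retrain can rescale it for
# a different worker count. Assuming that helper simply divides each param group's
# lr by the world size and rebuilds the optimizer, the logic reduces to the sketch
# below (illustrative names, not Horovod API).
import torch
import torch.nn as nn


def _scale_lr_sketch(optimizer_state, world_size):
    # Applied when loading state: multiply each param group's lr by the worker count.
    for group in optimizer_state['param_groups']:
        group['lr'] *= world_size
    return optimizer_state


def _unscale_lr_sketch(optimizer, optimizer_cls, model, world_size):
    # Applied before returning the trained optimizer: divide the lr back down.
    state = optimizer.state_dict()
    for group in state['param_groups']:
        group['lr'] /= world_size
    # Re-instantiate rather than mutate: optimizer identity is tied to the live
    # parameter objects, so a fresh instance is built from the model's parameters.
    unscaled = optimizer_cls(model.parameters(), lr=1)
    unscaled.load_state_dict(state)
    return unscaled


_lr_model = nn.Linear(2, 1)
_lr_opt = torch.optim.SGD(_lr_model.parameters(), lr=0.4)  # lr as scaled for 4 workers
_lr_unscaled = _unscale_lr_sketch(_lr_opt, torch.optim.SGD, _lr_model, world_size=4)
assert _lr_unscaled.state_dict()['param_groups'][0]['lr'] == 0.1
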