def _create_rl_train_net(self) -> None:
    self.rl_train_model = ModelHelper(name="rl_train_" + self.model_id)
    C2.set_model(self.rl_train_model)

    if self.maxq_learning:
        next_q_values = self.get_max_q_values(
            'next_states',
            self.get_possible_next_actions(),
            True,
        )
    else:
        next_q_values = self.get_q_values('next_states', 'next_actions', True)

    q_vals_target = C2.Add(
        'rewards',
        C2.Mul(
            C2.Mul(
                C2.Cast(
                    'not_terminals', to=caffe2_pb2.TensorProto.FLOAT
                ),  # type: ignore
                self.rl_discount_rate,
                broadcast=1,
            ),
            next_q_values,
        ),
    )

    self.update_model('states', 'actions', q_vals_target)
    workspace.RunNetOnce(self.rl_train_model.param_init_net)
    workspace.CreateNet(self.rl_train_model.net)
    C2.set_model(None)
def _create_rl_train_net(self) -> None:
    self.rl_train_model = ModelHelper(name="rl_train_" + self.model_id)
    C2.set_model(self.rl_train_model)

    if self.reward_shape is not None:
        for action_index, boost in self.reward_shape.items():
            action_boost = C2.Mul(
                C2.Slice(
                    "actions", starts=[0, action_index], ends=[-1, action_index + 1]
                ),
                boost,
                broadcast=1,
            )
            C2.net().Sum(["rewards", action_boost], ["rewards"])

    if self.maxq_learning:
        next_q_values = self.get_max_q_values(
            "next_states", self.get_possible_next_actions(), True
        )
    else:
        next_q_values = self.get_q_values("next_states", "next_actions", True)

    discount_blob = C2.ConstantFill("time_diff", value=self.rl_discount_rate)
    if self.use_seq_num_diff_as_time_diff:
        time_diff_adjusted_discount_blob = C2.Pow(
            discount_blob, C2.Cast("time_diff", to=caffe2_pb2.TensorProto.FLOAT)
        )
    else:
        time_diff_adjusted_discount_blob = discount_blob

    q_vals_target = C2.Add(
        "rewards",
        C2.Mul(
            C2.Mul(
                C2.Cast(
                    "not_terminals", to=caffe2_pb2.TensorProto.FLOAT
                ),  # type: ignore
                time_diff_adjusted_discount_blob,
                broadcast=1,
            ),
            next_q_values,
        ),
    )

    self.update_model("states", "actions", q_vals_target)
    workspace.RunNetOnce(self.rl_train_model.param_init_net)
    self.rl_train_model.net.Proto().num_workers = (
        RLTrainer.DEFAULT_TRAINING_NUM_WORKERS
    )
    self.rl_train_model.net.Proto().type = "async_scheduling"
    workspace.CreateNet(self.rl_train_model.net)
    C2.set_model(None)
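# Illustrative sketch (not part of the trainer): the target blob built above is
# rewards + gamma^time_diff * not_terminal * next_q_values when
# use_seq_num_diff_as_time_diff is set, and rewards + gamma * not_terminal *
# next_q_values otherwise. The same arithmetic in plain NumPy, with made-up
# inputs and a hypothetical helper name:
import numpy as np

def td_target_sketch(rewards, not_terminals, next_q_values, time_diff, gamma,
                     use_seq_num_diff=True):
    """Compute r + gamma^dt * (1 - terminal) * Q_next elementwise."""
    discount = np.power(gamma, time_diff) if use_seq_num_diff else gamma
    return rewards + discount * not_terminals.astype(np.float32) * next_q_values

# Example: a batch of two transitions, the second one terminal.
# td_target_sketch(np.array([1.0, 0.5]), np.array([1, 0]), np.array([2.0, 3.0]),
#                  np.array([1, 2]), gamma=0.9) -> [2.8, 0.5]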
def get_max_q_values(
    self, states: str, possible_actions: str, use_target_network: bool
) -> str:
    """
    Takes in an array of states and outputs an array of the same shape
    whose ith entry = max_{pna} Q(state_i, pna).

    :param states: Numpy array with shape (batch_size, state_dim). Each row
        contains a representation of a state.
    :param possible_actions: Numpy array with shape (batch_size, action_dim).
        possible_actions[i][j] = 1 iff the agent can take action j from
        state i.
    :param use_target_network: Boolean that indicates whether or not to use
        this trainer's TargetNetwork to compute Q values.
    """
    q_values = self.get_q_values_all_actions(states, use_target_network)

    # Set the q values of impossible actions to a very large negative number.
    inverse_pna = C2.ConstantFill(possible_actions, value=1.0)
    possible_actions_float = C2.Cast(possible_actions, to=core.DataType.FLOAT)
    inverse_pna = C2.Sub(inverse_pna, possible_actions_float)
    inverse_pna = C2.Mul(inverse_pna, self.ACTION_NOT_POSSIBLE_VAL, broadcast=1)
    q_values = C2.Add(q_values, inverse_pna)

    q_values_max = C2.ReduceBackMax(q_values, num_reduce_dims=1)
    return C2.ExpandDims(q_values_max, dims=[1])
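# Illustrative sketch (not part of the trainer): the masking above adds
# ACTION_NOT_POSSIBLE_VAL (a very large negative constant) to the Q values of
# actions the agent cannot take, so the row-wise max ignores them. The value
# and helper name below are assumptions for illustration only. In NumPy:
import numpy as np

ACTION_NOT_POSSIBLE_VAL = -1e9  # assumed stand-in for the trainer's constant

def max_q_over_possible_actions_sketch(q_values, possible_actions):
    """q_values, possible_actions: (batch_size, action_dim) arrays."""
    masked = q_values + (1.0 - possible_actions) * ACTION_NOT_POSSIBLE_VAL
    return masked.max(axis=1, keepdims=True)

# max_q_over_possible_actions_sketch(np.array([[1.0, 5.0]]), np.array([[1.0, 0.0]]))
# -> [[1.0]], because action 1 is not possible in that state.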
def update_model(self, states: str, actions: str, q_vals_target: str) -> None:
    """
    Takes in states, actions, and target q values. Updates the model:

    - Runs the forward pass, computing Q(states, actions).
      Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
    - Computes the loss of Q(states, actions) with respect to q_vals_target.
    - Updates the Q network's weights according to the loss and optimizer.

    :param states: Numpy array with shape (batch_size, state_dim). The ith
        row is a representation of the ith transition's state.
    :param actions: Numpy array with shape (batch_size, action_dim). The ith
        row contains the one-hotted representation of the ith action.
    :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
        row is the label to train against for the data from the ith
        transition.
    """
    model = C2.model()
    q_vals_target = C2.StopGradient(q_vals_target)
    output_blob = C2.NextBlob("train_output")
    if self.conv_ml_trainer is not None:
        conv_output_blob = C2.NextBlob("conv_output")
        self.conv_ml_trainer.make_conv_pass_ops(model, states, conv_output_blob)
        states = conv_output_blob
    self.ml_trainer.make_forward_pass_ops(model, states, output_blob, False)
    q_val_select = C2.ReduceBackSum(C2.Mul(output_blob, actions))
    q_values = C2.ExpandDims(q_val_select, dims=[1])

    self.loss_blob = self.ml_trainer.generateLossOps(model, q_values, q_vals_target)
    model.AddGradientOperators([self.loss_blob])
    for param in model.params:
        if param in model.param_to_grad:
            param_grad = model.param_to_grad[param]
            param_grad = C2.NanCheck(param_grad)
    self.ml_trainer.addParameterUpdateOps(model)
def update_model(
    self, states: str, actions: str, q_vals_target: str
) -> None:
    """
    Takes in states, actions, and target q values. Updates the model:

    - Runs the forward pass, computing Q(states, actions).
      Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
    - Computes the loss of Q(states, actions) with respect to q_vals_target.
    - Updates the Q network's weights according to the loss and optimizer.

    :param states: Numpy array with shape (batch_size, state_dim). The ith
        row is a representation of the ith transition's state.
    :param actions: Numpy array with shape (batch_size, action_dim). The ith
        row contains the one-hotted representation of the ith action.
    :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
        row is the label to train against for the data from the ith
        transition.
    """
    model = C2.model()
    q_vals_target = C2.StopGradient(q_vals_target)
    output_blob = C2.NextBlob("train_output")
    MakeForwardPassOps(
        model,
        self.model_id,
        states,
        output_blob,
        self.weights,
        self.biases,
        self.activations,
        self.layers,
        self.dropout_ratio,
        False,
    )
    q_val_select = C2.ReduceBackSum(C2.Mul(output_blob, actions))
    q_values = C2.ExpandDims(q_val_select, dims=[1])

    self.loss_blob = GenerateLossOps(model, q_values, q_vals_target)
    model.AddGradientOperators([self.loss_blob])
    for param in model.params:
        if param in model.param_to_grad:
            param_grad = model.param_to_grad[param]
            param_grad = C2.NanCheck(param_grad)
    AddParameterUpdateOps(
        model,
        optimizer_input=self.optimizer,
        base_learning_rate=self.learning_rate,
        gamma=self.gamma,
        policy=self.lr_policy,
    )
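# Illustrative sketch (not part of the trainer): ReduceBackSum(Mul(output, actions))
# above selects Q(s, a) for the taken action by multiplying the full Q-value row
# with the one-hot action vector and summing. Equivalent NumPy, with made-up inputs:
import numpy as np

q_all = np.array([[0.1, 0.7, 0.2], [0.4, 0.3, 0.9]], dtype=np.float32)
actions_one_hot = np.array([[0, 1, 0], [0, 0, 1]], dtype=np.float32)
q_taken = (q_all * actions_one_hot).sum(axis=1, keepdims=True)  # [[0.7], [0.9]]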
def _create_rl_train_net(self) -> None:
    self.rl_train_model = ModelHelper(name="rl_train_" + self.model_id)
    C2.set_model(self.rl_train_model)

    if self.reward_shape is not None:
        for action_index, boost in self.reward_shape.items():
            action_boost = C2.Mul(
                C2.Slice(
                    'actions',
                    starts=[0, action_index],
                    ends=[-1, action_index + 1],
                ),
                boost,
                broadcast=1,
            )
            C2.net().Sum(['rewards', action_boost], ['rewards'])

    if self.maxq_learning:
        next_q_values = self.get_max_q_values(
            'next_states',
            self.get_possible_next_actions(),
            True,
        )
    else:
        next_q_values = self.get_q_values('next_states', 'next_actions', True)

    q_vals_target = C2.Add(
        'rewards',
        C2.Mul(
            C2.Mul(
                C2.Cast(
                    'not_terminals', to=caffe2_pb2.TensorProto.FLOAT
                ),  # type: ignore
                self.rl_discount_rate,
                broadcast=1,
            ),
            next_q_values,
        ),
    )

    self.update_model('states', 'actions', q_vals_target)
    workspace.RunNetOnce(self.rl_train_model.param_init_net)
    workspace.CreateNet(self.rl_train_model.net)
    C2.set_model(None)
def _create_rl_train_net(self) -> None:
    self.rl_train_model = ModelHelper(name="rl_train_" + self.model_id)
    C2.set_model(self.rl_train_model)

    if self.maxq_learning:
        next_q_values = self.get_max_q_values(
            'next_states',
            self.get_possible_next_actions(),
            True,
        )
    else:
        next_q_values = self.get_q_values('next_states', 'next_actions', True)

    discount_blob = C2.ConstantFill("time_diff", value=self.rl_discount_rate)
    time_diff_adjusted_discount_blob = C2.Pow(
        discount_blob, C2.Cast("time_diff", to=caffe2_pb2.TensorProto.FLOAT)
    )

    q_vals_target = C2.Add(
        "rewards",
        C2.Mul(
            C2.Mul(
                C2.Cast(
                    "not_terminals", to=caffe2_pb2.TensorProto.FLOAT
                ),  # type: ignore
                time_diff_adjusted_discount_blob,
                broadcast=1,
            ),
            next_q_values,
        ),
    )

    self.update_model('states', 'actions', q_vals_target)
    workspace.RunNetOnce(self.rl_train_model.param_init_net)
    self.rl_train_model.net.Proto().num_workers = \
        RLTrainer.DEFAULT_TRAINING_NUM_WORKERS
    workspace.CreateNet(self.rl_train_model.net)
    C2.set_model(None)
def _create_reward_train_net(self) -> None:
    self.reward_train_model = ModelHelper(name="reward_train_" + self.model_id)
    C2.set_model(self.reward_train_model)

    if self.reward_shape is not None:
        for action_index, boost in self.reward_shape.items():
            action_boost = C2.Mul(
                C2.Slice(
                    "actions", starts=[0, action_index], ends=[-1, action_index + 1]
                ),
                boost,
                broadcast=1,
            )
            C2.net().Sum(["rewards", action_boost], ["rewards"])

    self.update_model("states", "actions", "rewards")
    workspace.RunNetOnce(self.reward_train_model.param_init_net)
    workspace.CreateNet(self.reward_train_model.net)
    C2.set_model(None)
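# Illustrative sketch (not part of the trainer): the reward-shaping loop above
# adds a per-action boost to the reward whenever that action was taken, i.e.
# rewards += actions[:, action_index] * boost for each (action_index, boost)
# pair in reward_shape. In NumPy, with made-up inputs and a hypothetical helper:
import numpy as np

def shape_rewards_sketch(rewards, actions_one_hot, reward_shape):
    rewards = rewards.copy()
    for action_index, boost in reward_shape.items():
        rewards += actions_one_hot[:, action_index] * boost
    return rewards

# shape_rewards_sketch(np.array([1.0, 1.0]),
#                      np.array([[1.0, 0.0], [0.0, 1.0]]),
#                      {0: 0.5}) -> [1.5, 1.0]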
def _create_reward_train_net(self) -> None:
    self.reward_train_model = ModelHelper(name="reward_train_" + self.model_id)
    C2.set_model(self.reward_train_model)

    if self.reward_shape is not None:
        for action_index, boost in self.reward_shape.items():
            action_boost = C2.Mul(
                C2.Slice(
                    "actions", starts=[0, action_index], ends=[-1, action_index + 1]
                ),
                boost,
                broadcast=1,
            )
            C2.net().Sum(["rewards", action_boost], ["rewards"])

    self.update_model("states", "actions", "rewards")
    workspace.RunNetOnce(self.reward_train_model.param_init_net)
    self.reward_train_model.net.Proto().num_workers = (
        RLTrainer.DEFAULT_TRAINING_NUM_WORKERS
    )
    self.reward_train_model.net.Proto().type = "async_scheduling"
    workspace.CreateNet(self.reward_train_model.net)
    C2.set_model(None)
def export_actor(
    cls,
    trainer,
    state_normalization_parameters,
    action_feature_ids,
    min_action_range_tensor_serving,
    max_action_range_tensor_serving,
    int_features=False,
    model_on_gpu=False,
):
    """Export caffe2 preprocessor net and pytorch actor forward pass as one
    caffe2 net.

    :param trainer DDPGTrainer
    :param state_normalization_parameters state NormalizationParameters
    :param action_feature_ids ids of the actor's output features
    :param min_action_range_tensor_serving pytorch tensor that specifies
        min action value for each dimension
    :param max_action_range_tensor_serving pytorch tensor that specifies
        max action value for each dimension
    :param int_features boolean indicating if int features blob will be present
    :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
    """
    model = model_helper.ModelHelper(name="predictor")
    net = model.net
    C2.set_model(model)
    parameters: List[str] = []

    workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
    workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
    workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

    input_feature_lengths = "input_feature_lengths"
    input_feature_keys = "input_feature_keys"
    input_feature_values = "input_feature_values"

    if int_features:
        workspace.FeedBlob(
            "input/int_features.lengths", np.zeros(1, dtype=np.int32)
        )
        workspace.FeedBlob("input/int_features.keys", np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/int_features.values", np.zeros(1, dtype=np.int32))
        C2.net().Cast(
            ["input/int_features.values"],
            ["input/int_features.values_float"],
            dtype=caffe2_pb2.TensorProto.FLOAT,
        )
        C2.net().MergeMultiScalarFeatureTensors(
            [
                "input/float_features.lengths",
                "input/float_features.keys",
                "input/float_features.values",
                "input/int_features.lengths",
                "input/int_features.keys",
                "input/int_features.values_float",
            ],
            [input_feature_lengths, input_feature_keys, input_feature_values],
        )
    else:
        C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
        C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
        C2.net().Copy(["input/float_features.values"], [input_feature_values])

    preprocessor = PreprocessorNet()
    sorted_features, _ = sort_features_by_normalization(
        state_normalization_parameters
    )
    state_dense_matrix, new_parameters = sparse_to_dense(
        input_feature_lengths,
        input_feature_keys,
        input_feature_values,
        sorted_features,
    )
    parameters.extend(new_parameters)
    state_normalized_dense_matrix, new_parameters = preprocessor.normalize_dense_matrix(
        state_dense_matrix,
        sorted_features,
        state_normalization_parameters,
        "state_norm",
        False,
    )
    parameters.extend(new_parameters)

    (
        torch_init_net,
        torch_predict_net,
        new_parameters,
        actor_input_blob,
        actor_output_blob,
        min_action_training_blob,
        max_action_training_blob,
        min_action_serving_blob,
        max_action_serving_blob,
    ) = DDPGPredictor.generate_train_net(
        trainer,
        model,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        model_on_gpu,
    )
    parameters.extend(new_parameters)
    net.Copy([state_normalized_dense_matrix], [actor_input_blob])

    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(torch_init_net)

    net.AppendNet(torch_predict_net)

    # Scale actor's actions from [-1, 1] to the serving range
    prev_range = C2.Sub(max_action_training_blob, min_action_training_blob)
    new_range = C2.Sub(max_action_serving_blob, min_action_serving_blob)
    subtract_prev_min = C2.Sub(actor_output_blob, min_action_training_blob)
    div_by_prev_range = C2.Div(subtract_prev_min, prev_range)
    scaled_for_serving_actions = C2.Add(
        C2.Mul(div_by_prev_range, new_range), min_action_serving_blob
    )

    output_lengths = "output/float_features.lengths"
    workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
    C2.net().ConstantFill(
        [C2.FlattenToVec(C2.ArgMax(actor_output_blob))],
        [output_lengths],
        value=trainer.actor.layers[-1].out_features,
        dtype=caffe2_pb2.TensorProto.INT32,
    )

    action_feature_ids_blob = C2.NextBlob("action_feature_ids")
    workspace.FeedBlob(
        action_feature_ids_blob, np.array(action_feature_ids, dtype=np.int64)
    )
    parameters.append(action_feature_ids_blob)

    output_keys = "output/float_features.keys"
    workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
    num_examples, _ = C2.Reshape(C2.Size("input/float_features.lengths"), shape=[1])
    C2.net().Tile([action_feature_ids_blob, num_examples], [output_keys], axis=1)

    output_values = "output/float_features.values"
    workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))
    C2.net().FlattenToVec([scaled_for_serving_actions], [output_values])

    workspace.CreateNet(net)
    return DDPGPredictor(net, torch_init_net, parameters, int_features)
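# Illustrative sketch (not part of the predictor): the blobs above rescale the
# actor's tanh output from the training range to the serving range with
# (a - min_train) / (max_train - min_train) * (max_serve - min_serve) + min_serve.
# The same arithmetic in NumPy, with made-up ranges and a hypothetical helper:
import numpy as np

def rescale_actions_sketch(actions, min_train, max_train, min_serve, max_serve):
    prev_range = max_train - min_train
    new_range = max_serve - min_serve
    return (actions - min_train) / prev_range * new_range + min_serve

# rescale_actions_sketch(np.array([0.0]), -1.0, 1.0, 0.0, 10.0) -> [5.0]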
def preprocess_blob(self, blob, normalization_parameters):
    """
    Takes in a blob and its normalization parameters. Outputs a tuple whose
    first element is a blob containing the normalized input blob and whose
    second element contains all the parameter blobs used to create it.

    Call this from a CPU context and ensure the input blob exists in it.
    """
    parameters: List[str] = []
    ZERO = self._store_parameter(
        parameters, "ZERO", np.array([0], dtype=np.float32)
    )
    ONE = self._store_parameter(
        parameters, "ONE", np.array([1], dtype=np.float32)
    )
    NEGATIVE_ONE = self._store_parameter(
        parameters, "NEGATIVE_ONE", np.array([-1], dtype=np.float32)
    )
    MISSING_U = self._store_parameter(
        parameters, "MISSING_U", np.array([MISSING_VALUE + 1e-4], dtype=np.float32)
    )
    MISSING_L = self._store_parameter(
        parameters, "MISSING_L", np.array([MISSING_VALUE - 1e-4], dtype=np.float32)
    )

    is_empty_l = C2.GT(blob, MISSING_L, broadcast=1)
    is_empty_u = C2.LT(blob, MISSING_U, broadcast=1)
    is_empty = C2.And(is_empty_l, is_empty_u)

    for i in range(len(normalization_parameters) - 1):
        if (
            normalization_parameters[i].feature_type
            != normalization_parameters[i + 1].feature_type
        ):
            raise Exception(
                "Only one feature type is allowed per call to preprocess_blob!"
            )
    feature_type = normalization_parameters[0].feature_type

    if feature_type == identify_types.BINARY:
        TOLERANCE = self._store_parameter(
            parameters, "TOLERANCE", np.array(1e-3, dtype=np.float32)
        )
        is_gt_zero = C2.GT(
            blob, C2.Add(ZERO, TOLERANCE, broadcast=1), broadcast=1
        )
        is_lt_zero = C2.LT(
            blob, C2.Sub(ZERO, TOLERANCE, broadcast=1), broadcast=1
        )
        bool_blob = C2.Or(is_gt_zero, is_lt_zero)
        blob = C2.Cast(bool_blob, to=caffe2_pb2.TensorProto.FLOAT)
    elif feature_type == identify_types.PROBABILITY:
        clipped = C2.Clip(blob, min=0.01, max=0.99)
        blob = C2.Mul(
            C2.Log(C2.Sub(C2.Pow(clipped, exponent=-1.0), ONE, broadcast=1)),
            NEGATIVE_ONE,
            broadcast=1,
        )
    elif feature_type == identify_types.ENUM:
        for parameter in normalization_parameters:
            possible_values = parameter.possible_values
            for x in possible_values:
                if x < 0:
                    logger.fatal(
                        "Invalid enum possible value for feature: "
                        + str(x)
                        + " "
                        + str(parameter.possible_values)
                    )
                    raise Exception(
                        "Invalid enum possible value for feature "
                        + blob
                        + ": "
                        + str(x)
                        + " "
                        + str(parameter.possible_values)
                    )

        int_blob = C2.Cast(blob, to=core.DataType.INT32)

        # Batch one hot transform with MISSING_VALUE as a possible value
        feature_lengths = [
            len(p.possible_values) + 1 for p in normalization_parameters
        ]
        feature_lengths_blob = self._store_parameter(
            parameters,
            "feature_lengths_blob",
            np.array(feature_lengths, dtype=np.int32),
        )

        feature_values = [
            x
            for p in normalization_parameters
            for x in p.possible_values + [int(MISSING_VALUE)]
        ]
        feature_values_blob = self._store_parameter(
            parameters,
            "feature_values_blob",
            np.array(feature_values, dtype=np.int32),
        )

        one_hot_output = C2.BatchOneHot(
            int_blob, feature_lengths_blob, feature_values_blob
        )
        flattened_one_hot = C2.FlattenToVec(one_hot_output)

        # Remove missing values with a mask
        cols_to_include = [
            [1] * len(p.possible_values) + [0] for p in normalization_parameters
        ]
        cols_to_include = [x for col in cols_to_include for x in col]
        mask = self._store_parameter(
            parameters, "mask", np.array(cols_to_include, dtype=np.int32)
        )
        zero_vec = C2.ConstantFill(
            one_hot_output, value=0, dtype=caffe2_pb2.TensorProto.INT32
        )
        repeated_mask_bool = C2.Cast(
            C2.Add(zero_vec, mask, broadcast=1), to=core.DataType.BOOL
        )
        flattened_repeated_mask = C2.FlattenToVec(repeated_mask_bool)

        flattened_one_hot_proc = C2.NextBlob("flattened_one_hot_proc")
        flattened_one_hot_proc_indices = C2.NextBlob(
            "flattened_one_hot_proc_indices"
        )
        C2.net().BooleanMask(
            [flattened_one_hot, flattened_repeated_mask],
            [flattened_one_hot_proc, flattened_one_hot_proc_indices],
        )
        one_hot_shape = C2.Shape(one_hot_output)

        shape_delta = self._store_parameter(
            parameters,
            "shape_delta",
            np.array([0, len(normalization_parameters)], dtype=np.int64),
        )
        target_shape = C2.Sub(one_hot_shape, shape_delta, broadcast=1)
        output_int_blob = C2.NextBlob("output_int_blob")
        output_int_blob_old_shape = C2.NextBlob("output_int_blob_old_shape")
        C2.net().Reshape(
            [flattened_one_hot_proc, target_shape],
            [output_int_blob, output_int_blob_old_shape],
        )
        output_blob = C2.Cast(output_int_blob, to=core.DataType.FLOAT)

        return output_blob, parameters
    elif feature_type == identify_types.QUANTILE:
        # This transformation replaces a set of values with their quantile.
        # The quantile boundaries are provided in the normalization params.
        quantile_sizes = [len(norm.quantiles) for norm in normalization_parameters]
        num_boundaries_blob = self._store_parameter(
            parameters,
            "num_boundaries_blob",
            np.array(quantile_sizes, dtype=np.int32),
        )

        quantile_values = np.array([], dtype=np.float32)
        quantile_labels = np.array([], dtype=np.float32)
        for norm in normalization_parameters:
            quantile_values = np.append(
                quantile_values, np.array(norm.quantiles, dtype=np.float32)
            )
            quantile_labels = np.append(
                quantile_labels,
                np.arange(len(norm.quantiles), dtype=np.float32)
                / float(len(norm.quantiles) - 1.0),
            )
        quantiles = np.vstack([quantile_values, quantile_labels]).T
        quantiles_blob = self._store_parameter(
            parameters, "quantiles_blob", quantiles
        )

        quantile_blob = C2.Percentile(blob, quantiles_blob, num_boundaries_blob)
        blob = quantile_blob
    elif (
        feature_type == identify_types.CONTINUOUS
        or feature_type == identify_types.BOXCOX
    ):
        boxcox_shifts = []
        boxcox_lambdas = []
        means = []
        stddevs = []

        for norm in normalization_parameters:
            if feature_type == identify_types.BOXCOX:
                assert (
                    norm.boxcox_shift is not None and norm.boxcox_lambda is not None
                )
                boxcox_shifts.append(norm.boxcox_shift)
                boxcox_lambdas.append(norm.boxcox_lambda)
            means.append(norm.mean)
            stddevs.append(norm.stddev)

        if feature_type == identify_types.BOXCOX:
            boxcox_shift_blob = self._store_parameter(
                parameters,
                "boxcox_shift",
                np.array(boxcox_shifts, dtype=np.float32),
            )
            boxcox_lambda_blob = self._store_parameter(
                parameters,
                "boxcox_lambda",
                np.array(boxcox_lambdas, dtype=np.float32),
            )
            blob = C2.BatchBoxCox(blob, boxcox_lambda_blob, boxcox_shift_blob)

        means_blob = self._store_parameter(
            parameters, "means_blob", np.array([means], dtype=np.float32)
        )
        stddevs_blob = self._store_parameter(
            parameters, "stddevs_blob", np.array([stddevs], dtype=np.float32)
        )
        blob = C2.Sub(blob, means_blob, broadcast=1, axis=0)
        blob = C2.Div(blob, stddevs_blob, broadcast=1, axis=0)
        if self.clip_anomalies:
            blob = C2.Clip(blob, min=-5.0, max=5.0)
    else:
        raise NotImplementedError("Invalid feature type: {}".format(feature_type))

    zeros = C2.ConstantFill(blob, value=0.0)
    output_blob = C2.Where(is_empty, zeros, blob)

    return output_blob, parameters
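# Illustrative sketch (not part of the preprocessor): two of the transforms above
# in plain NumPy. PROBABILITY features are clipped and mapped through the logit,
# -1 * log(1/p - 1) = log(p / (1 - p)); CONTINUOUS features are z-scored with the
# stored mean/stddev and clipped to [-5, 5] when clip_anomalies is set. The helper
# names below are assumptions for illustration only.
import numpy as np

def probability_transform_sketch(p):
    clipped = np.clip(p, 0.01, 0.99)
    return -1.0 * np.log(1.0 / clipped - 1.0)

def continuous_transform_sketch(x, mean, stddev, clip_anomalies=True):
    z = (x - mean) / stddev
    return np.clip(z, -5.0, 5.0) if clip_anomalies else z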
def export_actor(
    cls,
    trainer,
    state_normalization_parameters,
    min_action_range_tensor_serving,
    max_action_range_tensor_serving,
    int_features=False,
    model_on_gpu=False,
):
    """Export caffe2 preprocessor net and pytorch actor forward pass as one
    caffe2 net.

    :param trainer DDPGTrainer
    :param state_normalization_parameters state NormalizationParameters
    :param min_action_range_tensor_serving pytorch tensor that specifies
        min action value for each dimension
    :param max_action_range_tensor_serving pytorch tensor that specifies
        max action value for each dimension
    :param int_features boolean indicating if int features blob will be present
    :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
    """
    input_dim = trainer.state_dim
    buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
        trainer.actor, input_dim, model_on_gpu
    )
    actor_input_blob, actor_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
        buffer
    )
    torch_workspace = caffe2_netdef.workspace

    parameters = torch_workspace.Blobs()
    for blob_str in parameters:
        workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))

    torch_init_net = core.Net(caffe2_netdef.init_net)
    torch_predict_net = core.Net(caffe2_netdef.predict_net)

    model = model_helper.ModelHelper(name="predictor")
    net = model.net
    C2.set_model(model)

    # Feed action scaling tensors for serving
    min_action_serving_blob = C2.NextBlob("min_action_range_tensor_serving")
    workspace.FeedBlob(
        min_action_serving_blob, min_action_range_tensor_serving.cpu().data.numpy()
    )
    parameters.append(str(min_action_serving_blob))

    max_action_serving_blob = C2.NextBlob("max_action_range_tensor_serving")
    workspace.FeedBlob(
        max_action_serving_blob, max_action_range_tensor_serving.cpu().data.numpy()
    )
    parameters.append(str(max_action_serving_blob))

    # Feed action scaling tensors for training [-1, 1] due to tanh actor
    min_vals_training = trainer.min_action_range_tensor_training.cpu().data.numpy()
    min_action_training_blob = C2.NextBlob("min_action_range_tensor_training")
    workspace.FeedBlob(min_action_training_blob, min_vals_training)
    parameters.append(str(min_action_training_blob))

    max_vals_training = trainer.max_action_range_tensor_training.cpu().data.numpy()
    max_action_training_blob = C2.NextBlob("max_action_range_tensor_training")
    workspace.FeedBlob(max_action_training_blob, max_vals_training)
    parameters.append(str(max_action_training_blob))

    workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
    workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
    workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

    input_feature_lengths = "input_feature_lengths"
    input_feature_keys = "input_feature_keys"
    input_feature_values = "input_feature_values"

    if int_features:
        workspace.FeedBlob(
            "input/int_features.lengths", np.zeros(1, dtype=np.int32)
        )
        workspace.FeedBlob("input/int_features.keys", np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/int_features.values", np.zeros(1, dtype=np.int32))
        C2.net().Cast(
            ["input/int_features.values"],
            ["input/int_features.values_float"],
            dtype=caffe2_pb2.TensorProto.FLOAT,
        )
        C2.net().MergeMultiScalarFeatureTensors(
            [
                "input/float_features.lengths",
                "input/float_features.keys",
                "input/float_features.values",
                "input/int_features.lengths",
                "input/int_features.keys",
                "input/int_features.values_float",
            ],
            [input_feature_lengths, input_feature_keys, input_feature_values],
        )
    else:
        C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
        C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
        C2.net().Copy(["input/float_features.values"], [input_feature_values])

    preprocessor = PreprocessorNet(True)
    state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
        input_feature_lengths,
        input_feature_keys,
        input_feature_values,
        state_normalization_parameters,
        "state_norm",
        False,
        False,
    )
    parameters.extend(new_parameters)
    net.Copy([state_normalized_dense_matrix], [actor_input_blob])

    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(torch_init_net)

    net.AppendNet(torch_predict_net)

    C2.FlattenToVec(C2.ArgMax(actor_output_blob))
    output_lengths = "output/float_features.lengths"
    workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
    C2.net().ConstantFill(
        [C2.FlattenToVec(C2.ArgMax(actor_output_blob))],
        [output_lengths],
        value=trainer.actor.layers[-1].out_features,
        dtype=caffe2_pb2.TensorProto.INT32,
    )

    output_keys = "output/float_features.keys"
    workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int32))
    C2.net().LengthsRangeFill([output_lengths], [output_keys])

    output_values = "output/float_features.values"
    workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))

    # Scale actor's actions from [-1, 1] to the serving range
    prev_range = C2.Sub(max_action_training_blob, min_action_training_blob)
    new_range = C2.Sub(max_action_serving_blob, min_action_serving_blob)
    subtract_prev_min = C2.Sub(actor_output_blob, min_action_training_blob)
    div_by_prev_range = C2.Div(subtract_prev_min, prev_range)
    scaled_for_serving_actions = C2.Add(
        C2.Mul(div_by_prev_range, new_range), min_action_serving_blob
    )
    C2.net().FlattenToVec([scaled_for_serving_actions], [output_values])

    workspace.CreateNet(net)
    return DDPGPredictor(net, parameters, int_features)