def test_save_load(self):
    state_dim = 8
    action_dim = 4
    q_network = FullyConnectedDQN(
        state_dim, action_dim, sizes=[8, 4], activations=["relu", "relu"]
    )
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 8, 4, action_dim],
        activations=["relu", "relu", "linear"],
    )
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=0.05,
    )
    # 6 for DQN + 6 for imitator network + 2 for BCQ constants
    expected_num_params, expected_num_inputs, expected_num_outputs = 14, 1, 1
    check_save_load(
        self,
        model,
        expected_num_params,
        expected_num_inputs,
        expected_num_outputs,
    )
def test_basic(self):
    state_dim = 8
    action_dim = 4
    q_network = FullyConnectedDQN(
        state_dim, action_dim, sizes=[8, 4], activations=["relu", "relu"]
    )
    imitator_network = FullyConnectedNetwork(
        layers=[state_dim, 8, 4, action_dim],
        activations=["relu", "relu", "linear"],
    )
    model = BatchConstrainedDQN(
        state_dim=state_dim,
        q_network=q_network,
        imitator_network=imitator_network,
        bcq_drop_threshold=0.05,
    )
    input = model.input_prototype()
    self.assertEqual((1, state_dim), input.state.float_features.shape)
    q_values = model(input)
    self.assertEqual((1, action_dim), q_values.q_values.shape)
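# Illustrative sketch (an assumption about BCQ, not ReAgent's BatchConstrainedDQN
# internals): the idea the tests above exercise is that Q-values are only kept for
# actions whose imitator probability, relative to the most likely action, exceeds
# bcq_drop_threshold. A minimal standalone version of that masking:
import torch


def bcq_masked_q_values(
    q_values: torch.Tensor,          # (batch, action_dim)
    imitator_logits: torch.Tensor,   # (batch, action_dim)
    drop_threshold: float = 0.05,
) -> torch.Tensor:
    """Mask out Q-values of actions the imitator considers too unlikely."""
    probs = torch.softmax(imitator_logits, dim=1)
    # Keep actions whose probability is within drop_threshold of the best action.
    keep = probs / probs.max(dim=1, keepdim=True).values >= drop_threshold
    # Dropped actions get a very negative value so argmax never selects them.
    return torch.where(keep, q_values, torch.full_like(q_values, -1e10))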
def __init__(self, state_dim, action_dim, sizes, activations, use_batch_norm=False):
    """
    AKA the multivariate beta distribution. Used in cases where the actor's
    actions must sum to 1.
    """
    super().__init__()
    assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
    assert action_dim > 0, "action_dim must be > 0, got {}".format(action_dim)
    self.state_dim = state_dim
    self.action_dim = action_dim
    assert len(sizes) == len(
        activations
    ), "The numbers of sizes and activations must match; got {} vs {}".format(
        len(sizes), len(activations)
    )
    # The last layer gives the concentration of the distribution.
    self.fc = FullyConnectedNetwork(
        [state_dim] + sizes + [action_dim],
        activations + ["linear"],
        use_batch_norm=use_batch_norm,
    )
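# Illustrative sketch (assumed usage, not this actor's actual forward()): the last
# linear layer above emits one concentration value per action dimension; mapping
# them to positive values and sampling a Dirichlet yields actions that are
# non-negative and sum to 1.
import torch
from torch.distributions import Dirichlet


def sample_simplex_action(concentration_logits: torch.Tensor) -> torch.Tensor:
    """concentration_logits: (batch, action_dim) raw outputs of the last layer."""
    # softplus + 1 keeps every concentration > 1, avoiding extremely sparse samples.
    concentration = torch.nn.functional.softplus(concentration_logits) + 1.0
    dist = Dirichlet(concentration)
    action = dist.rsample()  # differentiable sample; each row sums to 1
    return action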
def __init__(
    self,
    state_dim: int,
    action_dim: int,
    sizes: List[int],
    activations: List[str],
    scale: float = 0.05,
    use_batch_norm: bool = False,
    use_layer_norm: bool = False,
):
    super().__init__()
    assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
    assert action_dim > 0, "action_dim must be > 0, got {}".format(action_dim)
    self.state_dim = state_dim
    self.action_dim = action_dim
    assert len(sizes) == len(
        activations
    ), "The numbers of sizes and activations must match; got {} vs {}".format(
        len(sizes), len(activations)
    )
    # The last layer is mean & scale for the reparameterization trick
    self.fc = FullyConnectedNetwork(
        [state_dim] + sizes + [action_dim * 2],
        activations + ["linear"],
        use_batch_norm=use_batch_norm,
        use_layer_norm=use_layer_norm,
    )
    self.use_layer_norm = use_layer_norm
    if self.use_layer_norm:
        self.loc_layer_norm = torch.nn.LayerNorm(action_dim)
        self.scale_layer_norm = torch.nn.LayerNorm(action_dim)

    # Used to calculate log-prob
    self.const = math.log(math.sqrt(2 * math.pi))
    self.eps = 1e-6
    self._log_min_max = (-20.0, 2.0)
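# Illustrative sketch (assumed, not this class's actual forward/log-prob code): how
# the mean/log-scale split, the clamp to self._log_min_max, and the
# reparameterization trick typically combine in a tanh-squashed Gaussian policy.
import torch


def sample_squashed_gaussian(
    fc_output: torch.Tensor,      # (batch, 2 * action_dim), output of the fc above
    log_min_max=(-20.0, 2.0),
) -> torch.Tensor:
    loc, log_scale = fc_output.chunk(2, dim=1)
    # Clamp log-scale so the std-dev stays in a numerically safe range.
    log_scale = log_scale.clamp(*log_min_max)
    scale = log_scale.exp()
    # Reparameterization trick: sample eps ~ N(0, 1), then shift/scale it so
    # gradients flow through loc and scale.
    eps = torch.randn_like(loc)
    raw_action = loc + scale * eps
    # Squash into (-1, 1); log-probs would also need the tanh correction term.
    return torch.tanh(raw_action)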
class Seq2RewardTrainer(ReAgentLightningModule):
    """Trainer for Seq2Reward"""

    def __init__(
        self,
        seq2reward_network: Seq2RewardNetwork,
        params: Seq2RewardTrainerParameters,
    ):
        super().__init__()
        self.seq2reward_network = seq2reward_network
        self.params = params

        # Turning off Q value output during training:
        self.view_q_value = params.view_q_value
        # Permutations used to do planning
        self.all_permut = gen_permutations(
            params.multi_steps, len(self.params.action_names)
        )
        self.mse_loss = nn.MSELoss(reduction="mean")

        # Predict how many steps are remaining from the current step
        self.step_predict_network = FullyConnectedNetwork(
            [
                self.seq2reward_network.state_dim,
                self.params.step_predict_net_size,
                self.params.step_predict_net_size,
                self.params.multi_steps,
            ],
            ["relu", "relu", "linear"],
            use_layer_norm=False,
        )
        self.step_loss = nn.CrossEntropyLoss(reduction="mean")

    def configure_optimizers(self):
        optimizers = []
        optimizers.append(
            {
                "optimizer": torch.optim.Adam(
                    self.seq2reward_network.parameters(),
                    lr=self.params.learning_rate,
                )
            }
        )
        optimizers.append(
            {
                "optimizer": torch.optim.Adam(
                    self.step_predict_network.parameters(),
                    lr=self.params.learning_rate,
                )
            }
        )
        return optimizers

    def train_step_gen(self, training_batch: rlt.MemoryNetworkInput, batch_idx: int):
        mse_loss = self.get_mse_loss(training_batch)
        detached_mse_loss = mse_loss.cpu().detach().item()
        yield mse_loss

        step_entropy_loss = self.get_step_entropy_loss(training_batch)
        detached_step_entropy_loss = step_entropy_loss.cpu().detach().item()

        if self.view_q_value:
            state_first_step = training_batch.state.float_features[0]
            q_values = (
                get_Q(
                    self.seq2reward_network,
                    state_first_step,
                    self.all_permut,
                )
                .cpu()
                .mean(0)
                .tolist()
            )
        else:
            q_values = [0] * len(self.params.action_names)

        step_probability = (
            get_step_prediction(self.step_predict_network, training_batch)
            .cpu()
            .mean(dim=0)
            .numpy()
        )
        logger.info(
            f"Seq2Reward trainer output: mse_loss={detached_mse_loss}, "
            f"step_entropy_loss={detached_step_entropy_loss}, q_values={q_values}, "
            f"step_probability={step_probability}"
        )
        self.reporter.log(
            mse_loss=detached_mse_loss,
            step_entropy_loss=detached_step_entropy_loss,
            q_values=[q_values],
        )
        yield step_entropy_loss

    # pyre-ignore inconsistent override because lightning doesn't use types
    def validation_step(self, batch: rlt.MemoryNetworkInput, batch_idx: int):
        detached_mse_loss = self.get_mse_loss(batch).cpu().detach().item()
        detached_step_entropy_loss = (
            self.get_step_entropy_loss(batch).cpu().detach().item()
        )

        state_first_step = batch.state.float_features[0]
        # shape: batch_size, action_dim
        q_values_all_action_all_data = get_Q(
            self.seq2reward_network,
            state_first_step,
            self.all_permut,
        ).cpu()
        q_values = q_values_all_action_all_data.mean(0).tolist()

        action_distribution = torch.bincount(
            torch.argmax(q_values_all_action_all_data, dim=1),
            minlength=len(self.params.action_names),
        )
        # Normalize
        action_distribution = (
            action_distribution.float() / torch.sum(action_distribution)
        ).tolist()

        self.reporter.log(
            eval_mse_loss=detached_mse_loss,
            eval_step_entropy_loss=detached_step_entropy_loss,
            eval_q_values=[q_values],
            eval_action_distribution=[action_distribution],
        )

        return (
            detached_mse_loss,
            detached_step_entropy_loss,
            q_values,
            action_distribution,
        )

    def get_mse_loss(self, training_batch: rlt.MemoryNetworkInput):
        """
        Compute losses: MSE(predicted_acc_reward, target_acc_reward)

        :param training_batch:
            training_batch has these fields:
            - state: (SEQ_LEN, BATCH_SIZE, STATE_DIM) torch tensor
            - action: (SEQ_LEN, BATCH_SIZE, ACTION_DIM) torch tensor
            - reward: (SEQ_LEN, BATCH_SIZE) torch tensor

        :returns: mse loss on reward
        """
        # pyre-fixme[16]: Optional type has no attribute `flatten`.
        valid_step = training_batch.valid_step.flatten()

        seq2reward_output = self.seq2reward_network(
            training_batch.state,
            rlt.FeatureData(training_batch.action),
            valid_step,
        )
        predicted_acc_reward = seq2reward_output.acc_reward

        seq_len, batch_size = training_batch.reward.size()
        gamma = self.params.gamma
        gamma_mask = (
            torch.Tensor(
                [[gamma ** i for i in range(seq_len)] for _ in range(batch_size)]
            )
            .transpose(0, 1)
            .to(training_batch.reward.device)
        )

        target_acc_rewards = torch.cumsum(training_batch.reward * gamma_mask, dim=0)
        target_acc_reward = target_acc_rewards[
            valid_step - 1, torch.arange(batch_size)
        ].unsqueeze(1)

        # Make sure the prediction and target tensors have the same size;
        # both should be (BATCH_SIZE, 1) in this case.
        assert (
            predicted_acc_reward.size() == target_acc_reward.size()
        ), f"{predicted_acc_reward.size()}!={target_acc_reward.size()}"
        return self.mse_loss(predicted_acc_reward, target_acc_reward)

    def get_step_entropy_loss(self, training_batch: rlt.MemoryNetworkInput):
        """
        Compute cross-entropy losses of step predictions

        :param training_batch:
            training_batch has these fields:
            - state: (SEQ_LEN, BATCH_SIZE, STATE_DIM) torch tensor
            - action: (SEQ_LEN, BATCH_SIZE, ACTION_DIM) torch tensor
            - reward: (SEQ_LEN, BATCH_SIZE) torch tensor

        :returns: step_entropy_loss on step prediction
        """
        # pyre-fixme[16]: Optional type has no attribute `flatten`.
        valid_step = training_batch.valid_step.flatten()

        first_step_state = training_batch.state.float_features[0]
        valid_step_output = self.step_predict_network(first_step_state)

        # step_loss's target is zero-based indexed, so subtract 1 from valid_step
        return self.step_loss(valid_step_output, valid_step - 1)

    def warm_start_components(self):
        components = ["seq2reward_network"]
        return components
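# Illustrative sketch (a hypothetical standalone helper mirroring the target
# computation inside get_mse_loss above): discounted rewards are accumulated along
# the time axis, and the row at index valid_step - 1 is picked per trajectory,
# which gives the discounted return over exactly that trajectory's valid steps.
import torch


def discounted_return_target(
    reward: torch.Tensor,      # (seq_len, batch_size)
    valid_step: torch.Tensor,  # (batch_size,), number of valid steps per trajectory
    gamma: float,
) -> torch.Tensor:
    seq_len, batch_size = reward.size()
    discount = gamma ** torch.arange(seq_len, dtype=reward.dtype).unsqueeze(1)
    acc = torch.cumsum(reward * discount, dim=0)  # (seq_len, batch_size)
    # Row valid_step - 1 holds the discounted return over exactly valid_step steps.
    return acc[valid_step - 1, torch.arange(batch_size)].unsqueeze(1)


# E.g., reward = [[1, 1], [1, 1], [1, 1]], gamma = 0.5, valid_step = [3, 2]
# gives targets [[1 + 0.5 + 0.25], [1 + 0.5]] = [[1.75], [1.5]].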
def get_sac_trainer(
    env: OpenAIGymEnvironment,
    rl_parameters: RLParameters,
    trainer_parameters: SACTrainerParameters,
    critic_training: FeedForwardParameters,
    actor_training: FeedForwardParameters,
    sac_value_training: Optional[FeedForwardParameters],
    use_gpu: bool,
) -> SACTrainer:
    assert rl_parameters == trainer_parameters.rl
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim, action_dim, critic_training.layers, critic_training.activations
    )
    q2_network = None
    # TODO:
    # if trainer_parameters.use_2_q_functions:
    #     q2_network = FullyConnectedParametricDQN(
    #         state_dim,
    #         action_dim,
    #         critic_training.layers,
    #         critic_training.activations,
    #     )
    value_network = None
    if sac_value_training:
        value_network = FullyConnectedNetwork(
            [state_dim] + sac_value_training.layers + [1],
            sac_value_training.activations + ["linear"],
        )
    actor_network = GaussianFullyConnectedActor(
        state_dim, action_dim, actor_training.layers, actor_training.activations
    )

    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    min_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.low).float().unsqueeze(dim=0)  # type: ignore
    )
    max_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.high).float().unsqueeze(dim=0)  # type: ignore
    )

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()
        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()

    return SACTrainer(
        q1_network,
        actor_network,
        trainer_parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
        min_action_range_tensor_training=min_action_range_tensor_training,
        max_action_range_tensor_training=max_action_range_tensor_training,
        min_action_range_tensor_serving=min_action_range_tensor_serving,
        max_action_range_tensor_serving=max_action_range_tensor_serving,
    )
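# Illustrative sketch (a hypothetical helper, not part of SACTrainer's API): the
# four range tensors above pair a training range of roughly (-1, 1) (shrunk by 1e-6
# to stay strictly inside tanh's codomain) with the environment's native bounds, so
# actions can be linearly rescaled between the two at serving time.
import torch


def rescale_action(
    action: torch.Tensor,
    train_min: torch.Tensor,
    train_max: torch.Tensor,
    serve_min: torch.Tensor,
    serve_max: torch.Tensor,
) -> torch.Tensor:
    """Map an action from [train_min, train_max] to [serve_min, serve_max]."""
    ratio = (action - train_min) / (train_max - train_min)
    return serve_min + ratio * (serve_max - serve_min)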
class ConvolutionalNetwork(nn.Module):
    def __init__(self, cnn_parameters, layers, activations, use_layer_norm) -> None:
        super().__init__()
        self.conv_dims = cnn_parameters.conv_dims
        self.conv_height_kernels = cnn_parameters.conv_height_kernels
        self.conv_width_kernels = cnn_parameters.conv_width_kernels
        self.use_layer_norm = use_layer_norm

        self.conv_layers: nn.ModuleList = nn.ModuleList()
        self.pool_layers: nn.ModuleList = nn.ModuleList()
        self.layer_norm_layers: nn.ModuleList = nn.ModuleList()

        for i, _ in enumerate(self.conv_dims[1:]):
            self.conv_layers.append(
                nn.Conv2d(
                    self.conv_dims[i],
                    self.conv_dims[i + 1],
                    kernel_size=(
                        self.conv_height_kernels[i],
                        self.conv_width_kernels[i],
                    ),
                )
            )
            nn.init.kaiming_normal_(self.conv_layers[i].weight)
            if cnn_parameters.pool_types[i] == "max":
                self.pool_layers.append(
                    nn.MaxPool2d(kernel_size=cnn_parameters.pool_kernels_strides[i])
                )
            else:
                assert False, "Unknown pooling type: {}".format(
                    cnn_parameters.pool_types[i]
                )
            if self.use_layer_norm:
                self.layer_norm_layers.append(nn.GroupNorm(1, self.conv_dims[i + 1]))

        input_size = (
            cnn_parameters.num_input_channels,
            cnn_parameters.input_height,
            cnn_parameters.input_width,
        )
        conv_out = self.conv_forward(torch.ones(1, *input_size))
        self.fc_input_dim = int(np.prod(conv_out.size()[1:]))
        layers[0] = self.fc_input_dim
        self.feed_forward = FullyConnectedNetwork(
            layers, activations, use_layer_norm=use_layer_norm
        )

    def conv_forward(self, input):
        x = input
        for i, _ in enumerate(self.conv_layers):
            x = self.conv_layers[i](x)
            if self.use_layer_norm:
                x = self.layer_norm_layers[i](x)
            x = F.relu(x)
            x = self.pool_layers[i](x)
        return x

    def forward(self, input) -> torch.FloatTensor:
        """Forward pass for generic convnet DNNs.
        Assumes activation names are valid pytorch activation names.

        :param input: image tensor
        """
        x = self.conv_forward(input)
        x = x.view(-1, self.fc_input_dim)
        # pyre-fixme[7]: Expected `FloatTensor` but got `Tensor`.
        return self.feed_forward.forward(x)
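# Illustrative usage sketch (the SimpleNamespace below is a stand-in for the repo's
# actual CNN parameter object, assumed only for this example): two conv + max-pool
# stages feeding the fully connected head. Note that layers[0] is a placeholder
# that __init__ overwrites with the flattened conv output size.
from types import SimpleNamespace

import torch

cnn_parameters = SimpleNamespace(
    conv_dims=[3, 16, 32],        # in_channels -> 16 -> 32
    conv_height_kernels=[5, 3],
    conv_width_kernels=[5, 3],
    pool_types=["max", "max"],
    pool_kernels_strides=[2, 2],
    num_input_channels=3,
    input_height=84,
    input_width=84,
)
net = ConvolutionalNetwork(
    cnn_parameters,
    layers=[-1, 128, 4],          # layers[0] is replaced by fc_input_dim
    activations=["relu", "linear"],
    use_layer_norm=False,
)
output = net(torch.randn(8, 3, 84, 84))  # -> shape (8, 4)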
def get_sac_trainer(
    self,
    env,
    use_gpu,
    use_2_q_functions=False,
    logged_action_uniform_prior=True,
    constrain_action_sum=False,
    use_value_network=True,
    use_alpha_optimizer=True,
    entropy_temperature=None,
):
    q_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    value_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    actor_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )

    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_continuous_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim, action_dim, q_network_params.layers, q_network_params.activations
    )
    q2_network = None
    if use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            q_network_params.layers,
            q_network_params.activations,
        )

    if constrain_action_sum:
        actor_network = DirichletFullyConnectedActor(
            state_dim,
            action_dim,
            actor_network_params.layers,
            actor_network_params.activations,
        )
    else:
        actor_network = GaussianFullyConnectedActor(
            state_dim,
            action_dim,
            actor_network_params.layers,
            actor_network_params.activations,
        )

    value_network = None
    if use_value_network:
        value_network = FullyConnectedNetwork(
            [state_dim] + value_network_params.layers + [1],
            value_network_params.activations + ["linear"],
        )

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()

    parameters = SACTrainerParameters(
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
        minibatch_size=self.minibatch_size,
        q_network_optimizer=OptimizerParameters(),
        value_network_optimizer=OptimizerParameters(),
        actor_network_optimizer=OptimizerParameters(),
        alpha_optimizer=OptimizerParameters() if use_alpha_optimizer else None,
        entropy_temperature=entropy_temperature,
        logged_action_uniform_prior=logged_action_uniform_prior,
    )
    return SACTrainer(
        q1_network,
        actor_network,
        parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
    )
class Seq2RewardTrainer(Trainer):
    """Trainer for Seq2Reward"""

    def __init__(
        self,
        seq2reward_network: Seq2RewardNetwork,
        params: Seq2RewardTrainerParameters,
    ):
        self.seq2reward_network = seq2reward_network
        self.params = params
        self.mse_optimizer = torch.optim.Adam(
            self.seq2reward_network.parameters(), lr=params.learning_rate
        )
        self.minibatch_size = self.params.batch_size
        self.loss_reporter = NoOpLossReporter()

        # PageHandler must use this to activate evaluator:
        self.calc_cpe_in_training = True
        # Turning off Q value output during training:
        self.view_q_value = params.view_q_value
        # Permutations used to do planning
        self.all_permut = gen_permutations(
            params.multi_steps, len(self.params.action_names)
        )
        self.mse_loss = nn.MSELoss(reduction="mean")

        # Predict how many steps are remaining from the current step
        self.step_predict_network = FullyConnectedNetwork(
            [
                self.seq2reward_network.state_dim,
                self.params.step_predict_net_size,
                self.params.step_predict_net_size,
                self.params.multi_steps,
            ],
            ["relu", "relu", "linear"],
            use_layer_norm=False,
        )
        self.step_loss = nn.CrossEntropyLoss(reduction="mean")
        self.step_optimizer = torch.optim.Adam(
            self.step_predict_network.parameters(), lr=params.learning_rate
        )

    def train(self, training_batch: rlt.MemoryNetworkInput):
        mse_loss, step_entropy_loss = self.get_loss(training_batch)

        self.mse_optimizer.zero_grad()
        mse_loss.backward()
        self.mse_optimizer.step()

        self.step_optimizer.zero_grad()
        step_entropy_loss.backward()
        self.step_optimizer.step()

        detached_mse_loss = mse_loss.cpu().detach().item()
        detached_step_entropy_loss = step_entropy_loss.cpu().detach().item()

        if self.view_q_value:
            state_first_step = training_batch.state.float_features[0]
            q_values = (
                get_Q(
                    self.seq2reward_network,
                    state_first_step,
                    self.all_permut,
                )
                .cpu()
                .mean(0)
                .tolist()
            )
        else:
            q_values = [0] * len(self.params.action_names)

        step_probability = (
            get_step_prediction(self.step_predict_network, training_batch)
            .cpu()
            .mean(dim=0)
            .numpy()
        )

        logger.info(
            f"Seq2Reward trainer output: mse_loss={detached_mse_loss}, "
            f"step_entropy_loss={detached_step_entropy_loss}, q_values={q_values}, "
            f"step_probability={step_probability}"
        )
        # pyre-fixme[16]: `Seq2RewardTrainer` has no attribute `notify_observers`.
        self.notify_observers(
            mse_loss=detached_mse_loss,
            step_entropy_loss=detached_step_entropy_loss,
            q_values=[q_values],
        )
        return (detached_mse_loss, detached_step_entropy_loss, q_values)

    def get_loss(self, training_batch: rlt.MemoryNetworkInput):
        """
        Compute losses: MSE(predicted_acc_reward, target_acc_reward)

        :param training_batch:
            training_batch has these fields:
            - state: (SEQ_LEN, BATCH_SIZE, STATE_DIM) torch tensor
            - action: (SEQ_LEN, BATCH_SIZE, ACTION_DIM) torch tensor
            - reward: (SEQ_LEN, BATCH_SIZE) torch tensor

        :returns:
            mse loss on reward
            step_entropy_loss on step prediction
        """
        # pyre-fixme[16]: Optional type has no attribute `flatten`.
        valid_reward_len = training_batch.valid_next_seq_len.flatten()

        first_step_state = training_batch.state.float_features[0]
        valid_reward_len_output = self.step_predict_network(first_step_state)
        step_entropy_loss = self.step_loss(
            valid_reward_len_output, valid_reward_len - 1
        )

        seq2reward_output = self.seq2reward_network(
            training_batch.state,
            rlt.FeatureData(training_batch.action),
            valid_reward_len,
        )
        predicted_acc_reward = seq2reward_output.acc_reward

        seq_len, batch_size = training_batch.reward.size()
        gamma = self.params.gamma
        gamma_mask = (
            torch.Tensor(
                [[gamma ** i for i in range(seq_len)] for _ in range(batch_size)]
            )
            .transpose(0, 1)
            .to(training_batch.reward.device)
        )

        target_acc_rewards = torch.cumsum(training_batch.reward * gamma_mask, dim=0)
        target_acc_reward = target_acc_rewards[
            valid_reward_len - 1, torch.arange(batch_size)
        ].unsqueeze(1)

        # Make sure the prediction and target tensors have the same size;
        # both should be (BATCH_SIZE, 1) in this case.
        assert (
            predicted_acc_reward.size() == target_acc_reward.size()
        ), f"{predicted_acc_reward.size()}!={target_acc_reward.size()}"
        mse = self.mse_loss(predicted_acc_reward, target_acc_reward)
        return mse, step_entropy_loss

    def warm_start_components(self):
        components = ["seq2reward_network"]
        return components
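# Illustrative sketch (toy numbers, not repo code): the step-prediction head treats
# "how many valid steps remain" as a classification problem over multi_steps
# classes, so the 1-based step count is shifted to a zero-based class index before
# the cross-entropy loss is applied, as in get_loss above.
import torch
import torch.nn as nn

multi_steps = 4
logits = torch.randn(2, multi_steps)   # step_predict_network output for 2 states
valid_step = torch.tensor([3, 1])      # 1-based counts of remaining valid steps
loss = nn.CrossEntropyLoss()(logits, valid_step - 1)  # targets become classes 2 and 0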