def get_trainer(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    parameters = parameters or self.get_sarsa_parameters()
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
    q_network_target = q_network.get_target_network()
    trainer = ParametricDQNTrainer(
        q_network, q_network_target, reward_network, parameters
    )
    return trainer
def get_trainer(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    layers = [256, 128]
    activations = ["relu", "relu"]
    parameters = parameters or self.get_sarsa_parameters()
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=layers,
        activations=activations,
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=layers,
        activations=activations,
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
    q_network_target = q_network.get_target_network()
    param_dict = parameters.asdict()  # type: ignore
    trainer = ParametricDQNTrainer(
        q_network, q_network_target, reward_network, **param_dict
    )
    return trainer
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()
    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_distributed_data_parallel_model()
            q_network_target = q_network_target.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
    return ParametricDQNTrainer(
        q_network, q_network_target, reward_network, model, use_gpu
    )
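# A minimal, hypothetical illustration of the slicing convention used by the
# factories above: `training.layers` spans input -> hidden... -> output, so the
# DQN body is built from the hidden sizes only (layers[1:-1]) and the trailing
# "linear" output activation is dropped (activations[:-1]). Values are made up.
def _example_layer_slicing():
    layers = [64, 256, 128, 1]  # input dim, hidden sizes, output dim
    activations = ["relu", "relu", "linear"]
    sizes = layers[1:-1]  # [256, 128]: hidden sizes passed to the network
    acts = activations[:-1]  # ["relu", "relu"]: hidden activations only
    return sizes, acts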
def _get_sac_trainer_params(env, sac_model_params, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sac_model_params.q_network.layers,
        sac_model_params.q_network.activations,
    )
    q2_network = None
    if sac_model_params.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            sac_model_params.q_network.layers,
            sac_model_params.q_network.activations,
        )
    value_network = FullyConnectedNetwork(
        [state_dim] + sac_model_params.value_network.layers + [1],
        sac_model_params.value_network.activations + ["linear"],
    )
    actor_network = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sac_model_params.actor_network.layers,
        sac_model_params.actor_network.activations,
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        value_network.cuda()
        actor_network.cuda()
    value_network_target = deepcopy(value_network)
    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    action_range_low = env.action_space.low.astype(np.float32)
    action_range_high = env.action_space.high.astype(np.float32)
    min_action_range_tensor_serving = torch.from_numpy(action_range_low).unsqueeze(
        dim=0
    )
    max_action_range_tensor_serving = torch.from_numpy(action_range_high).unsqueeze(
        dim=0
    )
    trainer_args = [
        q1_network,
        value_network,
        value_network_target,
        actor_network,
        sac_model_params,
    ]
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": min_action_range_tensor_training,
        "max_action_range_tensor_training": max_action_range_tensor_training,
        "min_action_range_tensor_serving": min_action_range_tensor_serving,
        "max_action_range_tensor_serving": max_action_range_tensor_serving,
    }
    return trainer_args, trainer_kwargs
def test_save_load_batch_norm(self):
    state_dim = 8
    action_dim = 4
    model = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    # Freezing batch_norm
    model.eval()
    expected_num_params, expected_num_inputs, expected_num_outputs = 21, 2, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )
def test_parametric_wrapper(self):
    state_normalization_parameters = {i: _cont_norm() for i in range(1, 5)}
    action_normalization_parameters = {i: _cont_norm() for i in range(5, 9)}
    state_preprocessor = Preprocessor(state_normalization_parameters, False)
    action_preprocessor = Preprocessor(action_normalization_parameters, False)
    dqn = FullyConnectedParametricDQN(
        state_dim=len(state_normalization_parameters),
        action_dim=len(action_normalization_parameters),
        sizes=[16],
        activations=["relu"],
    )
    dqn_with_preprocessor = ParametricDqnWithPreprocessor(
        dqn,
        state_preprocessor=state_preprocessor,
        action_preprocessor=action_preprocessor,
    )
    wrapper = ParametricDqnPredictorWrapper(dqn_with_preprocessor)
    input_prototype = dqn_with_preprocessor.input_prototype()
    output_action_names, q_value = wrapper(*input_prototype)
    self.assertEqual(output_action_names, ["Q"])
    self.assertEqual(q_value.shape, (1, 1))
    expected_output = dqn(
        rlt.PreprocessedStateAction.from_tensors(
            state=state_preprocessor(*input_prototype[0]),
            action=action_preprocessor(*input_prototype[1]),
        )
    ).q_value
    self.assertTrue((expected_output == q_value).all())
def test_basic(self):
    state_dim = 8
    action_dim = 4
    model = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    input = model.input_prototype()
    self.assertEqual((1, state_dim), input.state.float_features.shape)
    self.assertEqual((1, action_dim), input.action.float_features.shape)
    # Using batch norm requires more than 1 example in training, avoid that
    model.eval()
    single_q_value = model(input)
    self.assertEqual((1, 1), single_q_value.q_value.shape)
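# A usage sketch assembled only from calls already exercised in test_basic
# above; the dimensions are arbitrary illustrative values, not taken from any
# particular config.
def example_forward_pass():
    model = FullyConnectedParametricDQN(
        state_dim=8,
        action_dim=4,
        sizes=[8, 4],
        activations=["relu", "relu"],
    )
    model.eval()  # eval mode, as in the tests above
    prototype = model.input_prototype()  # a single (state, action) pair
    q_value = model(prototype).q_value  # tensor of shape (1, 1)
    return q_value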
def get_sac_trainer(self, env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_continuous_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    if parameters.constrain_action_sum:
        actor_network = DirichletFullyConnectedActor(
            state_dim,
            action_dim,
            parameters.actor_network.layers,
            parameters.actor_network.activations,
        )
    else:
        actor_network = GaussianFullyConnectedActor(
            state_dim,
            action_dim,
            parameters.actor_network.layers,
            parameters.actor_network.activations,
        )
    value_network = None
    if parameters.training.use_value_network:
        value_network = FullyConnectedNetwork(
            [state_dim] + parameters.value_network.layers + [1],
            parameters.value_network.activations + ["linear"],
        )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()
    return SACTrainer(
        q1_network,
        actor_network,
        parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
    )
def get_td3_trainer(env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    actor_network = FullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )
    # Use float fill values: an integer fill makes torch.full infer an integer
    # dtype on recent PyTorch versions, and these ranges must be float tensors.
    min_action_range_tensor_training = torch.full((1, action_dim), -1.0)
    max_action_range_tensor_training = torch.full((1, action_dim), 1.0)
    min_action_range_tensor_serving = torch.FloatTensor(env.action_space.low).unsqueeze(
        dim=0
    )
    max_action_range_tensor_serving = torch.FloatTensor(
        env.action_space.high
    ).unsqueeze(dim=0)
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        actor_network.cuda()
        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()
    trainer_args = [q1_network, actor_network, parameters]
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": min_action_range_tensor_training,
        "max_action_range_tensor_training": max_action_range_tensor_training,
        "min_action_range_tensor_serving": min_action_range_tensor_serving,
        "max_action_range_tensor_serving": max_action_range_tensor_serving,
    }
    return TD3Trainer(*trainer_args, use_gpu=use_gpu, **trainer_kwargs)
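# Hedged sketch of why two action ranges are threaded through above: the actor
# is trained on a normalized box (here [-1, 1]) while the environment expects
# its native [low, high] box, so actions are linearly rescaled between the two
# at serving time. This standalone helper only illustrates the mapping; the
# trainer is assumed to perform an equivalent rescaling internally.
def example_rescale_action(action, min_train, max_train, min_serve, max_serve):
    # Elementwise affine map from [min_train, max_train] to [min_serve, max_serve].
    scale = (max_serve - min_serve) / (max_train - min_train)
    return min_serve + (action - min_train) * scale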
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()
    if use_gpu:
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_distributed_data_parallel_model()
            q_network_target = q_network_target.get_distributed_data_parallel_model()
            reward_network = reward_network.get_distributed_data_parallel_model()
    trainer_parameters = ParametricDQNTrainerParameters(  # type: ignore
        rl=model.rl,
        double_q_learning=model.rainbow.double_q_learning,
        minibatch_size=model.training.minibatch_size,
        optimizer=OptimizerParameters(
            optimizer=model.training.optimizer,
            learning_rate=model.training.learning_rate,
            l2_decay=model.training.l2_decay,
        ),
    )
    return ParametricDQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        use_gpu=use_gpu,
        **trainer_parameters.asdict()  # type: ignore
    )
def get_modular_sarsa_trainer_exporter(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    parameters = parameters or self.get_sarsa_parameters()
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_data_parallel_model()
            reward_network = reward_network.get_data_parallel_model()
    q_network_target = q_network.get_target_network()
    trainer = _ParametricDQNTrainer(
        q_network, q_network_target, reward_network, parameters
    )
    state_preprocessor = Preprocessor(environment.normalization, False, True)
    action_preprocessor = Preprocessor(environment.normalization_action, False, True)
    feature_extractor = PredictorFeatureExtractor(
        state_normalization_parameters=environment.normalization,
        action_normalization_parameters=environment.normalization_action,
    )
    output_transformer = ParametricActionOutputTransformer()
    exporter = ParametricDQNExporter(
        q_network,
        feature_extractor,
        output_transformer,
        state_preprocessor,
        action_preprocessor,
    )
    return (trainer, exporter)
def test_save_load(self):
    state_dim = 8
    action_dim = 4
    model = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=False,
    )
    expected_num_params, expected_num_inputs, expected_num_outputs = 6, 2, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )
def build_q_network(
    self,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    output_dim: int = 1,
) -> ModelBase:
    state_dim = get_num_output_features(state_normalization_parameters)
    action_dim = get_num_output_features(action_normalization_parameters)
    return FullyConnectedParametricDQN(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=self.config.sizes,
        activations=self.config.activations,
        use_batch_norm=self.config.use_batch_norm,
        use_layer_norm=self.config.use_layer_norm,
        output_dim=output_dim,
    )
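# Hypothetical usage of the build_q_network builder above, mirroring the
# normalization-dict pattern from test_parametric_wrapper: integer feature ids
# map to NormalizationParameters, and the builder derives state/action dims
# from them. `builder` and `_cont_norm` are assumed to be available as in the
# surrounding snippets.
def example_build_q_network(builder):
    state_norm = {i: _cont_norm() for i in range(1, 5)}
    action_norm = {i: _cont_norm() for i in range(5, 9)}
    return builder.build_q_network(state_norm, action_norm, output_dim=1)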
def get_sac_trainer(self, env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    value_network = FullyConnectedNetwork(
        [state_dim] + parameters.value_network.layers + [1],
        parameters.value_network.activations + ["linear"],
    )
    actor_network = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        value_network.cuda()
        actor_network.cuda()
    value_network_target = deepcopy(value_network)
    return SACTrainer(
        q1_network,
        value_network,
        value_network_target,
        actor_network,
        parameters,
        q2_network=q2_network,
    )
def get_td3_trainer(self, env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    actor_network = FullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        actor_network.cuda()
    return TD3Trainer(
        q1_network,
        actor_network,
        parameters,
        q2_network=q2_network,
        use_gpu=use_gpu,
    )
def test_slate_q_trainer(self):
    recsim = RecSim(num_users=10)
    # Build memory pool with random policy
    memory_pool = OpenAIGymMemoryPool(10000000)
    random_reward = recsim.rollout_policy(random_policy, memory_pool)
    # Train a model
    q_network = FullyConnectedParametricDQN(
        state_dim=memory_pool.state_dim,
        action_dim=memory_pool.action_dim,
        sizes=[64, 32],
        activations=["relu", "relu"],
    )
    q_network = q_network.eval()
    recsim.reset()
    untrained_policy_reward = recsim.rollout_policy(partial(top_k_policy, q_network))
    q_network = q_network.train()
    q_network_target = q_network.get_target_network()
    parameters = SlateQTrainerParameters()
    trainer = SlateQTrainer(q_network, q_network_target, parameters)
    for _i in range(1000):
        tdp = memory_pool.sample_memories(
            128, model_type=ModelType.PYTORCH_PARAMETRIC_DQN.value
        )
        training_batch = tdp.as_slate_q_training_batch()
        trainer.train(training_batch)
    q_network = q_network.eval()
    recsim.reset()
    trained_policy_reward = recsim.rollout_policy(partial(top_k_policy, q_network))
    print(
        f"Reward; random: {random_reward}; untrained: {untrained_policy_reward}; "
        f"trained: {trained_policy_reward}"
    )
    self.assertGreater(trained_policy_reward, untrained_policy_reward)
    self.assertGreater(trained_policy_reward, random_reward)
    self.assertEqual(random_reward, 1384.0)
    self.assertEqual(untrained_policy_reward, 1200.0)
    self.assertEqual(trained_policy_reward, 1432.0)
def get_modular_sarsa_trainer_exporter(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    parameters = parameters or self.get_sarsa_parameters()
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(environment.normalization),
        action_dim=get_num_output_features(environment.normalization_action),
        sizes=parameters.training.layers[1:-1],
        activations=parameters.training.activations[:-1],
    )
    if use_gpu:
        q_network = q_network.cuda()
        reward_network = reward_network.cuda()
        if use_all_avail_gpus:
            q_network = q_network.get_data_parallel_model()
            reward_network = reward_network.get_data_parallel_model()
    q_network_target = q_network.get_target_network()
    trainer = _ParametricDQNTrainer(
        q_network, q_network_target, reward_network, parameters
    )
    feature_extractor = PredictorFeatureExtractor(
        state_normalization_parameters=environment.normalization,
        action_normalization_parameters=environment.normalization_action,
    )
    output_transformer = ParametricActionOutputTransformer()
    exporter = ParametricDQNExporter(q_network, feature_extractor, output_transformer)
    return (trainer, exporter)
def _get_sac_trainer_params(
    env: OpenAIGymEnvironment, sac_model_params: SACModelParameters, use_gpu: bool
):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sac_model_params.q_network.layers,
        sac_model_params.q_network.activations,
    )
    q2_network = None
    if sac_model_params.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            sac_model_params.q_network.layers,
            sac_model_params.q_network.activations,
        )
    value_network = None
    if sac_model_params.training.use_value_network:
        assert sac_model_params.value_network is not None
        value_network = FullyConnectedNetwork(
            [state_dim] + sac_model_params.value_network.layers + [1],
            sac_model_params.value_network.activations + ["linear"],
        )
    actor_network = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sac_model_params.actor_network.layers,
        sac_model_params.actor_network.activations,
    )
    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    min_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.low).float().unsqueeze(dim=0)  # type: ignore
    )
    max_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.high).float().unsqueeze(dim=0)  # type: ignore
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()
        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()
    trainer_args = [q1_network, actor_network, sac_model_params]
    trainer_kwargs = {
        "value_network": value_network,
        "q2_network": q2_network,
        "min_action_range_tensor_training": min_action_range_tensor_training,
        "max_action_range_tensor_training": max_action_range_tensor_training,
        "min_action_range_tensor_serving": min_action_range_tensor_serving,
        "max_action_range_tensor_serving": max_action_range_tensor_serving,
    }
    return trainer_args, trainer_kwargs
def get_sac_trainer(
    env: OpenAIGymEnvironment,
    rl_parameters: RLParameters,
    trainer_parameters: SACTrainerParameters,
    critic_training: FeedForwardParameters,
    actor_training: FeedForwardParameters,
    sac_value_training: Optional[FeedForwardParameters],
    use_gpu: bool,
) -> SACTrainer:
    assert rl_parameters == trainer_parameters.rl
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim, action_dim, critic_training.layers, critic_training.activations
    )
    q2_network = None
    # TODO:
    # if trainer_parameters.use_2_q_functions:
    #     q2_network = FullyConnectedParametricDQN(
    #         state_dim,
    #         action_dim,
    #         critic_training.layers,
    #         critic_training.activations,
    #     )
    value_network = None
    if sac_value_training:
        value_network = FullyConnectedNetwork(
            [state_dim] + sac_value_training.layers + [1],
            sac_value_training.activations + ["linear"],
        )
    actor_network = GaussianFullyConnectedActor(
        state_dim, action_dim, actor_training.layers, actor_training.activations
    )
    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    min_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.low).float().unsqueeze(dim=0)  # type: ignore
    )
    max_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.high).float().unsqueeze(dim=0)  # type: ignore
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()
        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()
    return SACTrainer(
        q1_network,
        actor_network,
        trainer_parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
        min_action_range_tensor_training=min_action_range_tensor_training,
        max_action_range_tensor_training=max_action_range_tensor_training,
        min_action_range_tensor_serving=min_action_range_tensor_serving,
        max_action_range_tensor_serving=max_action_range_tensor_serving,
    )
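# Illustrative note on the 1e-6 epsilon in the training ranges above: squashing
# actors can emit actions arbitrarily close to the box boundary, where the
# inverse tanh transform diverges, so the training range is shrunk slightly
# inside [-1, 1]. This is an assumption about intent, demonstrated standalone:
def example_action_range_epsilon(action_dim=4):
    max_train = torch.full((1, action_dim), 1 - 1e-6)
    # atanh is finite just inside the boundary but infinite at exactly 1.0.
    assert torch.isfinite(torch.atanh(max_train)).all()
    assert torch.isinf(torch.atanh(torch.ones(1)))
    return max_train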
def get_sac_trainer(
    self,
    env,
    use_gpu,
    use_2_q_functions=False,
    logged_action_uniform_prior=True,
    constrain_action_sum=False,
    use_value_network=True,
):
    q_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    value_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    actor_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_continuous_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim, action_dim, q_network_params.layers, q_network_params.activations
    )
    q2_network = None
    if use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            q_network_params.layers,
            q_network_params.activations,
        )
    if constrain_action_sum:
        actor_network = DirichletFullyConnectedActor(
            state_dim,
            action_dim,
            actor_network_params.layers,
            actor_network_params.activations,
        )
    else:
        actor_network = GaussianFullyConnectedActor(
            state_dim,
            action_dim,
            actor_network_params.layers,
            actor_network_params.activations,
        )
    value_network = None
    if use_value_network:
        value_network = FullyConnectedNetwork(
            [state_dim] + value_network_params.layers + [1],
            value_network_params.activations + ["linear"],
        )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()
    parameters = SACTrainerParameters(
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
        minibatch_size=self.minibatch_size,
        q_network_optimizer=OptimizerParameters(),
        value_network_optimizer=OptimizerParameters(),
        actor_network_optimizer=OptimizerParameters(),
        alpha_optimizer=OptimizerParameters(),
        logged_action_uniform_prior=logged_action_uniform_prior,
    )
    return SACTrainer(
        q1_network,
        actor_network,
        parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
    )