Example 1
    def __init__(self):
        super().__init__()
        # a single embedder for the 'observation' input space
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        # fully connected middleware shared by all heads
        self.middleware_parameters = FCMiddlewareParameters()
        # a state-value head and a policy head, weighted 0.5 and 1.0 in the total loss
        self.heads_parameters = [VHeadParameters(loss_weight=0.5), PolicyHeadParameters(loss_weight=1.0)]
        self.optimizer_type = 'Adam'
        self.clip_gradients = 40.0    # gradient clipping threshold
        self.async_training = True    # workers apply their gradient updates asynchronously
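The snippets in these examples appear to be the __init__ bodies of NetworkParameters subclasses from Intel's Coach (rl_coach) library; on their own they are not runnable. Below is a minimal self-contained sketch of how a fragment like Example 1 would typically be wrapped. The import paths and the class name ActorCriticNetworkParameters are assumptions based on the rl_coach package layout and may differ between Coach versions.

from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import PolicyHeadParameters, VHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters


class ActorCriticNetworkParameters(NetworkParameters):  # illustrative name, not from the source
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters()
        self.heads_parameters = [VHeadParameters(loss_weight=0.5),
                                 PolicyHeadParameters(loss_weight=1.0)]
        self.optimizer_type = 'Adam'
        self.clip_gradients = 40.0
        self.async_training = True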
Example 2
    def __init__(self):
        super().__init__()
        # tanh activations in both the embedder and the middleware
        self.input_embedders_parameters = {
            'observation': InputEmbedderParameters(activation_function='tanh')
        }
        self.middleware_parameters = FCMiddlewareParameters(
            activation_function='tanh')
        # a single state-value head
        self.heads_parameters = [VHeadParameters()]
        self.async_training = True
        self.l2_regularization = 0            # no weight decay
        self.create_target_network = True     # keep a target copy of this network
        self.batch_size = 128
Example 3
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='relu')}
        self.middleware_parameters = FCMiddlewareParameters(activation_function='relu')
        self.heads_parameters = [VHeadParameters(initializer='xavier')]
        self.rescale_gradient_from_head_by_factor = [1]    # no rescaling of the single head's gradients
        self.optimizer_type = 'Adam'
        self.batch_size = 256
        self.async_training = False
        self.learning_rate = 0.0003    # 3e-4, see appendix D in the paper
        # tau is set in SoftActorCriticAlgorithmParameters.rate_for_copying_weights_to_target
        self.create_target_network = True
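The comment in Example 3 defers tau to SoftActorCriticAlgorithmParameters.rate_for_copying_weights_to_target. A hedged sketch of where that value would be set follows; only the class and attribute names come from the comment above, while the module path and the 0.005 value are illustrative assumptions.

from rl_coach.agents.soft_actor_critic_agent import SoftActorCriticAlgorithmParameters

algorithm_params = SoftActorCriticAlgorithmParameters()
# tau: soft-update rate used when copying the online weights to the target network
algorithm_params.rate_for_copying_weights_to_target = 0.005   # illustrative value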
Example 4
    def __init__(self):
        super().__init__()
        # two input embedders: a batch-normalized one for the observation and a
        # shallow one for the action, i.e. a critic that conditions on both state and action
        self.input_embedders_parameters = {
            'observation': InputEmbedderParameters(batchnorm=True),
            'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)
        }
        self.middleware_parameters = FCMiddlewareParameters()
        self.heads_parameters = [VHeadParameters()]
        self.optimizer_type = 'Adam'
        self.batch_size = 64
        self.async_training = False
        self.learning_rate = 0.001
        self.create_target_network = True
        self.shared_optimizer = True
        self.scale_down_gradients_by_number_of_workers_for_sync_training = False
Example 5
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
        self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
        self.heads_parameters = [VHeadParameters(), PPOHeadParameters()]
        self.batch_size = 64
        self.optimizer_type = 'Adam'
        self.clip_gradients = None                  # no gradient clipping
        self.use_separate_networks_per_head = True  # the value and policy heads do not share weights
        self.async_training = False
        self.l2_regularization = 0

        # The target network is used to freeze the old policy while updates are made to the
        # new one in train_network()
        self.create_target_network = True
        self.shared_optimizer = True
        self.scale_down_gradients_by_number_of_workers_for_sync_training = True
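In Coach presets, parameter classes like the ones above are usually reached through an agent's network_wrappers dictionary rather than instantiated directly. A hedged usage sketch is shown below; the ClippedPPOAgentParameters import, the 'main' key, and the overridden values are assumptions drawn from common rl_coach presets, not from the snippets above.

from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters

agent_params = ClippedPPOAgentParameters()
# override fields of the agent's main network parameters, as presets typically do
agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].batch_size = 64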