Example #1
  def __init__(self,
               observation_spec,
               action_spec,
               actor_lr=3e-4,
               critic_lr=3e-4,
               discount=0.99,
               tau=0.005,
               f='bin_max',
               temperature=0.05):
    """Creates networks.

    Args:
      observation_spec: environment observation spec.
      action_spec: Action spec.
      actor_lr: Actor learning rate.
      critic_lr: Critic learning rate.
      discount: MDP discount.
      tau: Soft target update parameter.
      f: Advantage transformation.
      temperature: Temperature parameter.
    """
    assert len(observation_spec.shape) == 1
    state_dim = observation_spec.shape[0]

    self.actor = policies.DiagGuassianPolicy(state_dim, action_spec)
    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

    self.critic_learner = critic.CriticLearner(state_dim, action_spec.shape[0],
                                               critic_lr, discount, tau)

    self.f = f
    self.temperature = temperature
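
The constructor above only shows the `__init__` body; for orientation, here is a minimal construction sketch assuming tf_agents-style array specs. The class name `Agent`, the dimensions, and the spec values are illustrative assumptions, not part of the example.

import numpy as np
from tf_agents.specs import array_spec

# Illustrative specs: a flat 17-dimensional observation and a 6-dimensional
# continuous action bounded in [-1, 1].
observation_spec = array_spec.ArraySpec(shape=(17,), dtype=np.float32)
action_spec = array_spec.BoundedArraySpec(
    shape=(6,), dtype=np.float32, minimum=-1.0, maximum=1.0)

# Hypothetical enclosing class name; the example above defines only __init__.
# agent = Agent(observation_spec, action_spec, f='bin_max', temperature=0.05)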
Example #2
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 cross_norm=False):
        """Creates networks.

        Args:
          observation_spec: environment observation spec.
          action_spec: Action spec.
          actor_lr: Actor learning rate.
          critic_lr: Critic learning rate.
          discount: MDP discount.
          tau: Soft target update parameter.
          cross_norm: Whether to fit cross norm critic.
        """
        assert len(observation_spec.shape) == 1
        state_dim = observation_spec.shape[0]

        if cross_norm:
            beta_1 = 0.0
        else:
            beta_1 = 0.9

        self.actor = policies.DeterministicPolicy(state_dim, action_spec, 0.3)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr,
                                                        beta_1=beta_1)

        if cross_norm:
            self.critic_learner = critic.CrossNormCriticLearner(
                state_dim, action_spec.shape[0], critic_lr, discount, tau)
        else:
            self.critic_learner = critic.CriticLearner(state_dim,
                                                       action_spec.shape[0],
                                                       critic_lr, discount,
                                                       tau)
Example #3
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 num_augmentations=0):
        """Creates networks.

        Args:
          observation_spec: environment observation spec.
          action_spec: Action spec.
          actor_lr: Actor learning rate.
          critic_lr: Critic learning rate.
          discount: MDP discount.
          tau: Soft target update parameter.
          num_augmentations: Number of DrQ-style random crops.
        """
        del num_augmentations
        self.bc = None
        assert len(observation_spec.shape) == 1
        state_dim = observation_spec.shape[0]

        self.actor = policies.CVAEPolicy(state_dim, action_spec,
                                         action_spec.shape[0] * 2)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

        self.critic_learner = critic.CriticLearner(state_dim,
                                                   action_spec.shape[0],
                                                   critic_lr, discount, tau)

        self.model_dict = {
            'critic_learner': self.critic_learner,
            'actor': self.actor,
            'actor_optimizer': self.actor_optimizer
        }
Example #4
  def __init__(self,
               observation_spec,
               action_spec,
               actor_lr=3e-4,
               critic_lr=3e-4,
               discount=0.99,
               tau=0.005,
               num_augmentations=1):
    """Creates networks.

    Args:
      observation_spec: environment observation spec.
      action_spec: Action spec.
      actor_lr: Actor learning rate.
      critic_lr: Critic learning rate.
      discount: MDP discount.
      tau: Soft target update parameter.
      num_augmentations: Number of DrQ-style augmentations to perform on pixels.
    """

    self.num_augmentations = num_augmentations
    # An empty action shape indicates a discrete action spec.
    self.discrete_actions = len(action_spec.shape) == 0

    actor_kwargs = {}
    critic_kwargs = {}

    if observation_spec.shape == (64, 64, 3):
      # IMPALA for Procgen
      def conv_stack():
        return make_impala_cnn_network(
            depths=[16, 32, 32], use_batch_norm=False, dropout_rate=0.)

      state_dim = 256
    else:
      # Reduced architecture for DMC
      def conv_stack():
        return ConvStack(observation_spec.shape)
      state_dim = 50

    conv_stack_actor = conv_stack()
    conv_stack_critic = conv_stack()
    conv_target_stack_critic = conv_stack()

    if observation_spec.shape == (64, 64, 3):
      conv_stack_actor.output_size = state_dim
      conv_stack_critic.output_size = state_dim
      conv_target_stack_critic.output_size = state_dim
    # Combine and stop_grad some of the above conv stacks
    actor_kwargs['encoder'] = ImageEncoder(
        conv_stack_actor, feature_dim=state_dim, bprop_conv_stack=True)
    critic_kwargs['encoder'] = ImageEncoder(
        conv_stack_critic, feature_dim=state_dim, bprop_conv_stack=True)
    # Note: the target critic does not share any weights.
    critic_kwargs['encoder_target'] = ImageEncoder(
        conv_target_stack_critic, feature_dim=state_dim, bprop_conv_stack=True)

    if self.num_augmentations == 0:
      dummy_state = tf.constant(
          np.zeros(shape=[1] + list(observation_spec.shape)))
    else:  # account for padding of +4 everywhere and then cropping out 68
      dummy_state = tf.constant(np.zeros(shape=[1, 68, 68, 3]))

    @tf.function
    def init_models():
      actor_kwargs['encoder'](dummy_state)
      critic_kwargs['encoder'](dummy_state)
      critic_kwargs['encoder_target'](dummy_state)

    init_models()

    if self.discrete_actions:
      action_dim = action_spec.maximum.item() + 1
      self.actor = policies.CVAEPolicyPixelsDiscrete(
          state_dim,
          action_spec,
          action_dim * 2,
          encoder=actor_kwargs['encoder'])

    else:
      action_dim = action_spec.shape[0]
      self.actor = policies.CVAEPolicyPixels(
          state_dim,
          action_spec,
          action_dim * 2,
          encoder=actor_kwargs['encoder'])

    self.action_dim = action_dim
    self.state_dim = state_dim

    if self.discrete_actions:
      self.action_encoder = tf.keras.Sequential(
          [
              tf.keras.layers.Dense(
                  state_dim, use_bias=True
              ),  # , kernel_regularizer=tf.keras.regularizers.l2(WEIGHT_DECAY)
              tf.keras.layers.ReLU(),
              # tf.keras.layers.BatchNormalization(),
              tf.keras.layers.Dense(action_dim)
          ],
          name='action_encoder')
      dummy_psi_act = tf.constant(np.zeros(shape=[1, state_dim]))
      self.action_encoder(dummy_psi_act)
    else:
      self.action_encoder = None

    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

    self.critic_learner = critic.CriticLearner(
        state_dim,
        action_dim,
        critic_lr,
        discount,
        tau,
        encoder=critic_kwargs['encoder'],
        encoder_target=critic_kwargs['encoder_target'])

    self.bc = None
    self.threshold = 0.3

    self.model_dict = {
        'critic_learner': self.critic_learner,
        'action_encoder': self.action_encoder,
        'actor': self.actor,
        'actor_optimizer': self.actor_optimizer
    }
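
To make the pixel and discrete-action branches above concrete, here is a small sketch of specs that would exercise them, again assuming tf_agents-style array specs; the class name and the Procgen-like numbers are assumptions for illustration only.

import numpy as np
from tf_agents.specs import array_spec

# A (64, 64, 3) observation selects the IMPALA conv stack (state_dim = 256).
observation_spec = array_spec.ArraySpec(shape=(64, 64, 3), dtype=np.uint8)

# An empty action shape makes discrete_actions True; with maximum=14,
# action_dim becomes 14 + 1 = 15.
action_spec = array_spec.BoundedArraySpec(
    shape=(), dtype=np.int64, minimum=0, maximum=14)

# Hypothetical class name; with num_augmentations=1 the encoders are
# initialized for 68x68 inputs (the padded-then-cropped size).
# agent = Agent(observation_spec, action_spec, num_augmentations=1)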
Example #5
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 target_update_period=1,
                 target_entropy=0.0,
                 cross_norm=False,
                 pcl_actor_update=False):
        """Creates networks.

        Args:
          observation_spec: environment observation spec.
          action_spec: Action spec.
          actor_lr: Actor learning rate.
          critic_lr: Critic learning rate.
          alpha_lr: Temperature learning rate.
          discount: MDP discount.
          tau: Soft target update parameter.
          target_update_period: Target network update period.
          target_entropy: Target entropy.
          cross_norm: Whether to fit cross norm critic.
          pcl_actor_update: Whether to use PCL actor update.
        """
        actor_kwargs = {}
        critic_kwargs = {}

        if len(observation_spec.shape) == 3:  # Image observations.
            # DRQ encoder params.
            # https://github.com/denisyarats/drq/blob/master/config.yaml#L73
            state_dim = 50

            # Actor and critic encoders share conv weights only.
            conv_stack = ConvStack(observation_spec.shape)

            actor_kwargs['encoder'] = ImageEncoder(conv_stack,
                                                   state_dim,
                                                   bprop_conv_stack=False)
            actor_kwargs['hidden_dims'] = (1024, 1024)

            critic_kwargs['encoder'] = ImageEncoder(conv_stack,
                                                    state_dim,
                                                    bprop_conv_stack=True)
            critic_kwargs['hidden_dims'] = (1024, 1024)

            if not cross_norm:
                # Note: the target critic does not share any weights.
                critic_kwargs['encoder_target'] = ImageEncoder(
                    ConvStack(observation_spec.shape),
                    state_dim,
                    bprop_conv_stack=True)

        else:  # 1D state observations.
            assert len(observation_spec.shape) == 1
            state_dim = observation_spec.shape[0]

        if cross_norm:
            beta_1 = 0.0
        else:
            beta_1 = 0.9

        self.actor = policies.DiagGuassianPolicy(state_dim, action_spec,
                                                 **actor_kwargs)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr,
                                                        beta_1=beta_1)

        self.log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_lr,
                                                        beta_1=beta_1)

        if cross_norm:
            assert 'encoder_target' not in critic_kwargs
            self.critic_learner = critic.CrossNormCriticLearner(
                state_dim, action_spec.shape[0], critic_lr, discount, tau,
                **critic_kwargs)
        else:
            self.critic_learner = critic.CriticLearner(
                state_dim, action_spec.shape[0], critic_lr, discount, tau,
                target_update_period, **critic_kwargs)

        self.target_entropy = target_entropy
        self.discount = discount

        self.pcl_actor_update = pcl_actor_update
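
Finally, a sketch of how this SAC-style learner might be instantiated from pixel observations, with the same caveats: tf_agents-style specs, a placeholder class name, and an illustrative frame-stacked shape. The target_entropy value follows the common -|A| heuristic rather than this constructor's default of 0.0.

import numpy as np
from tf_agents.specs import array_spec

# Any 3-D observation shape routes through the DRQ-style ImageEncoder
# (state_dim = 50); (84, 84, 9) is only an example of stacked frames.
observation_spec = array_spec.ArraySpec(shape=(84, 84, 9), dtype=np.uint8)
action_spec = array_spec.BoundedArraySpec(
    shape=(6,), dtype=np.float32, minimum=-1.0, maximum=1.0)

# Hypothetical class name; target_entropy = -action_dim is a common choice.
# agent = Agent(observation_spec, action_spec,
#               target_entropy=-float(action_spec.shape[0]),
#               cross_norm=False)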