Example #1
    def _build_net(self):
        # inputs
        self.s1 = tf.placeholder(tf.float32, [None, self.feature_size[0], self.feature_size[1], self.feature_size[2]], name="s1")
        self.s2 = tf.placeholder(tf.float32, [None, self.sensor_size], name="s2")
        self.s1_ = tf.placeholder(tf.float32, [None, self.feature_size[0], self.feature_size[1], self.feature_size[2]], name="s1_")
        self.s2_ = tf.placeholder(tf.float32, [None, self.sensor_size], name="s2_")
        self.r = tf.placeholder(tf.float32, [None, ], name='r')  # input Reward
        self.a = tf.placeholder(tf.int32, [None, ], name='a')  # input Action
        self.end = tf.placeholder(tf.float32, [None, ], name='end')  # continuation mask: 0.0 when the next state is terminal
        
        # ------------------ build Q Network ------------------
        # Evaluate Net
        self.q_eval = models.QNetwork(self.s1, self.s2, self.n_actions, 'Qnet_eval', reuse=False)

        with tf.variable_scope('q_eval'):
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)  # shape=(None, )

        # Target Net
        self.q_next = models.QNetwork(self.s1_, self.s2_, self.n_actions, 'Qnet_target', reuse=False)
        
        with tf.variable_scope("q_target"):
            q_target = self.r + self.end * self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_') # shape=(None, )
            self.q_target = tf.stop_gradient(q_target)

        # Loss & Train
        with tf.variable_scope('loss'):
            # absolute TD error |q_target - q_eval_wrt_a| (sqrt of the squared difference)
            self.td_error = tf.sqrt(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error'))
            self.loss = tf.reduce_mean(self.td_error)

        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
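This graph computes the target y = r + end * gamma * max_a' Q_target(s', a') but never copies weights from the 'Qnet_eval' scope into 'Qnet_target'. Code following this pattern usually appends a hard target-update op at the end of _build_net; the sketch below assumes exactly that, and the attribute name target_replace_op is hypothetical:

        # hard update: copy every evaluation-network variable into the target network
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Qnet_eval')
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Qnet_target')
        with tf.variable_scope('hard_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

Running self.target_replace_op in the session every N learning steps keeps the target network fixed between syncs, which stabilizes the bootstrapped targets.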
Example #2
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = models.QNetwork(
            state_size, action_size, seed,
            fc1_units=2 * state_size).to(device)
        self.qnetwork_target = models.QNetwork(
            state_size, action_size, seed,
            fc1_units=2 * state_size).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = replay_buffer.ReplayBuffer(action_size, BUFFER_SIZE,
                                                 BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        print("The network used for the Simple Double Q-Learning agent:")
        print(self.qnetwork_local)
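Agents built this way typically keep qnetwork_target in sync with a Polyak (soft) update after each learning step. A minimal sketch, assuming a mixing coefficient tau is passed in; the method name soft_update is conventional and not taken from the snippet above:

    def soft_update(self, local_model, target_model, tau):
        """Soft-update target parameters: theta_target = tau*theta_local + (1 - tau)*theta_target."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)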
Example #3
    def __init__(self,
                 obs_dim,
                 actions_list,
                 seed=0,
                 params=None,
                 logger=None):
        """
        Initialize a Deep Q-Network agent.

        Parameters
        ----------
        obs_dim: tuple
            Dimension of observations.
        actions_list: list
            List of possible actions.
        seed: int
            Random seed.
        params: dict
            DQN hyperparameters.
        logger:
            Logger object.

        """

        if params is None:
            params = dqn_params()
        self.params = params

        if not torch.cuda.is_available() and self.params['device'] != 'cpu':
            print("GPU is not available. Selecting CPU...")
            self.params['device'] = 'cpu'

        # initialize agent parameters
        self.obs_dim = obs_dim
        self.actions = actions_list
        self.num_act = len(actions_list)
        self.step_count = 0

        # logger for storing training data
        self.logger = logger

        # set the random seed
        self.seed = torch.manual_seed(seed)

        # create local and target Q networks
        self.Qnet = models.QNetwork(self.obs_dim, self.num_act,
                                    seed).to(self.params['device'])
        self.target_Qnet = models.QNetwork(self.obs_dim, self.num_act,
                                           seed).to(self.params['device'])
        self.target_Qnet.load_state_dict(
            self.Qnet.state_dict())  # copy network weights to make identical

        # initialize optimizer
        self.optimizer = optim.Adam(self.Qnet.parameters(),
                                    lr=self.params['lr'])

        # initialize experience buffer
        self.buffer = utils.ExperienceBuffer(
            obs_dim, max_len=self.params['buffer_size'])
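A constructor like this is normally paired with an epsilon-greedy action-selection method that queries self.Qnet. The sketch below is an assumption about how that method could look; the name act, the NumPy observation format, and the module-level random/torch imports are all illustrative choices, not taken from the snippet:

    def act(self, obs, eps=0.0):
        """Pick an action from self.actions, epsilon-greedily with respect to self.Qnet."""
        obs_t = torch.from_numpy(obs).float().unsqueeze(0).to(self.params['device'])
        self.Qnet.eval()
        with torch.no_grad():
            q_values = self.Qnet(obs_t)
        self.Qnet.train()
        if random.random() > eps:
            # greedy action: index of the largest Q-value
            return self.actions[int(torch.argmax(q_values, dim=1).item())]
        # exploratory action
        return random.choice(self.actions)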
Example #4
    def _build_model(self):
        acts = dict(elu=tf.nn.elu,
                    relu=tf.nn.relu,
                    swish=tf.nn.swish,
                    leaky_relu=tf.nn.leaky_relu)
        cnn_act = acts[self._c.cnn_act]
        act = acts[self._c.dense_act]
        self._encode = models.ConvEncoder(self._c.cnn_depth, cnn_act)
        self._dynamics = models.RSSM(self._c.stoch_size, self._c.deter_size,
                                     self._c.deter_size)
        self._decode = models.ConvDecoder(self._c.cnn_depth, cnn_act)
        self._contrastive = models.ContrastiveObsModel(self._c.deter_size,
                                                       self._c.deter_size * 2)
        self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act)
        if self._c.pcont:
            self._pcont = models.DenseDecoder((),
                                              3,
                                              self._c.num_units,
                                              'binary',
                                              act=act)
        self._value = models.DenseDecoder((), 3, self._c.num_units, act=act)
        self._Qs = [
            models.QNetwork(3, self._c.num_units, act=act)
            for _ in range(self._c.num_Qs)
        ]
        self._actor = models.ActionDecoder(self._actdim,
                                           4,
                                           self._c.num_units,
                                           self._c.action_dist,
                                           init_std=self._c.action_init_std,
                                           act=act)
        model_modules = [
            self._encode, self._dynamics, self._contrastive, self._reward,
            self._decode
        ]
        if self._c.pcont:
            model_modules.append(self._pcont)
        Optimizer = functools.partial(tools.Adam,
                                      wd=self._c.weight_decay,
                                      clip=self._c.grad_clip,
                                      wdpattern=self._c.weight_decay_pattern)
        self._model_opt = Optimizer('model', model_modules, self._c.model_lr)
        self._value_opt = Optimizer('value', [self._value], self._c.value_lr)
        self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr)
        self._q_opts = [
            Optimizer('qs', [qnet], self._c.value_lr) for qnet in self._Qs
        ]

        if self._c.use_sac:
            self._sac = soft_actor_critic.SAC(self._actor, self._Qs,
                                              self._actor_opt, self._q_opts,
                                              self._actspace)

        # run one training step so all lazily built variables (and optimizer state) are created
        self.train(next(self._dataset))
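Here models.QNetwork is instantiated as an ensemble (self._Qs), each member with its own optimizer; when such an ensemble backs a SAC-style critic, the usual reduction is a clipped double-Q minimum over the ensemble. A minimal sketch of that reduction only, assuming each QNetwork is callable on a concatenated feature/action tensor; the call convention and the names features and actions are assumptions, not taken from the snippet:

        # clipped double-Q: evaluate every critic on the same input and keep the
        # element-wise minimum to damp overestimation bias
        q_inputs = tf.concat([features, actions], axis=-1)   # hypothetical tensors
        q_values = tf.stack([qnet(q_inputs) for qnet in self._Qs], axis=0)
        q_min = tf.reduce_min(q_values, axis=0)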