def _build_net(self):
    # inputs
    self.s1 = tf.placeholder(
        tf.float32,
        [None, self.feature_size[0], self.feature_size[1], self.feature_size[2]],
        name="s1")
    self.s2 = tf.placeholder(tf.float32, [None, self.sensor_size], name="s2")
    self.s1_ = tf.placeholder(
        tf.float32,
        [None, self.feature_size[0], self.feature_size[1], self.feature_size[2]],
        name="s1_")
    self.s2_ = tf.placeholder(tf.float32, [None, self.sensor_size], name="s2_")
    self.r = tf.placeholder(tf.float32, [None, ], name='r')      # input Reward
    self.a = tf.placeholder(tf.int32, [None, ], name='a')        # input Action
    self.end = tf.placeholder(tf.float32, [None, ], name='end')  # continuation mask: 0 at episode end

    # ------------------ build Q Network ------------------
    # Evaluate Net
    self.q_eval = models.QNetwork(self.s1, self.s2, self.n_actions, 'Qnet_eval', reuse=False)
    with tf.variable_scope('q_eval'):
        # select the Q value of the action actually taken in each transition
        a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
        self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)  # shape=(None,)

    # Target Net
    self.q_next = models.QNetwork(self.s1_, self.s2_, self.n_actions, 'Qnet_target', reuse=False)
    with tf.variable_scope("q_target"):
        q_target = self.r + self.end * self.gamma * tf.reduce_max(
            self.q_next, axis=1, name='Qmax_s_')  # shape=(None,)
        # no gradients flow into the target network
        self.q_target = tf.stop_gradient(q_target)

    # Loss & Train
    with tf.variable_scope('loss'):
        # sqrt of the squared difference, i.e. the absolute TD error
        self.td_error = tf.sqrt(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error'))
        self.loss = tf.reduce_mean(self.td_error)
    with tf.variable_scope('train'):
        self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
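# A minimal sketch (not part of the original class) of the hard target-network
# sync op that usually accompanies a graph like _build_net: it copies every
# variable under the 'Qnet_eval' scope into its counterpart under 'Qnet_target'.
# The method name _build_target_sync is hypothetical; it assumes the TF1
# variable scopes created by models.QNetwork above.
def _build_target_sync(self):
    eval_vars = sorted(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Qnet_eval'),
        key=lambda v: v.name)
    target_vars = sorted(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Qnet_target'),
        key=lambda v: v.name)
    # group the per-variable assignments into a single op to run periodically
    self._sync_op = tf.group(*[tf.assign(t, e) for t, e in zip(target_vars, eval_vars)])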
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = models.QNetwork(state_size, action_size, seed,
                                          fc1_units=2 * state_size).to(device)
    self.qnetwork_target = models.QNetwork(state_size, action_size, seed,
                                           fc1_units=2 * state_size).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = replay_buffer.ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    print("The network used for the Simple Double Q-Learning agent:")
    print(self.qnetwork_local)
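# A minimal sketch (assumed, in the style of this agent, not taken from its
# source) of the epsilon-greedy act() method that pairs with the __init__
# above. It assumes the same module-level `device`, plus numpy imported as
# `np` and the standard-library `random` module.
def act(self, state, eps=0.0):
    """Return an epsilon-greedy action for the given state."""
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    self.qnetwork_local.eval()
    with torch.no_grad():
        action_values = self.qnetwork_local(state)
    self.qnetwork_local.train()
    # exploit the greedy action with probability 1 - eps, otherwise explore
    if random.random() > eps:
        return np.argmax(action_values.cpu().data.numpy())
    return random.choice(np.arange(self.action_size))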
def __init__(self, obs_dim, actions_list, seed=0, params=None, logger=None):
    """
    Initialize a Deep Q-Network agent.

    Parameters
    ----------
    obs_dim: tuple
        Dimension of observations.
    actions_list: list
        List of possible actions.
    seed: int
        Random seed.
    params: dict
        DQN hyperparameters.
    logger: Logger object.
    """
    if params is None:
        params = dqn_params()
    self.params = params
    if not torch.cuda.is_available() and self.params['device'] != 'cpu':
        print("GPU is not available. Selecting CPU...")
        self.params['device'] = 'cpu'

    # initialize agent parameters
    self.obs_dim = obs_dim
    self.actions = actions_list
    self.num_act = len(actions_list)
    self.step_count = 0

    # logger for storing training data
    self.logger = logger

    # set the random seed
    self.seed = torch.manual_seed(seed)

    # create local and target Q networks
    self.Qnet = models.QNetwork(self.obs_dim, self.num_act, seed).to(self.params['device'])
    self.target_Qnet = models.QNetwork(self.obs_dim, self.num_act, seed).to(self.params['device'])
    # copy network weights so the two networks start identical
    self.target_Qnet.load_state_dict(self.Qnet.state_dict())

    # initialize optimizer
    self.optimizer = optim.Adam(self.Qnet.parameters(), lr=self.params['lr'])

    # initialize experience buffer
    self.buffer = utils.ExperienceBuffer(obs_dim, max_len=self.params['buffer_size'])
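# A hedged sketch (not from the original class) of the Polyak soft update that
# typically keeps target_Qnet tracking Qnet between learning steps; reading
# `tau` from self.params is an assumption about the hyperparameter dict.
def soft_update(self, tau=None):
    """Blend local weights into the target: theta_t <- tau*theta + (1 - tau)*theta_t."""
    tau = tau if tau is not None else self.params.get('tau', 1e-3)
    for target_p, local_p in zip(self.target_Qnet.parameters(), self.Qnet.parameters()):
        target_p.data.copy_(tau * local_p.data + (1.0 - tau) * target_p.data)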
def _build_model(self):
    acts = dict(elu=tf.nn.elu, relu=tf.nn.relu, swish=tf.nn.swish,
                leaky_relu=tf.nn.leaky_relu)
    cnn_act = acts[self._c.cnn_act]
    act = acts[self._c.dense_act]
    # world model: encoder, latent dynamics (RSSM), decoder, and auxiliary heads
    self._encode = models.ConvEncoder(self._c.cnn_depth, cnn_act)
    self._dynamics = models.RSSM(self._c.stoch_size, self._c.deter_size, self._c.deter_size)
    self._decode = models.ConvDecoder(self._c.cnn_depth, cnn_act)
    self._contrastive = models.ContrastiveObsModel(self._c.deter_size, self._c.deter_size * 2)
    self._reward = models.DenseDecoder((), 2, self._c.num_units, act=act)
    if self._c.pcont:
        self._pcont = models.DenseDecoder((), 3, self._c.num_units, 'binary', act=act)
    # critic: a value head plus an ensemble of Q networks
    self._value = models.DenseDecoder((), 3, self._c.num_units, act=act)
    self._Qs = [models.QNetwork(3, self._c.num_units, act=act)
                for _ in range(self._c.num_Qs)]
    self._actor = models.ActionDecoder(self._actdim, 4, self._c.num_units,
                                       self._c.action_dist,
                                       init_std=self._c.action_init_std, act=act)
    model_modules = [self._encode, self._dynamics, self._contrastive,
                     self._reward, self._decode]
    if self._c.pcont:
        model_modules.append(self._pcont)
    # one optimizer per component group, sharing weight decay and grad clipping
    Optimizer = functools.partial(tools.Adam, wd=self._c.weight_decay,
                                  clip=self._c.grad_clip,
                                  wdpattern=self._c.weight_decay_pattern)
    self._model_opt = Optimizer('model', model_modules, self._c.model_lr)
    self._value_opt = Optimizer('value', [self._value], self._c.value_lr)
    self._actor_opt = Optimizer('actor', [self._actor], self._c.actor_lr)
    self._q_opts = [Optimizer('qs', [qnet], self._c.value_lr) for qnet in self._Qs]
    if self._c.use_sac:
        self._sac = soft_actor_critic.SAC(self._actor, self._Qs, self._actor_opt,
                                          self._q_opts, self._actspace)
    # run one training step so all variables are created before use
    self.train(next(self._dataset))
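# A hedged sketch (not the original training code) of how an ensemble like
# self._Qs is commonly consumed: take the minimum over the heads, the clipped
# double-Q trick, for a conservative value estimate. It assumes each QNetwork
# maps a concatenated [feature, action] tensor to a scalar Q value; the method
# name _min_q is hypothetical.
def _min_q(self, feat, action):
    inputs = tf.concat([feat, action], -1)
    qs = [qnet(inputs) for qnet in self._Qs]  # one estimate per ensemble head
    # minimum over heads reduces the overestimation bias of a single critic
    return tf.reduce_min(tf.stack(qs, 0), 0)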