import random

import numpy as np


class Agent():
    """Double-DQN agent that wraps a local (online) and a target Keras network."""

    def __init__(self):
        self.dqn_local = DQN()
        self.batch_size = self.dqn_local.BATCH_SIZE
        print(self.dqn_local.dqn.summary())
        self.dqn_target = DQN()
        self.replay_memory = ReplayMemory(self.dqn_local)
        self.temp = 0  # counts learning steps since the last target-network update

    def learn(self):
        # Only train once the replay memory holds more than one full batch.
        if len(self.replay_memory) > self.batch_size:
            states, actions, rewards, next_states, dones = self.replay_memory.sample(
                self.dqn_local.INPUT_NODES)
            target = self.dqn_local.predict(states)
            target_val = self.dqn_target.predict(next_states)
            target_next = self.dqn_local.predict(next_states)
            # Double DQN: the local network selects the next action,
            # the target network evaluates it.
            max_action_values = np.argmax(target_next, axis=1)
            for i in range(self.batch_size):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    temp2 = self.dqn_local.GAMMA * target_val[i][max_action_values[i]]
                    target[i][actions[i]] = rewards[i] + temp2
            self.dqn_local.train(states, target)
            # Copy the local weights into the target network every UPDATE_RATE learning steps.
            if self.temp == self.dqn_local.UPDATE_RATE:
                self.update_target_weights()
                self.temp = 0
            else:
                self.temp = self.temp + 1

    def act(self, state, epsilon=0):
        # Epsilon-greedy action selection on the local network's Q-values.
        state = state.reshape((1,) + state.shape)
        action_values = self.dqn_local.predict(state)
        if random.random() > epsilon:
            action = np.argmax(action_values)
        else:
            action = random.randint(0, self.dqn_local.OUTPUT_NODES - 1)
        return action

    def experience(self, state, action, reward, next_state, done):
        self.replay_memory.memorize(state, action, reward, next_state, done)

    def update_target_weights(self):
        self.dqn_target.dqn.set_weights(self.dqn_local.dqn.get_weights())

    def save(self):
        self.dqn_local.dqn.save('saved/snake_dqn_2.h5')
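# A minimal, hypothetical driver for the Agent above, shown only to illustrate
# how act / experience / learn / save fit together. The Gym-style `env` argument,
# the episode count, and the epsilon schedule are assumptions and are not part of
# the original code.
def run_training(env, num_episodes=1000, eps=1.0, eps_min=0.01, eps_decay=0.995):
    agent = Agent()
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state, eps)                 # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.experience(state, action, reward, next_state, done)
            agent.learn()                                  # no-op until a full batch is stored
            state = next_state
        eps = max(eps_min, eps * eps_decay)                # assumed decay schedule
    agent.save()
    return agent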
class DQN(object):
    """
    A starter class to implement the Deep Q Network algorithm.

    TODOs specify the main areas where logic needs to be added.

    If you get a Box2D error with the pip version, try installing from source:

    > git clone https://github.com/pybox2d/pybox2d
    > pip install -e .
    """

    def __init__(self, env):
        self.env = env
        tf.reset_default_graph()
        self.sess = tf.Session()

        # A few starter hyperparameters
        self.gamma = 0.99
        self.h1 = 64
        self.h2 = 64
        self.h3 = 64
        self.l2_reg = 1e-6
        self.max_episode_step = 1000
        self.update_slow_target_every = 100
        self.batch_size = 1024
        self.eps_start = 1.0
        self.epsilon_end = 0.05
        self.epsilon_decay_length = 1e5
        self.epsilon_decay_exp = 0.97
        self.num_episodes = 0
        self.num_steps = 0
        self.epsilon_linear_step = (
            self.eps_start - self.epsilon_end) / self.epsilon_decay_length

        # memory
        self.replay_memory = ReplayMemory(1e6)
        # Perhaps you want to have some samples in the memory before starting to train?
        self.min_replay_size = 2000

        # define your training operations here...
        self.observation_input = tf.placeholder(
            tf.float32, shape=[None] + list(self.env.observation_space.shape))
        # input to slow target network (currently unused; target_q_values is built
        # on observation_input below)
        self.target_input = tf.placeholder(
            dtype=tf.float32,
            shape=[None] + list(self.env.observation_space.shape))

        with tf.variable_scope('q_network'):
            self.q_values = self.build_model(self.observation_input)
        with tf.variable_scope('target_network'):
            self.target_q_values = self.build_model(self.observation_input, False)

        self.q_network_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_network')
        self.q_target_network_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_network')

        # Ops that copy the online network's weights into the slowly-changing target network.
        update_slow_target_ops = []
        for i, slow_target_var in enumerate(self.q_target_network_vars):
            update_slow_target_op = slow_target_var.assign(self.q_network_vars[i])
            update_slow_target_ops.append(update_slow_target_op)
        self.update_slow_target_op = tf.group(*update_slow_target_ops,
                                              name='update_slow_target')

        # define your update operations here...
        self.saver = tf.train.Saver(tf.trainable_variables())

        self.target = tf.placeholder(tf.float32, shape=[None])
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)

        # Selecting the Q-value of the taken action is adapted from
        # https://github.com/dennybritz/reinforcement-learning/tree/master/DQN
        gather_indices = tf.range(self.batch_size) * tf.shape(self.q_values)[1] + self.actions
        self.action_predictions = tf.gather(tf.reshape(self.q_values, [-1]),
                                            gather_indices)

        # Huber loss between the TD targets and the predicted Q-values
        # (a squared-difference loss was tried as an alternative).
        self.loss = tf.losses.huber_loss(self.target, self.action_predictions)
        # Add an L2 regularization term for the weights (biases excluded).
        for var in self.q_network_vars:
            if 'bias' not in var.name:
                self.loss += self.l2_reg * 0.5 * tf.nn.l2_loss(var)
        self.minimizer = tf.train.AdamOptimizer(learning_rate=1e-6).minimize(self.loss)

        self.sess.run(tf.global_variables_initializer())

        self.writer = tf.summary.FileWriter(LOGDIR)
        self.writer.add_graph(self.sess.graph)
        self.count = 0

        # Summaries for TensorBoard
        tf.summary.scalar("loss", self.loss)
        tf.summary.histogram("q_values_hist", self.q_values)
        tf.summary.scalar("max_q_value", tf.reduce_max(self.q_values))
        self.summ = tf.summary.merge_all()

    def build_model(self, observation_input, trainable=True, scope='train'):
        """
        TODO: Define the tensorflow model

        Hint: You will need to define an input placeholder and output Q-values.

        Three fully connected ReLU layers followed by a linear output layer
        with one Q-value per action.
        """
        hidden = tf.layers.dense(observation_input, self.h1, activation=tf.nn.relu,
                                 trainable=trainable, name='dense')
        hidden_2 = tf.layers.dense(hidden, self.h2, activation=tf.nn.relu,
                                   trainable=trainable, name='dense_1')
        hidden_3 = tf.layers.dense(hidden_2, self.h3, activation=tf.nn.relu,
                                   trainable=trainable, name='dense_2')
        action_values = tf.squeeze(
            tf.layers.dense(hidden_3, self.env.action_space.n,
                            trainable=trainable, name="qValueLayer"))
        return action_values

    def select_action(self, obs, evaluation_mode=False):
        # Epsilon-greedy: explore with probability eps_start unless evaluating.
        if np.random.uniform(0, 1) < self.eps_start and not evaluation_mode:
            return self.env.action_space.sample()
        obs = np.reshape(obs, [1, self.env.observation_space.shape[0]])
        output = self.sess.run(self.q_values,
                               feed_dict={self.observation_input: obs})
        return np.argmax(output)

    def update(self):
        """
        TODO: Implement the functionality to update the network according to the
        Q-learning rule
        """
        if len(self.replay_memory) < self.min_replay_size:
            return
        new_samples = self.replay_memory.sample(self.batch_size)
        obs, action, next_obs, reward, done = zip(*new_samples)

        # TD targets: r for terminal transitions, r + gamma * max_a' Q_target(s', a') otherwise.
        targets = np.zeros(self.batch_size)
        next_obs_q_values = self.sess.run(
            self.target_q_values,
            feed_dict={self.observation_input: np.array(next_obs)})
        for i, sample in enumerate(new_samples):
            if sample.terminal:
                targets[i] = sample.reward
            else:
                targets[i] = np.max(next_obs_q_values[i]) * self.gamma + sample.reward

        _, summaries = self.sess.run(
            [self.minimizer, self.summ],
            feed_dict={
                self.target: targets,
                self.observation_input: np.array(obs),
                self.actions: np.array(action)
            })
        self.writer.add_summary(summaries, self.count)
        self.count += 1

    def train(self):
        """
        The training loop. This runs a single episode.

        TODO: Implement the following as desired:
            1. Storing transitions to the ReplayMemory
            2. Updating the network at some frequency
            3. Backing up the current parameters to a reference, target network
        """
        done = False
        obs = self.env.reset()
        for i in range(self.max_episode_step):
            if done:
                break
            action = self.select_action(obs, evaluation_mode=False)
            next_obs, reward, done, info = self.env.step(action)
            self.replay_memory.push(obs, action, next_obs, reward, done)
            obs = next_obs
            self.num_steps += 1
            self.update()

            # Start with an initial linear decay and after some time use exponential decay.
            if self.num_steps < self.epsilon_decay_length:
                self.eps_start -= self.epsilon_linear_step
            elif done:
                # After every episode decrease epsilon exponentially.
                self.eps_start = self.eps_start * self.epsilon_decay_exp

        self.num_episodes += 1
        # Periodically copy the online weights into the slow target network.
        if self.num_episodes % self.update_slow_target_every == 0:
            self.sess.run(self.update_slow_target_op)

    def eval(self, save_snapshot=True):
        """
        Run an evaluation episode; this renders the environment and can save a
        snapshot of the model.
        """
        total_reward = 0.0
        ep_steps = 0
        done = False
        obs = self.env.reset()
        while not done:
            self.env.render()
            action = self.select_action(obs, evaluation_mode=True)
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
        print("Evaluation episode: ", total_reward)
        print(str(self.eps_start) + '-----' + str(self.num_episodes))
        if save_snapshot:
            print("Saving state with Saver")
            self.saver.save(self.sess, 'models/dqn-model',
                            global_step=self.num_episodes)

    def fc_layer(self, input, size_in, size_out, name="fc"):
        # Helper for a fully connected layer with weight/bias/activation summaries
        # (not used by build_model above).
        with tf.name_scope(name):
            w = tf.Variable(tf.truncated_normal([size_in, size_out]), name="W")
            b = tf.Variable(tf.constant(0.0, shape=[size_out]), name="B")
            act = tf.matmul(input, w) + b
            tf.summary.histogram("weights", w)
            tf.summary.histogram("biases", b)
            tf.summary.histogram("activations", act)
            return act
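# A minimal NumPy sketch (not part of the class above) illustrating the
# gather_indices trick used in DQN.__init__ to pick out Q(s, a) for the actions
# that were actually taken: flattening the [batch, n_actions] Q-matrix row-major
# puts row i, column a at flat index i * n_actions + a. The sizes and values here
# are made up purely for demonstration.
import numpy as np

batch_size, n_actions = 3, 4
q_values = np.arange(batch_size * n_actions, dtype=np.float32).reshape(batch_size, n_actions)
actions = np.array([2, 0, 3])

gather_indices = np.arange(batch_size) * n_actions + actions
action_predictions = q_values.reshape(-1)[gather_indices]

# Both lines print the same per-sample Q-values: [ 2.  4. 11.]
print(action_predictions)
print(q_values[np.arange(batch_size), actions])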