import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam

# Actor, Critic, ReplayBuffer, OrnsteinUhlenbeckProcess, hard_update,
# soft_update, to_tensor and to_numpy are defined elsewhere in the project.
criterion = nn.MSELoss()  # assumed: the standard DDPG critic loss; not shown in the source
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # assumed default


class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        net_config = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2
        }

        # Actor and critic initialization
        self.actor = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        # Target networks start as exact copies of the online networks
        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        # Replay buffer and exploration noise
        self.memory = ReplayBuffer(args.memory_size)
        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions),
                                              sigma=0.2 * np.ones(nb_actions))
        self.last_state = None
        self.last_action = None

        # Hyperparameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        # CUDA
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def cuda(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def reset(self, obs):
        self.last_state = obs
        self.noise.reset()

    def observe(self, reward, state, done):
        # Store the transition (s, a, r, s', done) and advance the current state
        self.memory.append([self.last_state, self.last_action, reward, state, done])
        self.last_state = state

    def random_action(self):
        # Uniform random action for warm-up exploration
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.last_action = action
        return action.argmax() if self.discrete else action

    def select_action(self, state, apply_noise=False):
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([state]), device=device))).squeeze(0)
        self.train()
        if apply_noise:
            action = action + self.noise.sample()
        action = np.clip(action, -1., 1.)
        self.last_action = action
        return action.argmax() if self.discrete else action

    def update_policy(self):
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = \
            self.memory.sample_batch(self.batch_size)

        state = to_tensor(np.array(state_batch), device=device)
        action = to_tensor(np.array(action_batch), device=device)
        next_state = to_tensor(np.array(next_state_batch), device=device)

        # Compute the target Q value: r + gamma * (1 - done) * Q'(s', mu'(s'))
        with torch.no_grad():
            next_q_value = self.critic_target([next_state, self.actor_target(next_state)])
            target_q_value = to_tensor(reward_batch, device=device) \
                + self.discount * to_tensor(1 - terminal_batch.astype(np.float32),
                                            device=device) * next_q_value

        # Critic update: minimize the TD error
        self.critic_optim.zero_grad()
        q_values = self.critic([state, action])
        critic_loss = criterion(q_values, target_q_value)
        critic_loss.backward()
        self.critic_optim.step()

        # Actor update: maximize Q(s, mu(s))
        self.actor_optim.zero_grad()
        policy_loss = -self.critic([state, self.actor(state)]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Soft-update the target networks
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return to_numpy(-policy_loss), to_numpy(critic_loss), to_numpy(q_values.mean())

    def save_model(self, output, num=1):
        if self.use_cuda:
            self.actor.to(torch.device("cpu"))
            self.critic.to(torch.device("cpu"))
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.to(device)
            self.critic.to(device)

    def load_model(self, output, num=1):
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        if self.use_cuda:
            self.cuda()
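The class above leaves its helper functions undefined. A minimal sketch of what they would look like, assuming the standard DDPG target-update rules (a hard copy at initialization, Polyak averaging per step) and plain NumPy/PyTorch conversions; names and signatures follow the calls made above:

import numpy as np
import torch

def hard_update(target, source):
    # Copy every parameter of the online network into the target network
    for t_param, param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
    for t_param, param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * param.data + (1.0 - tau) * t_param.data)

def to_tensor(ndarray, device=None):
    # Convert a NumPy array to a float32 tensor on the given device
    return torch.as_tensor(ndarray, dtype=torch.float32, device=device)

def to_numpy(tensor):
    # Detach from the autograd graph and move to host memory
    return tensor.detach().cpu().numpy()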
def train(sess, env, actor, critic, RESTORE):
    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.n)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Store Q values and rewards for illustration purposes
    q_max_array = []
    reward_array = []

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):
            # Begin "Experimentation and Evaluation Phase": select the next
            # experimental action by adding noise to the action prescribed
            # by the policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            noise = exploration_noise.noise()
            a = a + noise

            # Take step with experimental action
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)

            # Add transition to replay buffer
            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r, terminal,
                              np.reshape(s2, (actor.s_dim,)))

            # Start learning once the memory holds at least a minibatch
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Find the target estimate used to update the Q-function;
                # predict_target determines the Q-value of the next state
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Complete target estimate: r(t+1) + gamma * Q(s(t+1), a(t+1))
                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Perform gradient descent to update the critic
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                # Perform "Learning" phase by moving policy parameters in the
                # direction of the deterministic policy gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            # If the episode is finished, print results
            if terminal:
                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j + 1)))
                q_max_array.append(ep_ave_max_q / float(j + 1))
                break

        # Greedy evaluation episode (no exploration noise)
        ep_reward = 0
        s = env.reset()
        for j in range(MAX_EP_STEPS):
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)
            ep_reward += r
            s = s2
            if terminal:
                print('Normal | Reward: %.2i' % int(ep_reward), " | Episode", i)
                reward_array.append(ep_reward)
                break

    # Max Q plot
    plt.plot(range(1, MAX_EPISODES + 1), q_max_array, 'b-')
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.savefig('Q.png')
    plt.show()

    # Reward plot
    plt.plot(range(1, MAX_EPISODES + 1), reward_array, 'g-')
    plt.xlabel('Episode Number')
    plt.ylabel('Reward')
    plt.savefig('Reward.png')
    plt.show()

    save_result([[str(i[0]) for i in q_max_array],
                 [str(i) for i in reward_array]])
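Both TensorFlow training loops draw exploration noise from an Ornstein-Uhlenbeck process, but the `OUNoise` class itself is not shown in the source. A minimal sketch under the usual DDPG parameterization (the theta, sigma and mu defaults below are assumptions):

import numpy as np

class OUNoise(object):
    # Temporally correlated exploration noise, as commonly paired with DDPG
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Return the process to its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the mean-reverting term pulls
        # the noise back toward mu while the Gaussian term perturbs it
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state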
def train(sess, env, actor, critic, RESTORE):
    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.shape[0])

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    totSteps = 0

    # Store Q values for illustration purposes
    q_max_array = []

    actor.learning_rate = MAX_ACTOR_LEARNING_RATE
    critic.learning_rate = MAX_CRITIC_LEARNING_RATE

    def export_keras_model():
        # Rebuild the (non-target) actor network as a Keras model so it can be
        # saved and reloaded independently of the TensorFlow session
        kmodel = Sequential()
        actVars = [var for var in tf.trainable_variables()
                   if 'non-target' in str(var)]
        kmodel.add(Dense(units=l1size, activation='tanh',
                         weights=[sess.run(actVars[0]), sess.run(actVars[1])],
                         input_dim=actor.s_dim))
        kmodel.add(Dense(units=l2size, activation='tanh',
                         weights=[sess.run(actVars[2]), sess.run(actVars[3])]))
        kmodel.add(Dense(units=1, activation='tanh',
                         weights=[sess.run(actVars[4]), sess.run(actVars[5])]))
        optimizer = optimizers.RMSprop(lr=0.00025, rho=0.9, epsilon=1e-06)
        kmodel.compile(loss="mse", optimizer=optimizer)
        return kmodel

    for i in range(MAX_EPISODES):
        s = env.reset()
        s = normalize(s)
        ep_reward = 0
        ep_ave_max_q = 0

        # Update learning rates using cosine annealing with warm restarts
        T_cur = i % LR_CYCLE
        actor.learning_rate = MIN_ACTOR_LEARNING_RATE + \
            0.5 * (MAX_ACTOR_LEARNING_RATE - MIN_ACTOR_LEARNING_RATE) * \
            (1 + np.cos(np.pi * T_cur / LR_CYCLE))
        critic.learning_rate = MIN_CRITIC_LEARNING_RATE + \
            0.5 * (MAX_CRITIC_LEARNING_RATE - MIN_CRITIC_LEARNING_RATE) * \
            (1 + np.cos(np.pi * T_cur / LR_CYCLE))

        for j in range(MAX_EP_STEPS):
            totSteps += 1

            # Begin "Experimentation and Evaluation Phase": select the next
            # experimental action by adding noise to the action prescribed
            # by the policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim, 1)))

            # If in a testing episode (every 50th), do not add noise
            if i < EXPLORATION_SIZE and not (i % 100 == 49 or i % 100 == 99):
                noise = exploration_noise.noise()
                a = a + noise

            # Constrain action
            a = np.clip(a, -15, 15)

            # Take step with experimental action
            s2, r, terminal, info = env.step(
                np.reshape(a.T, newshape=(env.action_space.shape[0],)),
                CONST_THROTTLE)
            s2 = normalize(s2)

            # Add transition to replay buffer if not a testing episode
            if i % 100 != 49 and i % 100 != 99:
                replay_buffer.add(np.reshape(s, (actor.s_dim, 1)),
                                  np.reshape(a, (actor.a_dim,)), r, terminal,
                                  np.reshape(s2, (actor.s_dim, 1)))

            # Start learning once the memory holds enough warm-up samples
            if replay_buffer.size() > MEMORY_WARMUP:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Find the target estimate used to update the Q-function;
                # predict_target determines the Q-value of the next state
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Complete target estimate: r(t+1) + gamma * Q(s(t+1), a(t+1))
                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Perform gradient descent to update the critic
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                # Perform "Learning" phase by moving policy parameters in the
                # direction of the deterministic policy gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            # If the episode is finished, print results and, on testing
            # episodes, snapshot the actor as a Keras model
            if terminal:
                if i % 100 == 49 or i % 100 == 99:
                    print("Testing")
                    export_keras_model().save(modelfile)
                else:
                    print("Training")
                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j + 1)))
                q_max_array.append(ep_ave_max_q / float(j + 1))
                print('Finished in ' + str(j) + ' steps')
                break

    # Max Q plot
    plt.plot(q_max_array)
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.show()

    # Export the final actor network
    kmodel = export_keras_model()
    kmodel.summary()
    kmodel.save(modelfile)
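The second loop's learning-rate schedule is cosine annealing with warm restarts (SGDR): eta = eta_min + 0.5 * (eta_max - eta_min) * (1 + cos(pi * T_cur / T)), restarting at eta_max at the start of each cycle. A standalone sketch of the schedule, with illustrative constants rather than the values used above:

import numpy as np

def cosine_annealing(episode, lr_min, lr_max, cycle_len):
    # eta = eta_min + 0.5 * (eta_max - eta_min) * (1 + cos(pi * T_cur / T))
    t_cur = episode % cycle_len
    return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(np.pi * t_cur / cycle_len))

# Example: the rate decays from lr_max to lr_min over each 100-episode cycle,
# then restarts at lr_max
for i in [0, 25, 50, 75, 100]:
    print(i, cosine_annealing(i, lr_min=1e-4, lr_max=1e-3, cycle_len=100))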