class Agent(object):
    def __init__(self, state_size, action_size, max_action, minibatch_size,
                 a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action
        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size,
                                   self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size,
                                          self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size, self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size, self.critic_lr)

        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(1e6)
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)

    def step(self, s, a, r, s_1, t, train=True):
        self.replay_buffer.add(s, a, r, s_1, t)
        if train and self.replay_buffer.size() >= self.MINIBATCH_SIZE:
            minibatch = self.replay_buffer.sample_batch(self.MINIBATCH_SIZE)
            self.learn(minibatch)

    @tf.function
    def critic_train(self, minibatch):
        s_batch, a_batch, r_batch, s_1_batch, t_batch = minibatch
        mu_prime = self.actor_target_network(s_1_batch)
        q_prime = self.critic_target_network([s_1_batch, mu_prime])
        ys = r_batch + self.GAMMA * (1 - t_batch) * q_prime
        with tf.GradientTape() as tape:
            predicted_qs = self.critic_network([s_batch, a_batch])
            loss = (predicted_qs - ys) * (predicted_qs - ys)
            loss = tf.reduce_mean(loss)
        dloss = tape.gradient(loss, self.critic_network.trainable_weights)
        self.critic_optimizer.apply_gradients(
            zip(dloss, self.critic_network.trainable_weights))

    def actor_train(self, minibatch):
        s_batch, _, _, _, _ = minibatch
        with tf.GradientTape() as tape:
            next_action = self.actor_network(s_batch)
            actor_loss = -tf.reduce_mean(self.critic_network([s_batch, next_action]))
        actor_grad = tape.gradient(actor_loss, self.actor_network.trainable_weights)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_network.trainable_weights))

    def learn(self, minibatch):
        s, a, r, s_1, t = minibatch
        s = np.array(s, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.state_size)
        s = tf.convert_to_tensor(s)
        a = np.array(a, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.action_size)
        a = tf.convert_to_tensor(a)
        r = np.array(r, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)
        s_1 = np.array(s_1, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.state_size)
        s_1 = tf.convert_to_tensor(s_1)
        t = np.array(t, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)
        minibatch = (s, a, r, s_1, t)
        self.critic_train(minibatch)
        self.actor_train(minibatch)
        self.update_target_networks()

    def act(self, state, t=0):
        state = np.array(state).reshape(1, self.state_size)
        action = self.actor_network(state)[0]
        noisy = self.noise.get_action(action, t)
        return action, noisy

    def update_target_networks(self):
        self.actor_target_network.set_weights(
            np.array(self.actor_network.get_weights()) * self.TAU +
            np.array(self.actor_target_network.get_weights()) * (1 - self.TAU))
        self.critic_target_network.set_weights(
            np.array(self.critic_network.get_weights()) * self.TAU +
            np.array(self.critic_target_network.get_weights()) * (1 - self.TAU))
# Select action randomly or according to policy
if t < args.start_timesteps:
    action = env.sample(group_name)
    # print(f"Sampled action: {action}")
else:
    action = (policy.select_action(np.array(state)) +
              np.random.normal(0, max_action * args.expl_noise,
                               size=action_dim)).clip(-max_action, max_action)

# Perform action
next_state, rewards, done = env.step(action, group_name)
reward, Rsim, Robs, Rcstr = unpack_rewards(rewards)

# Store data in replay buffer
replay_buffer.add(state, action, next_state, reward, float(done))

state = next_state
episode_reward += reward
episode_Rsim += Rsim
episode_Robs += Robs
episode_Rcstr += Rcstr

# Train agent after collecting sufficient data
if t >= args.start_timesteps:
    policy.train(replay_buffer, args.batch_size)

if done:
    # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
    print(f"Total T: {t+1} Episode Num: {episode_num+1} "
          f"Episode T: {episode_timesteps} Reward: {episode_reward:.3f} "
          f"Rsim: {episode_Rsim:.3f} Robs: {episode_Robs:.3f} Rcstr: {episode_Rcstr:.3f}")
class Enemy():
    def __init__(self, x, y, size, state_size, action_size, seed, mass=1):
        self.x = x
        self.y = y
        self.size = size
        self.colour = (0, 0, 255)
        self.thickness = 0
        self.speed = 0
        self.angle = 0
        self.mass = mass
        self.drag = (self.mass / (self.mass + Constants.MASS_OF_AIR)) ** self.size
        ####################################
        self.state_size = state_size
        self.action_size = action_size
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(Constants.DEVICE)
        self.qnetwork_target = QNetwork(state_size, action_size).to(Constants.DEVICE)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=Constants.LR)
        # Replay memory
        self.memory = ReplayBuffer(action_size, Constants.BUFFER_SIZE, Constants.BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        #######################################

    def display(self, screen):
        pygame.draw.circle(screen, self.colour, (int(self.x), int(self.y)),
                           self.size, self.thickness)

    def move(self):
        self.x += math.sin(self.angle) * self.speed
        self.y -= math.cos(self.angle) * self.speed
        self.speed *= self.drag

    def bounce(self, soccerfield):
        if self.x > Constants.SIZE_WIDTH - self.size:
            self.x = 2 * (Constants.SIZE_WIDTH - self.size) - self.x
            self.angle = -self.angle
            self.speed *= Constants.ELASTICITY
        elif self.x < self.size:
            self.x = 2 * self.size - self.x
            self.angle = -self.angle
            self.speed *= Constants.ELASTICITY

        if self.y > Constants.SIZE_HEIGHT - self.size:
            self.y = 2 * (Constants.SIZE_HEIGHT - self.size) - self.y
            self.angle = math.pi - self.angle
            self.speed *= Constants.ELASTICITY
        elif self.y < self.size:
            self.y = 2 * self.size - self.y
            self.angle = math.pi - self.angle
            self.speed *= Constants.ELASTICITY

        if self.x > int((19 * Constants.SIZE_WIDTH) / 20):
            if int(self.y + self.size) == int(Constants.SIZE_HEIGHT / 3):
                self.y = 2 * (Constants.SIZE_HEIGHT / 3 - self.size) - self.y - 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
            elif int(self.y + self.size) == int(2 * Constants.SIZE_HEIGHT / 3):
                self.y = 2 * self.size - self.y + 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
        elif self.x < int(Constants.SIZE_WIDTH / 20):
            if int(self.y + self.size) == int(Constants.SIZE_HEIGHT / 3):
                self.y = 2 * (Constants.SIZE_HEIGHT / 3 - self.size) - self.y - 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
            elif int(self.y + self.size) == int(2 * Constants.SIZE_HEIGHT / 3):
                self.y = 2 * self.size - self.y + 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY

        for i in range(4):
            dx = self.x - soccerfield.goalposts[i].x
            dy = self.y - soccerfield.goalposts[i].y
            dist = math.hypot(dx, dy)
            if dist < self.size + soccerfield.goalposts[i].size:
                angle = math.atan2(dy, dx) + 0.5 * math.pi
                total_mass = self.mass + 9999
                (self.angle, self.speed) = self.addVectors(
                    self.angle, self.speed * (self.mass - 9999) / total_mass, angle, 0)
                self.speed *= Constants.ELASTICITY
                overlap = 0.5 * (self.size + soccerfield.goalposts[i].size - dist + 1)
                self.x += math.sin(angle) * overlap
                self.y -= math.cos(angle) * overlap
                break

    '''
    0 -> shoot
    1 -> up + left
    2 -> up + right
    3 -> down + left
    4 -> down + right
    5 -> up
    6 -> down
    7 -> left
    8 -> right
    '''
    def update(self, action, ball):
        if action == 0 and self.control_ball(ball):
            dx = -(self.x - ball.x) / 6
            dy = -(self.y - ball.y) / 6
            ball.angle = 0.5 * math.pi + math.atan2(dy, dx)
            ball.speed = math.hypot(dx, dy)
        if action == 1:
            dx = -Constants.UPDATE_DOUBLE_DXY
            dy = -Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 2:
            dx = Constants.UPDATE_DOUBLE_DXY
            dy = -Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 3:
            dx = -Constants.UPDATE_DOUBLE_DXY
            dy = Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 4:
            dx = Constants.UPDATE_DOUBLE_DXY
            dy = Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 5:
            dx = 0
            dy = -Constants.UPDATE_SINGLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 6:
            dx = 0
            dy = Constants.UPDATE_SINGLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 7:
            dx = -Constants.UPDATE_SINGLE_DXY
            dy = 0
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 8:
            dx = Constants.UPDATE_SINGLE_DXY
            dy = 0
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)

    def control_ball(self, ball):
        dx = self.x - ball.x
        dy = self.y - ball.y
        dist = math.hypot(dx, dy)
        if dist - 3 < self.size + ball.size:
            return True
        return False

    def addVectors(self, angle1, length1, angle2, length2):
        x = math.sin(angle1) * length1 + math.sin(angle2) * length2
        y = math.cos(angle1) * length1 + math.cos(angle2) * length2
        angle = 0.5 * math.pi - math.atan2(y, x)
        length = math.hypot(x, y)
        return (angle, length)

    ###################
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % Constants.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > Constants.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, Constants.GAMMA)

    def act(self, state, eps=0.):
        # Returns actions for given state as per current policy.
        state = torch.from_numpy(state).float().unsqueeze(0).to(Constants.DEVICE)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, Constants.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
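# A small standalone sketch (not part of the original code) illustrating the
# soft-update rule θ_target = τ*θ_local + (1 - τ)*θ_target used above: with a
# small τ the target parameters drift slowly toward the local ones.
import torch

tau = 1e-3
theta_local = torch.tensor([1.0, 2.0, 3.0])
theta_target = torch.tensor([0.0, 0.0, 0.0])

for _ in range(1000):
    theta_target = tau * theta_local + (1 - tau) * theta_target

# After 1000 blended updates the target has covered roughly
# 1 - (1 - tau)**1000 ≈ 63% of the gap to the local parameters.
print(theta_target)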
class AgentDDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.mu = 0
        self.theta = 0.15
        self.sigmaStart = 0.5
        self.sigmaEnd = 0.1
        self.decayExponent = 0.01
        self.noise = OUNoise(self.action_size, self.mu, self.theta,
                             self.sigmaStart, self.sigmaEnd, self.decayExponent)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0001  # for soft update of target parameters
        self.learningRateActor = 0.00005
        self.learningRateCritic = 0.0005
        self.dropoutActor = 0.1
        self.dropoutCritic = 0.1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learningRate=self.learningRateActor,
                                 dropoutRate=self.dropoutActor)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learningRate=self.learningRateActor,
                                  dropoutRate=self.dropoutActor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   learningRate=self.learningRateCritic,
                                   dropoutRate=self.dropoutCritic, l2Lambda=1e-2)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learningRate=self.learningRateCritic,
                                    dropoutRate=self.dropoutCritic, l2Lambda=1e-2)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.rewardSum = 0

    def reset_episode(self):
        self.rewardSum = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.rewardSum += reward
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        noise = self.noise.sample()
        return list(action + noise), noise  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
def train(self, sess, actor, critic, actor_noise, buffer_size, minibatch_size):
    # Set up summary Ops
    summary_ops, summary_vars = self.build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter("./results", sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(buffer_size), int(1234))

    for i in range(self.max_episodes):
        # s = env.reset()
        self.pub2.publish()
        # print("reset called")
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(self.episode_length):
            if j == 0:
                # print("first round")
                s, R = self.getstate([0, 0])
                R = 0
                self.lstate = s
                print(R)
                continue

            # Added exploration noise
            # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            # ******************************** wait for state here for the first pass or init
            # ******************************* interact here with environment
            # perform action and wait for new state
            # a = np.array([0.1, 0])
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
            # print(a[0])
            s2, R = self.getstate(a[0])
            print(R)

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), R,
                              self.terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(minibatch_size):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(minibatch_size))

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(minibatch_size)):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (int(minibatch_size), 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # s = s2
            s = s2
            self.lstate = s
            ep_reward += R

            if self.terminal == 1:
                self.terminal = 0
                # print("terminal!!!!!!!!!")
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, (ep_ave_max_q / float(j))))
                break
class stateMsg():
    def __init__(self):
        # self.moveNet = moveCNN()
        self.state = np.zeros(14)
        self.lstate = np.zeros(14)
        self.stateT = torch.FloatTensor()
        self.sub = rospy.Subscriber('/state', Floats, self.fetch)
        self.pub = rospy.Publisher('/cmd_vel_mux/input/navi', Twist, queue_size=10)
        self.fpass = 1
        self.move_cmd = Twist()
        self.move_cmd.linear.x = 0
        self.move_cmd.angular.z = 0
        self.max_episodes = 20000
        self.episode_length = 20  # maybe change later
        self.num_episodes = 0
        self.terminal = 0
        self.rBuf = ReplayBuffer()

    def train(self, states):
        states = torch.unsqueeze(states, 0)
        X = Variable(states.clone().cpu())
        actions = actor.forward(X)
        Q = critic.forward(X, actions)
        return Q, actions

    def fetch(self, msg):
        if self.num_episodes < self.max_episodes:
            self.state = np.array(msg.data)
            self.state = np.concatenate((self.state,
                                         np.array([self.move_cmd.linear.x,
                                                   self.move_cmd.angular.z])))
            self.stateT = torch.from_numpy(self.state).type(dtype)
            # Q, actions = self.train(self.stateT)
            states = torch.unsqueeze(self.stateT, 0)
            X = Variable(states.clone().cpu())
            print(X)
            actions = actor.forward(X)
            action = actions.data.numpy()
            self.move_cmd.linear.x = action[0][0]
            self.move_cmd.angular.z = action[0][1]
            self.pub.publish(self.move_cmd)
            if self.fpass == 0:
                R = self.reward()
                self.rBuf.add(self.lstate, action, R, self.terminal, self.state)
                if self.rBuf.size() > 5:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = self.rBuf.sample_batch(5)
                    # Q = critic.forward(X, actions)
            if self.fpass == 1:
                self.fpass = 0
            self.lstate = self.state

    def reward(self):
        dist = self.state[10]
        ldist = self.lstate[10]
        # print(dist)
        if dist < 0.2:
            R = 10
            self.terminal = 1
            self.num_episodes += 1
        elif dist == 1234:
            R = -100
            self.terminal = 1
            self.num_episodes += 1
            # print("hit")
        else:
            R = 0.1 * (ldist - dist)
        return R
class DDPGController(object):
    """docstring for DDPG"""

    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,
                                                         action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions up to the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def initial_train(self, mini_batch):
        state_batch = np.asarray([data[0] for data in mini_batch])
        action_batch = np.asarray([data[1] for data in mini_batch])
        action_label_batch = np.asarray([data[2] for data in mini_batch])
        value_label_batch = np.asarray([data[3] for data in mini_batch])
        done_batch = np.asarray([data[4] for data in mini_batch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        action_label_batch = np.resize(action_label_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        y_batch = []
        for i in range(len(mini_batch)):
            y_batch.append(value_label_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        critic_cost = self.critic_network.train(y_batch, state_batch, action_label_batch)

        # Update the actor policy using the sampled gradient:
        # action_batch_for_gradients = self.actor_network.actions(state_batch)
        # q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        # self.actor_network.train(q_gradient_batch, state_batch)
        action_cost = self.actor_network.initial_train(
            action_label_batch=action_label_batch, state_batch=state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

        return critic_cost, action_cost

    def save_model(self, path, check_point):
        self.model_saver.save(self.sess, path + 'DDPGControllerModel.ckpt',
                              global_step=check_point)
        print("Model saved at " + path + 'model.ckpt')

    def load_model(self, path):
        self.model_saver.restore(self.sess, path)
        print("Model loaded at " + path)
class Agent:
    """ Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, batch_size, buffer_size, gamma, lr):
        """ Initialize an Agent.

        @param state_size: (int) dimension of each state (= n)
        @param action_size: (int) dimension of each action (= n), select maximum as action
        @param batch_size: (int) mini-batch size
        @param buffer_size: replay-buffer size
        @param gamma: discount factor
        @param lr: learning rate
        """
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.state_goal_size = 2 * state_size  # state+goal = 2n
        self.action_size = action_size
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.lr = lr

        # Q-Network
        self.qnetwork_local = QNetwork(self.state_goal_size, action_size).to(self.device)
        self.qnetwork_target = QNetwork(self.state_goal_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size)

    def store_episode(self, states, actions, rewards, next_states, dones):
        """ Store episode to replay buffer for standard experience replay.

        @param states: (list of dicts) containing 'obs' and 'goal' (is stored as s||g in memory)
        @param actions: list of actions in episode
        @param rewards: list of rewards received in episode
        @param next_states: list of next states (is stored as ns||g in memory)
        @param dones: boolean indicating end of episode
        """
        # normal experience replay, store experiences
        state_goals = [np.concatenate([i['obs'], i['goal']]) for i in states]
        next_state_goals = [np.concatenate([i['obs'], i['goal']]) for i in next_states]
        for (sg, a, r, nsg, d) in zip(state_goals, actions, rewards, next_state_goals, dones):
            self.memory.add(sg, a, r, nsg, d)

    def store_episode_HER(self, states, actions, next_states, replay_strategy='final', k=4):
        """ Store episode with HER samples if replay_strategy is set to 'final', 'future' or 'episode'.

        @param states: (list of dicts) containing 'obs' and 'goal' (is stored as s||g in memory)
        @param actions: list of actions
        @param next_states: list of next states (is stored as ns||g in memory)
        @param replay_strategy: which HER relabelling strategy adds samples to the buffer
        @param k: number of goals in one episode for HER
        """
        T = len(actions)
        n_bits = len(states[0]['obs'])

        if replay_strategy == 'final':
            # HER 'final' replay strategy ---------------------------------------------------------
            # substitute goal as final state of episode
            goal_her = next_states[-1]['obs']
            for t in range(T):
                state_goal = np.concatenate((states[t]['obs'], goal_her))
                next_state_goal = np.concatenate((next_states[t]['obs'], goal_her))
                # recompute reward and done
                done = np.sum(np.array(next_states[t]['obs']) == np.array(goal_her)) == n_bits
                reward = 0 if done else -1
                self.memory.add(state_goal, actions[t], reward, next_state_goal, done)

        if replay_strategy == 'future':
            # HER 'future' replay strategy ---------------------------------------------------------
            for t in range(T):
                for _ in range(k):
                    # select random index from future experience in episode
                    future_idx = np.random.randint(t, T)
                    # set goal as next_state from future index
                    goal_her = next_states[future_idx]['obs']
                    state_goal = np.concatenate([states[t]['obs'], goal_her])
                    next_state_goal = np.concatenate([next_states[t]['obs'], goal_her])
                    # recompute reward and done
                    done = np.sum(np.array(next_states[t]['obs']) == np.array(goal_her)) == n_bits
                    reward = 0 if done else -1
                    self.memory.add(state_goal, actions[t], reward, next_state_goal, done)

        if replay_strategy == 'episode':
            # HER 'episode' replay strategy ---------------------------------------------------------
            for t in range(T):
                for _ in range(k):
                    # select random index from current episode
                    episode_idx = np.random.randint(0, T)
                    # set goal as random (next) state in episode
                    goal_her = next_states[episode_idx]['obs']
                    state_goal = np.concatenate([states[t]['obs'], goal_her])
                    next_state_goal = np.concatenate([next_states[t]['obs'], goal_her])
                    # recompute reward and done
                    done = np.sum(np.array(next_states[t]['obs']) == np.array(goal_her)) == n_bits
                    reward = 0 if done else -1
                    self.memory.add(state_goal, actions[t], reward, next_state_goal, done)

    def act(self, state_goal, eps=0.):
        """ Returns actions for given state as per current policy

        @param state_goal: (array_like) current state
        @param eps: (float) epsilon, for epsilon-greedy action selection
        @return: (int) action is the index of the bit to flip, value in [0, n-1]
        """
        state_goal = torch.from_numpy(state_goal).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_goal)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """ Update value parameters using given batch of experience tuples."""
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            # compute and minimize the loss
            state_goals, actions, rewards, next_state_goals, dones = experiences
            # update rule
            Q_targets = rewards + \
                self.gamma * self.qnetwork_target(next_state_goals).max(1)[0].unsqueeze(1) * (1 - dones)
            Q_expected = self.qnetwork_local(state_goals).gather(1, actions)
            # MSE loss
            loss = F.mse_loss(Q_expected, Q_targets)
            # optimization
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target

        @param local_model: local pytorch model
        @param target_model: target pytorch model
        @param tau: soft update of target network, 1-tau = polyak coefficient
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
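# A minimal usage sketch (not part of the original code) showing how the HER agent
# above could be driven on a bit-flipping task. BitFlipEnv and its step/reset
# interface are hypothetical, illustrative names only.
import numpy as np

n_bits = 8
env = BitFlipEnv(n_bits)  # hypothetical environment returning dicts with 'obs' and 'goal'
agent = Agent(state_size=n_bits, action_size=n_bits,
              batch_size=64, buffer_size=int(1e5), gamma=0.98, lr=1e-3)

for episode in range(200):
    state = env.reset()
    states, actions, rewards, next_states, dones = [], [], [], [], []
    for t in range(n_bits):
        sg = np.concatenate([state['obs'], state['goal']])
        action = agent.act(sg, eps=0.1)
        next_state, reward, done = env.step(action)  # assumed (state, reward, done) return
        states.append(state); actions.append(action); rewards.append(reward)
        next_states.append(next_state); dones.append(done)
        state = next_state
        if done:
            break
    # store the real transitions plus the relabelled HER transitions, then update
    agent.store_episode(states, actions, rewards, next_states, dones)
    agent.store_episode_HER(states, actions, next_states, replay_strategy='future', k=4)
    agent.learn()
    agent.soft_update(agent.qnetwork_local, agent.qnetwork_target, tau=1e-3)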
def train(sess, env, args, actor, critic, actor_noise):
    # Load ckpt file
    if args['load_ckpts']:
        print("Loading checkpoints")
        loader = tf.compat.v1.train.Saver()
        if args['ckpts_file'] is not None:
            ckpt = args['ckpts_dir'] + '/' + args['ckpts_file']
        else:
            ckpt = tf.train.latest_checkpoint(args['ckpts_dir'])
        loader.restore(sess, ckpt)
        sys.stdout.write('%s restored.\n\n' % ckpt)
        sys.stdout.flush()
        ckpt_split = ckpt.split('-')
        train_ep = ckpt_split[-1]
    else:
        print("Starting new training")
        sess.run(tf.compat.v1.global_variables_initializer())
        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()
        train_ep = 0

    # Define saver for saving model ckpts
    model_name = str(env) + '.ckpt'
    checkpoint_path = os.path.join(args['ckpts_dir'], model_name)
    if not os.path.exists(args['ckpts_dir']):
        os.makedirs(args['ckpts_dir'])
    saver = tf.compat.v1.train.Saver()

    # Setup Summary
    summary_ops, summary_vars = build_summaries()

    # sess.run(tf.compat.v1.global_variables_initializer())
    # Initialize target network weights
    # actor.update_target_network()
    # critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    for i in range(int(train_ep) + 1, int(args['max_episodes']) + 1):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            # Add exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, _ = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r, terminal,
                              np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                # Find argmax q value of the current episode
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r
            csv_write = [i, ep_reward, ep_ave_max_q]

            if terminal:
                if summary_ops is not None:
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: ep_reward,
                        summary_vars[1]: ep_ave_max_q / float(j)
                    })
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, (ep_ave_max_q / float(j))))
                break

        if i % int(args['ckpts_step']) == 0:
            saver.save(sess, checkpoint_path, i)
            sys.stdout.write('Checkpoint saved \n')
            sys.stdout.flush()

        with open('result/rewards.csv', mode='a', newline='') as output_file:
            output_writer = csv.writer(output_file, lineterminator='\n')
            output_writer.writerow(csv_write)
class Agent_DQN:
    def __init__(self, action_size, state_size, learning_rate=0.01,
                 discount_factor=0.9, epsilon_initial=1, epsilon_decay=0.995,
                 batch_size=32):
        # Set the various member variables in the constructor
        self.action_size = action_size
        self.state_size = state_size

        # Set the learning rate and the global step.
        self.global_step = tf.Variable(0, trainable=False)
        # decayed_learning_rate = learning_rate *
        #                         decay_rate ^ (global_step / decay_steps)
        self.learning_rate = tf.train.exponential_decay(learning_rate,
                                                        self.global_step,
                                                        100, 0.9999,
                                                        staircase=False,
                                                        name='learning_rate')
        # Also set the discount factor.
        self.gamma = discount_factor
        # Exploration is epsilon-greedy, so set epsilon and its decay rate.
        self.epsilon = epsilon_initial
        self.epsilon_decay = epsilon_decay
        # Set the batch size.
        self.batch_size = batch_size
        self.learning_iteration = 0

        # Define the replay memory. It must store (s, a, r, s_) tuples, so it needs
        # storage for s and s_ as well as for a and r.
        self.memory_size = 2000
        self.replayBuffer = ReplayBuffer(self.memory_size)

        # Define two networks; one of them is used as the fixed Q-target.
        self.build_evaluation_network()
        self.build_target_network()

        # Collect the parameters of the target net and the eval net; they can be
        # gathered through their parent variable scopes.
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='tn')
        self.t_params = t_params
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='en')
        # tf.assign lets us copy one tensor variable's value into another.
        self.replace_target_op = [
            tf.assign(t, (1 - 0.03) * t + 0.03 * e)
            for t, e in zip(t_params, e_params)
        ]

        # Create the session.
        self.sess = tf.Session()
        # Run the variable initializer.
        self.sess.run(tf.global_variables_initializer())

        self.loss_history = []
        self.learning_rate_history = []

    def build_evaluation_network(self):
        '''
        Unlike the target net, the eval net also needs the ops that compute the loss.
        The target net exists only to provide the fixed Q-target and is never updated,
        so only this eval net is built with trainable=True.
        :return:
        '''
        # Placeholder for the data fed into the evaluation net.
        self.eval_input = tf.placeholder(tf.float32, [None, self.state_size],
                                         name='eval_input')
        # self.y and self.a are placeholders used to compute the loss.
        self.y = tf.placeholder(tf.float32, [None], name='Q_target')
        self.a = tf.placeholder(tf.int64, [None], name='action')

        # The actual network
        with tf.variable_scope('en'):
            hidden1 = tf.layers.dense(
                self.eval_input, 10, activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer1', trainable=True)
            self.q_eval = tf.layers.dense(
                hidden1, self.action_size, activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer2', trainable=True)

        # Loss computation
        with tf.variable_scope('loss'):
            self.a_one_hot = tf.one_hot(self.a, depth=self.action_size)
            self.q_predict = tf.reduce_sum(tf.multiply(self.q_eval, self.a_one_hot), axis=1)
            self.loss = tf.reduce_mean(tf.squared_difference(self.y, self.q_predict))

        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.learning_rate) \
                .minimize(self.loss, global_step=self.global_step)

    def build_target_network(self):
        self.target_input = tf.placeholder(tf.float32, [None, self.state_size],
                                           name='target_input')
        with tf.variable_scope('tn'):
            hidden1 = tf.layers.dense(
                self.target_input, 10, activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer1', trainable=False)
            self.get_q_target = tf.layers.dense(
                hidden1, self.action_size, activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer2', trainable=False)

    def store_transition(self, s, a, r, s_):
        self.replayBuffer.add(s, a, r, s_)

    def get_action(self, observation):
        '''
        x : cart position
        dx/dt : cart velocity
        θ : pole angle
        dθ/dt : pole angular velocity
        Depending on epsilon, this function returns either the action chosen by the
        neural network or a random action.
        '''
        if np.random.uniform() > self.epsilon:
            actions_value = self.sess.run(self.q_eval,
                                          feed_dict={self.eval_input: [observation]})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.action_size)
        return action

    def learn(self):
        '''
        Function in which the neural network update takes place.
        '''
        # Learn only once the memory has been sufficiently filled; otherwise skip learning.
        if self.learning_iteration >= self.memory_size:
            # Blend the eval net parameters into the fixed Q-target at the chosen rate.
            self.sess.run(self.replace_target_op)

            batch = self.replayBuffer.get_batch(self.batch_size)
            batch_s = np.asarray([x[0] for x in batch])
            batch_a = np.asarray([x[1] for x in batch])
            batch_r = np.asarray([x[2] for x in batch])
            batch_s_ = np.asarray([x[3] for x in batch])

            # q_eval is used for the current Q values; get_q_target is used for the
            # Q values inside the max operator of the target.
            get_q_target, q_eval = self.sess.run(
                [self.get_q_target, self.q_eval],
                feed_dict={
                    self.target_input: batch_s_,  # fixed params
                    self.eval_input: batch_s,     # newest params
                })

            # The action is stored right after the state in the batch memory, so take that value.
            a = batch_a
            # The reward was stored after the action, so take the following value.
            reward = batch_r

            # Build the value fed into the self.y placeholder from the quantities above.
            _, self.loss_out = self.sess.run(
                [self._train_op, self.loss],
                feed_dict={
                    self.eval_input: batch_s,
                    self.y: reward + self.gamma * np.max(get_q_target, axis=1),
                    self.a: a
                })
            self.loss_history.append(self.loss_out)

            # For epsilon-greedy exploration, epsilon has to be decayed over time.
            self.epsilon = self.epsilon * self.epsilon_decay

        # Count learning iterations and record the learning rate history for plotting.
        self.learning_iteration += 1
        self.learning_rate_history.append(self.sess.run([self.learning_rate]))

    def plot_loss(self):
        # The plot can be rendered with the Times New Roman font.
        plt.title('History')
        ms = 0.1
        me = 1
        line_width = 0.5
        plt.ylabel('Loss')
        plt.xlabel('Training steps')
        plt.plot(np.arange(len(self.loss_history)), self.loss_history, '--^',
                 color='r', markevery=me, label=r'critic loss',
                 lw=line_width, markersize=ms)
        plt.grid()
        ax = plt.subplot(111)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.ylim(0, 2)
        plt.show()

    def plot_reward(self, reward_history):
        plt.plot(np.arange(len(reward_history)), reward_history)
        plt.grid()
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.show()
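# A minimal training-loop sketch (not part of the original code) for the Agent_DQN
# class above, assuming the classic CartPole-v1 environment and the pre-0.26 gym
# step/reset API; the env name and episode count are illustrative assumptions.
import gym

env = gym.make('CartPole-v1')
agent = Agent_DQN(action_size=env.action_space.n,
                  state_size=env.observation_space.shape[0])

reward_history = []
for episode in range(300):
    s = env.reset()
    episode_reward = 0
    done = False
    while not done:
        a = agent.get_action(s)
        s_, r, done, info = env.step(a)
        agent.store_transition(s, a, r, s_)
        agent.learn()  # only trains once the warm-up iteration count is reached
        s = s_
        episode_reward += r
    reward_history.append(episode_reward)

agent.plot_loss()
agent.plot_reward(reward_history)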
class Agent():
    def __init__(self, q_network, buffer_size, batch_size, update_every,
                 gamma, tau, lr, seed):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.qnetwork_local = copy.deepcopy(q_network)
        self.qnetwork_target = copy.deepcopy(q_network)
        self.seed = random.seed(seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)
        self.temperature = 1
        self.t_step = 0
        ########################
        self.qnetwork_local = self.qnetwork_local.to(device)
        self.qnetwork_target = self.qnetwork_target.to(device)

    def get_Q(self, state):
        return self.qnetwork_local.Q(state)

    def reset_memory(self):
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.seed)

    def predict_option_termination(self, state, current_option):
        state = torch.tensor(state).float().to(device)
        state = self.qnetwork_local.state(state)
        termination = self.qnetwork_local.terminations(state).softmax(dim=-1)
        termination = termination[current_option]
        # termination = self.qnetwork_local.terminations(state)[current_option].sigmoid()
        option_termination = Bernoulli(termination).sample()
        Q = self.get_Q(state)
        next_option = Q.argmax(dim=-1)
        return bool(option_termination.item()), next_option.item()

    def get_terminations(self, state):
        return self.qnetwork_local.terminations(state).softmax(dim=-1)

    def greedy_option(self, state):
        state = to_tensor(state).to(device)
        state = self.qnetwork_local.state(state)
        Q = self.get_Q(state)
        return Q.argmax(dim=-1).item()

    def step(self, state, current_option, reward, next_state, done, logp, entropy):
        self.memory.add(state, current_option, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if len(self.memory) > BATCH_SIZE:
            if self.t_step == 0:
                actor_loss_val = actor_loss(state, current_option, logp, entropy,
                                            reward, done, next_state,
                                            self.qnetwork_local, self.qnetwork_target)
                loss = actor_loss_val
                samples = self.memory.sample()
                self.learn(samples, self.gamma, loss)  # critic loss [td error]

    def act(self, state, eps, option, eval_mode=True, pa=[]):
        state = to_tensor(state).to(device)
        state = self.qnetwork_local.state(state)
        logits = state @ self.qnetwork_local.options_W[option].to(device) \
            + self.qnetwork_local.options_b[option].to(device)
        if eval_mode:
            for i in range(len(logits)):
                if i not in pa:
                    logits[i] = -float("inf")
        action_dist = (logits / self.temperature).softmax(dim=-1)  # high temp makes softmax output closer
        action_dist = Categorical(action_dist)  # like multinomial dist
        action = action_dist.sample()
        if not pa:
            action = torch.randint(0, NUM_LINES * NUM_LINES, (1,))
            logp = 0
            entropy = 0
            return action, logp, entropy
        # action = torch.argmax(logits)
        logp = action_dist.log_prob(action)
        entropy = action_dist.entropy()
        return action.item(), logp, entropy
        # for test you need to write in the script itself to choose epsilon option

    def learn(self, samples, gamma, loss):
        states, options, rewards, next_states, dones = samples
        critic_loss_val = critic_loss(self.qnetwork_local, self.qnetwork_target, samples)
        loss += critic_loss_val
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.qnetwork_local.to('cpu')
        self.qnetwork_target.to('cpu')
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)
        self.qnetwork_local.to(device)
        self.qnetwork_target.to(device)

    # return buffer_size, batch_size, update_every, gamma, tau
    def get_stats(self):
        return (self.buffer_size, self.batch_size, self.update_every,
                self.gamma, self.tau, self.lr)
class Agent():
    def __init__(self, q_network, buffer_size, batch_size, update_every,
                 gamma, tau, lr, seed):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.qnetwork_local = copy.deepcopy(q_network)
        self.qnetwork_target = copy.deepcopy(q_network)
        self.seed = random.seed(seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)
        self.t_step = 0

    def reset_memory(self):
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.seed)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                samples = self.memory.sample()
                self.learn(samples, self.gamma)

    def act(self, state, eps=0, eval_mode=True, pa=[]):
        state = torch.tensor(state).float()
        with torch.no_grad():
            action_values = self.qnetwork_local(state).numpy()
        if eval_mode:
            if random.random() > eps:
                for i in range(len(action_values)):
                    if i not in pa:
                        action_values[i] = -float("inf")
                return np.argmax(action_values)
            else:
                return random.choice(pa)
        else:
            if random.random() > eps:
                return np.argmax(action_values)
            else:
                return random.choice(range(len(action_values)))

    def learn(self, samples, gamma):
        states, actions, rewards, next_states, dones = samples
        q_values_next_states = self.qnetwork_target.forward(next_states).max(dim=1)[0]  # .unsqueeze(1)
        targets = rewards + (gamma * q_values_next_states * (1 - dones))
        q_values = self.qnetwork_local.forward(states)
        actions = actions.view(actions.size()[0], 1)
        predictions = torch.gather(q_values, 1, actions).view(actions.size()[0])
        loss = F.mse_loss(predictions, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    # return buffer_size, batch_size, update_every, gamma, tau
    def get_stats(self):
        return (self.buffer_size, self.batch_size, self.update_every,
                self.gamma, self.tau, self.lr)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, fc_units=FC_UNITS).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, fc_units=FC_UNITS).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Collect experience and learn from it.

        Params
        ======
            state (array_like): current state
            action (int): current action
            reward (float): current reward
            next_state (array_like): next state
            done (bool): is episode over?
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if np.random.random() > eps:
            return int(np.argmax(action_values.cpu().data.numpy()))
        else:
            return np.random.randint(self.action_size)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent_DDPG(object):
    def __init__(self, action_size, state_size, action_limit):
        self.memory_size = 10000
        self.replayBuffer = ReplayBuffer(self.memory_size)
        self.sess = tf.Session()

        self.discount_factor = 0.9
        self.action_variance = 3
        self.critic_learning_rate = 0.001
        self.actor_learning_rate = 0.002
        self.batch_size = 32
        self.action_size, self.state_size, self.action_limit = action_size, state_size, action_limit

        self.input_state = tf.placeholder(tf.float32, [None, state_size], 's')
        self.input_state_ = tf.placeholder(tf.float32, [None, state_size], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.input_state, scope='eval', trainable=True)
            a_ = self.build_actor_network(self.input_state_, scope='tar', trainable=False)
        with tf.variable_scope('Critic'):
            q_eval = self.build_critic_network(self.input_state, self.a,
                                               scope='eval', trainable=True)
            q_target = self.build_critic_network(self.input_state_, a_,
                                                 scope='target', trainable=False)

        self.actor_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.actor_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/tar')
        self.critic_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.critic_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/tar')

        self.replace = [
            tf.assign(t, (1 - 0.01) * t + 0.01 * e)
            for t, e in zip(self.actor_target_params + self.critic_target_params,
                            self.actor_evaluation_params + self.critic_evaluation_params)
        ]

        '''
        dJ/dtheta = E[ dQ/dtheta ]
        '''
        # The actor loss simply maximizes the value flowing down from Q (see the paper).
        self.a_loss = tf.reduce_mean(q_eval)  # maximize the q
        # Because we want to maximize Q, the learning rate gets a '-' sign.
        self.atrain = tf.train.AdamOptimizer(-self.actor_learning_rate).minimize(
            tf.reduce_mean(q_eval), var_list=self.actor_evaluation_params)

        # When self.ctrain is run, the batch actions are fed into self.a.
        # Note that values are fed directly into self.a, which is not a placeholder!
        # The critic is updated with (s, a, r, s_): the y computed by the textbook rule
        # is the true label, and the network's output is our prediction.
        # True Label, y = r(s,u_t(s)) + gamma*Q(s_, u_t(s_))
        q_true = self.R + self.discount_factor * q_target
        # Prediction, Q = q_eval
        # Computing the MSE loss requires q_eval, so self.input_state must be fed;
        # computing q_true also requires feeding self.R and self.input_state_ for q_target.
        self.mseloss = tf.losses.mean_squared_error(labels=q_true, predictions=q_eval)
        # This loss updates only the critic net, so var_list is restricted to the
        # critic evaluation network.
        self.ctrain = tf.train.AdamOptimizer(self.critic_learning_rate).minimize(
            self.mseloss, var_list=self.critic_evaluation_params)

        # After building the networks, always initialize the variables.
        self.sess.run(tf.global_variables_initializer())

        self.actor_loss_history = []
        self.critic_loss_history = []

    def store_transition(self, s, a, r, s_):
        self.replayBuffer.add(s, a, r, s_)

    def choose_action(self, s):
        return np.clip(
            np.random.normal(
                self.sess.run(self.a, {self.input_state: s[np.newaxis, :]})[0],
                self.action_variance), -2, 2)

    def learn(self):
        if self.replayBuffer.count() > self.batch_size:
            self.action_variance *= .9995
            self.sess.run(self.replace)

            batch = self.replayBuffer.get_batch(self.batch_size)
            batch_s = np.asarray([x[0] for x in batch])
            batch_a = np.asarray([x[1] for x in batch])
            batch_r = np.asarray([[x[2]] for x in batch])
            batch_s_ = np.asarray([x[3] for x in batch])

            actor_loss, _ = self.sess.run([self.a_loss, self.atrain],
                                          {self.input_state: batch_s})
            critic_loss, _ = self.sess.run(
                [self.mseloss, self.ctrain], {
                    self.input_state: batch_s,
                    self.a: batch_a,
                    self.R: batch_r,
                    self.input_state_: batch_s_
                })
            self.actor_loss_history.append(actor_loss)
            self.critic_loss_history.append(critic_loss)

    def build_actor_network(self, s, scope, trainable):
        actor_hidden_size = 30
        with tf.variable_scope(scope):
            hidden1 = tf.layers.dense(s, actor_hidden_size, activation=tf.nn.relu,
                                      name='l1', trainable=trainable)
            a = tf.layers.dense(hidden1, self.action_size, activation=tf.nn.tanh,
                                name='a', trainable=trainable)
            return tf.multiply(a, self.action_limit, name='scaled_a')

    def build_critic_network(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            critic_hidden_size = 30
            hidden1 = tf.layers.dense(s, critic_hidden_size, name='s1', trainable=trainable) \
                + tf.layers.dense(a, critic_hidden_size, name='a1', trainable=trainable) \
                + tf.get_variable('b1', [1, critic_hidden_size], trainable=trainable)
            hidden1 = tf.nn.relu(hidden1)
            return tf.layers.dense(hidden1, 1, trainable=trainable)

    def plot_loss(self):
        plt.title('history', fontsize=25)
        ms = 0.1
        me = 1
        line_width = 0.1
        plt.ylabel('Loss')
        plt.xlabel('Training steps')
        actor_loss_mean = sum(self.actor_loss_history) / len(self.actor_loss_history)
        self.actor_loss_history /= actor_loss_mean
        critic_loss_mean = sum(self.critic_loss_history) / len(self.critic_loss_history)
        self.critic_loss_history /= critic_loss_mean
        plt.plot(np.arange(len(self.actor_loss_history)), self.actor_loss_history,
                 '-p', color='b', markevery=me, label=r'actor loss',
                 lw=line_width, markersize=ms)
        plt.plot(np.arange(len(self.critic_loss_history)), self.critic_loss_history,
                 '--^', color='r', markevery=me, label=r'critic loss',
                 lw=line_width, markersize=ms)
        plt.grid()
        ax = plt.subplot(111)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.ylim(0, 10)
        plt.show()

    def plot_reward(self, reward_history):
        plt.plot(np.arange(len(reward_history)), reward_history)
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.device = Utils.getDevice()

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        self.steps = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        self.steps += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
            self.steps = 0

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the noise."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        self.update_critic(states, actions, rewards, next_states, dones, gamma)
        # ---------------------------- update actor ----------------------------- #
        self.update_actor(states)
        # ----------------------- update target networks ------------------------ #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_critic(self, states, actions, rewards, next_states, dones, gamma):
        """Update the critic.

        Params
        ======
            states: current state
            actions: actions to perform
            next_states: next state
            dones: episode finished
        """
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def update_actor(self, states):
        """Update the actor.

        Params
        ======
            states: current state
        """
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
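# A minimal interaction-loop sketch (not part of the original code) for the DDPG
# agent above, assuming a gym-style continuous-control environment with the
# pre-0.26 gym step/reset API; the env name and loop lengths are illustrative
# assumptions.
import gym
import numpy as np

env = gym.make('Pendulum-v1')
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=0)

for episode in range(100):
    state = env.reset()
    agent.reset()  # reset the OU noise between episodes
    episode_reward = 0
    for t in range(200):
        action = agent.act(state, add_noise=True)           # action in [-1, 1]
        scaled = action * env.action_space.high              # rescale to env range
        next_state, reward, done, info = env.step(scaled)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        if done:
            break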