def __init__(self, task, exploration_mu=0, exploration_theta=0.15,
             exploration_sigma=0.2, tau=0.01):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = exploration_mu
    self.exploration_theta = exploration_theta
    self.exploration_sigma = exploration_sigma
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 128  # 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.85  # 0.99  # discount factor
    self.tau = tau  # for soft update of target parameters

    # Score tracker
    self.total_reward = 0
    self.best_score = -np.inf
    self.score = 0
    self.count = 0
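# `ReplayBuffer(buffer_size, batch_size)` is constructed above but never defined
# in this section. A minimal sketch consistent with how the Keras agent below
# uses it (`add`, `sample`, `len`) follows; the `Experience` namedtuple matches
# the `e.state` / `e.action` access pattern in `learn()`. This is an assumed
# implementation, not necessarily the original one.
from collections import deque, namedtuple
import random

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest entries drop off first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # uniform random minibatch, as consumed by Agent.learn()
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)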
def train(sess, env, args, actor, critic, actor_noise):
    """Agent training loop.

    :param sess: TensorFlow session
    :param env: Gym environment
    :param args: dict of hyperparameters
    :param actor: actor network
    :param critic: critic network
    :param actor_noise: exploration-noise process
    """
    # Set up summary ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    for i in range(int(args['max_episodes'])):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            # Add exploration noise
            # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)),
                              r, terminal,
                              np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch-size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                steps = max(j, 1)  # guard against division by zero at step 0
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(steps)
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, ep_ave_max_q / float(steps)))
                break
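# `build_summaries()` is called above but not shown. A minimal TF1-style sketch
# that matches how `summary_ops` / `summary_vars` are fed at episode end
# (episode reward and average max Q); assumes the tf.summary v1 API.
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)

    # The variables are fed through feed_dict and the merged op is written
    # to the FileWriter in the training loop above.
    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars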
class Agent(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters # Score tracker self.best_score = -np.inf self.score = -np.inf def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state self.total_reward = 0.0 self.count = 0 return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state # Save experience / reward self.total_reward += reward self.count += 1 def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local); the trailing 0/1 is the Keras
        # learning-phase flag (0 = test, 1 = train)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # Track best score
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
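# Both agents above construct `OUNoise(size, mu, theta, sigma)` with `reset()`
# and `sample()`. A minimal sketch of an Ornstein-Uhlenbeck process under that
# interface; an assumed implementation, not necessarily the original.
import copy

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta  # strength of the pull back toward the mean
        self.sigma = sigma  # scale of the random kicks
        self.reset()

    def reset(self):
        """Reset internal state to the mean (called at episode start)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process: dx = theta*(mu - x) + sigma*N(0, 1)."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state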
class Agent():
    def __init__(self, state_size, action_size, actor, critic,
                 action_low=-1.0, action_high=1.0,
                 lrate_critic=10e-3, lrate_actor=10e-4, tau=0.001,
                 buffer_size=1e5, batch_size=64, gamma=0.99,
                 exploration_mu=0.0, exploration_theta=0.15, noise_decay=1.,
                 exploration_sigma=0.20, restore=None, weight_decay=0.,
                 update_every=1, update_repeat=1, seed=None):
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.seed = seed if seed else np.random.randint(100)
        self.lrate_critic = lrate_critic
        self.lrate_actor = lrate_actor
        self.tau = tau
        self.gamma = gamma
        self.restore = restore
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.update_every = update_every
        self.device = torch.device(DEVICE)
        self.weight_decay = weight_decay
        self.update_repeat = update_repeat
        self.noise_decay = noise_decay

        # actor networks
        self.actor = actor(state_size, action_size, low=action_low,
                           high=action_high, seed=self.seed)
        self.actor_target = actor(state_size, action_size, low=action_low,
                                  high=action_high, seed=self.seed)

        # critic networks
        self.critic = critic(state_size, action_size, seed=self.seed)
        self.critic_target = critic(state_size, action_size, seed=self.seed)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=DEVICE)
            self.actor.load_state_dict(checkpoint['actor'])
            self.actor_target.load_state_dict(checkpoint['actor'])
            self.critic.load_state_dict(checkpoint['critic'])
            self.critic_target.load_state_dict(checkpoint['critic'])

        # optimizers
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=lrate_actor,
                                    weight_decay=self.weight_decay)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=lrate_critic,
                                     weight_decay=self.weight_decay)

        # noise
        self.noise = OUNoise(action_size, exploration_mu,
                             exploration_theta, exploration_sigma)
        self.noise_scale = 1.0

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.device, self.buffer_size,
                                          self.batch_size)

        # reset agent for training
        self.reset_episode()
        self.it = 0

    def reset_episode(self):
        self.noise.reset()

    def act(self, state, learn=True):
        if not learn:
            self.actor.eval()
        with torch.no_grad():
            action = self.actor(self.tensor(state)).cpu().numpy()
        if learn:
            action += self.noise.sample() * self.noise_scale
            self.noise_scale = max(self.noise_scale * self.noise_decay, 0.01)
        self.actor.train()
        return np.clip(action, self.action_low, self.action_high)

    def save(self, path):
        dirn = os.path.dirname(path)
        if not os.path.exists(dirn):
            os.mkdir(dirn)
        params = {}
        params['actor'] = self.actor.state_dict()
        params['critic'] = self.critic.state_dict()
        torch.save(params, path)

    def step(self, state, action, reward, next_state, done):  # pylint: disable=line-too-long
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.it += 1
        if self.it < self.batch_size or self.it % self.update_every != 0:
            return
        for _ in range(self.update_repeat):
            self.learn()

    def learn(self):
        # learn from mini-batch of replay buffer
        state_b, action_b, reward_b, next_state_b, done_b = \
            self.replay_buffer.sample()

        # calculate TD target
        with torch.no_grad():
            y_b = reward_b.unsqueeze(1) + self.gamma * \
                self.critic_target(next_state_b, self.actor_target(next_state_b)) * \
                (1 - done_b.unsqueeze(1))

        # update critic
        critic_loss = F.smooth_l1_loss(self.critic(state_b, action_b), y_b)
        self.critic.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        action = self.actor(state_b)
        actor_loss = -self.critic(state_b, action).mean()
        self.actor.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # soft update networks
        # critic only if trained
        # actor always
        self.soft_update()

    def soft_update(self):
        """Soft update of target networks:
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def tensor(self, x):
        return torch.from_numpy(x).float().to(self.device)
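# A hypothetical driver loop for the PyTorch agent above, assuming a Gym-style
# environment; `env`, `ActorNet`, and `CriticNet` are illustrative stand-ins,
# not names from the source.
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              actor=ActorNet, critic=CriticNet, seed=42)

for episode in range(1000):
    state = env.reset()
    agent.reset_episode()
    episode_reward = 0.0
    while True:
        action = agent.act(state, learn=True)           # noisy action for exploration
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store and maybe learn
        state = next_state
        episode_reward += reward
        if done:
            break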
def train(self):
    writer = tf.summary.FileWriter(self.summary_path, self.sess.graph)

    self.actor.update_target_network()
    self.critic.update_target_network()

    num_episode = self.config['episode']
    batch_size = self.config['batch_size']
    gamma = self.config['gamma']

    # Seed NumPy and pass the seed value itself to the buffer
    # (np.random.seed() returns None, so it must not be used as the argument).
    np.random.seed(self.config['seed'])
    self.buffer = ReplayBuffer(self.config['buffer_size'], self.config['seed'])

    reward_set = []
    q_value_set = []
    loss_set = []

    for i in range(num_episode):
        info = self.env.reset()
        obs1, time_window, done = self.env.provider.f_step()
        ep_reward = 0
        ep_ave_max_q = 0
        ep_loss = 0

        for j in range(self.config['steps']):
            action0_ = info["weight"]
            action0 = np.expand_dims(action0_, axis=0)
            state1 = np.expand_dims(obs1, axis=0)
            norm_state1 = normalize_state(state1)

            action = self.actor.predict(input_num=state1.shape[0],
                                        state=norm_state1,
                                        previous_action=action0) + self.actor_noise()

            # step forward
            reward, info, done = self.env.f_step(obs1, action[0])
            state2 = np.expand_dims(info["obs2"], axis=0)
            norm_state2 = normalize_state(state2)

            # add the normalized states to the buffer
            self.buffer.add(norm_state1, action, reward, done, action0, norm_state2)

            obs1 = info["obs2"]
            ep_reward += reward

            # if self.buffer.size() >= batch_size:
            if True:  # batch update
                s_batch, a_batch, r_batch, t_batch, a0_batch, s2_batch = \
                    self.buffer.sample_batch(batch_size)

                # Calculate targets
                input_num = s2_batch.shape[0]
                target_action = self.actor.predict_target(input_num, s2_batch, a_batch)
                target_q = self.critic.predict_target(input_num, s2_batch,
                                                      a_batch, target_action)

                y_i = []
                for k in range(input_num):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value = np.reshape(y_i, (input_num, 1))
                critic_loss, q_value = self.critic.train(input_num, s_batch, a0_batch,
                                                         a_batch, predicted_q_value)
                ep_ave_max_q += np.amax(q_value)
                ep_loss += critic_loss

                # Update the actor policy using the sampled gradient
                a_out = self.actor.predict(input_num, s_batch, a0_batch)
                grads = self.critic.action_gradients(input_num, s_batch, a0_batch, a_out)
                self.actor.train(input_num, s_batch, a0_batch, grads[0])

                # Update target networks
                self.actor.update_target_network()
                self.critic.update_target_network()

            if done or j == self.config['steps'] - 1:
                steps = max(j, 1)  # guard against division by zero on step 0
                summary_str = self.sess.run(self.summary_ops, feed_dict={
                    self.summary_vars[0]: ep_reward,
                    self.summary_vars[1]: ep_ave_max_q / float(steps)})
                writer.add_summary(summary_str, i)
                writer.flush()

                reward_set.append(ep_reward)
                q_value_set.append(ep_ave_max_q / float(steps))
                loss_set.append(ep_loss / float(steps) * 100)

                print("-" * 96)
                print('Episode: {:d}, Reward: {:.4f}, Qmax: {:.4f}, loss: {:.4f}'.format(
                    i, ep_reward, ep_ave_max_q / float(steps), ep_loss / float(steps) * 100))
                break

    self.save_model()
    train_info = pd.DataFrame({"reward": reward_set,
                               "q_value": q_value_set,
                               "loss": loss_set})
    train_info.to_csv(self.train_info_path)
    print('Finish.')
    return train_info
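# `update_target_network()` is called in both TF1 training loops but never
# defined here. The usual graph-mode pattern builds the soft-update assign ops
# once and runs them on each call; a sketch under that assumption, with
# illustrative names (`build_soft_update_ops` is not from the source).
def build_soft_update_ops(local_vars, target_vars, tau):
    """Build theta_target <- tau*theta_local + (1 - tau)*theta_target assign ops."""
    return [t.assign(tau * l + (1.0 - tau) * t)
            for l, t in zip(local_vars, target_vars)]

# Built once after graph construction, then run on every update step:
#   update_ops = build_soft_update_ops(actor_vars, actor_target_vars, tau)
#   sess.run(update_ops)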
def train(sess, image_agent, continue_train=False):
    BUFFER_SIZE = 100000
    BATCH_SIZE = 128
    GAMMA = 0.9
    TAU = 0.001
    INIT_LRA = 0.000001
    INIT_LRC = 0.0001
    EPISODE_MAX_STEP = 5000
    # DECAY_RATE = 0.5
    # DECAY_STEP = 3000000
    # TOTAL_EPISODE = 30000
    TOTAL_EPISODE = 20000
    EXPLORE = 500000
    CURRENT_STEP = 0

    actor = ActorNetwork(sess, BATCH_SIZE, TAU, INIT_LRA)
    critic = CriticNetwork(sess, BATCH_SIZE, TAU, INIT_LRC)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    sess.graph.finalize()
    ou = OU()

    # if continue_train:
    #     # TODO: reload network and params
    #     pass

    buffer_follow = ReplayBuffer(BUFFER_SIZE)
    buffer_straight = ReplayBuffer(BUFFER_SIZE)
    buffer_left = ReplayBuffer(BUFFER_SIZE)
    buffer_right = ReplayBuffer(BUFFER_SIZE)
    buffer_dict = {0: buffer_follow, 1: buffer_left,
                   2: buffer_right, 3: buffer_straight}

    epsilon = 1.0
    env = Env("./log", "./data", image_agent)
    # env.reset()

    for i in range(TOTAL_EPISODE):
        try:
            ob = env.reset()
        except Exception:
            continue
        total_reward = 0
        episode_step = 0
        s_t = ob

        for j in range(EPISODE_MAX_STEP):
            if s_t is None or len(s_t) < 514:
                break  # invalid state cannot recover without a new env step
            epsilon -= 1.0 / EXPLORE
            image_input = s_t[0:-2]
            speed_input = s_t[-2:-1]
            # GO_STRAIGHT = 5.0, TURN_RIGHT = 4.0, TURN_LEFT = 3.0, LANE_FOLLOW = 2.0
            direction = s_t[-1:]
            branch_st = int(direction - 2)
            if branch_st == -2:  # REACH_GOAL = 0
                break

            a_t = np.zeros([1, 3])  # steer, throttle, brake
            noise_t = np.zeros([1, 3])
            a_t_pridect = actor.pridect_action(image_input, speed_input, branch_st)
            # OU noise per action dimension, each centered on its own
            # predicted component: function(x, mu, theta, sigma)
            noise_t[0][0] = max(epsilon, 0) * ou.function(a_t_pridect[0][0], 0, 0.6, 0.3)
            noise_t[0][1] = max(epsilon, 0) * ou.function(a_t_pridect[0][1], 0.5, 1, 0.1)
            noise_t[0][2] = max(epsilon, 0) * ou.function(a_t_pridect[0][2], -0.1, 1, 0.05)
            a_t = a_t_pridect + noise_t

            # if CURRENT_STEP < 10000 and j < 50:
            #     a_t[0][2] = 0
            #     a_t[0][1] = max(0.6, a_t[0][1])

            try:
                ob, r_t, done = env.step(a_t[0])
                s_t1 = ob
                if s_t1 is None or len(s_t1) < 514:
                    continue
                buffer_dict[branch_st].add(s_t, a_t[0], r_t, s_t1, done)
            except Exception:
                break

            # train actor and critic on a randomly chosen branch
            branch_to_train = random.choice([0, 1, 2, 3])
            if buffer_dict[branch_to_train].count() > BATCH_SIZE:
                train_ddpg(actor, critic, buffer_dict, BATCH_SIZE, branch_to_train)

            total_reward += r_t
            s_t = s_t1
            CURRENT_STEP += 1
            episode_step += 1
            if done:
                break

        print("buffer length:{},{},{},{}, total reward:{}, current_step:{}, total_step:{}".format(
            buffer_dict[0].count(), buffer_dict[1].count(),
            buffer_dict[2].count(), buffer_dict[3].count(),
            total_reward, episode_step, CURRENT_STEP))

        if np.mod(i, 2000) == 0:
            saver.save(sess, './model/ddpg_model')
            with open("./episode.txt", "w") as log:
                log.write("{},{}\n".format(i, epsilon))
            with open("./buffer.pkl", "wb") as buffer_log:
                pickle.dump(buffer_dict, buffer_log)
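# `ou.function(x, mu, theta, sigma)` above follows the per-dimension
# Ornstein-Uhlenbeck form used in TORCS-style DDPG examples; a minimal sketch
# under that assumption, not necessarily the original implementation.
class OU:
    """One Ornstein-Uhlenbeck noise term per call."""

    def function(self, x, mu, theta, sigma):
        # theta pulls the current action x toward mu; sigma adds Gaussian jitter
        return theta * (mu - x) + sigma * np.random.randn(1)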