def main(args):
    with tf.Session() as sess:
        target_pos = np.array([10., 10., 10.])
        init_pose = np.array([0., 0., 0.1, 0., 0., 0.])  # initial pose
        env = Takeoff_Task(init_pose, target_pos=target_pos)

        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))

        state_dim = env.state_size
        action_dim = env.action_size
        action_bound = env.action_high

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))
        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']))
        actor_noise = OUActionNoise(mu=np.zeros(action_dim), sigma=0.2)

        train(sess, env, args, actor, critic, actor_noise)
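
# None of the snippets in this section defines OUActionNoise, although all of
# them instantiate it. The class below is a minimal sketch of the standard
# Ornstein-Uhlenbeck exploration noise used with DDPG, not any of the original
# implementations; the theta/dt defaults and the get_noise alias (used by the
# pendulum training loop further down) are assumptions.
import numpy as np


class OUActionNoise:
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt)
             * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    # Some of the snippets call get_noise() instead of calling the instance.
    get_noise = __call__

    def reset(self):
        # Restart the process between episodes.
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)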

def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
             max_size=1000000, fc1_dims=400, fc2_dims=300, batch_size=64):
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.alpha = alpha  # actor learning rate
    self.beta = beta    # critic learning rate

    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.noise = OUActionNoise(mu=np.zeros(n_actions))

    self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                              n_actions=n_actions, name='actor')
    self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='critic')
    self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                     n_actions=n_actions, name='target_actor')
    self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                       n_actions=n_actions,
                                       name='target_critic')

    # tau=1 makes the first update a hard copy, so the target networks
    # start out identical to the online networks.
    self.update_network_parameters(tau=1)
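
# update_network_parameters is called above but not shown. A minimal sketch,
# assuming the networks are torch.nn.Module subclasses: Polyak-average the
# online weights into the targets, theta' <- tau * theta + (1 - tau) * theta'.
# Passing tau=1, as in __init__, reduces this to a hard copy.
def update_network_parameters(self, tau=None):
    if tau is None:
        tau = self.tau
    for target, online in ((self.target_actor, self.actor),
                           (self.target_critic, self.critic)):
        for t_param, o_param in zip(target.parameters(), online.parameters()):
            t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)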

def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
             n_actions=2, buffer_size=1e6, batch_size=64):
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.replay_buffer = ReplayBuffer(int(buffer_size))  # cast: default is a float
    self.sess = tf.Session()

    self.actor = Actor(alpha, input_dims, n_actions, 'Actor', self.sess,
                       env.action_space.high)
    self.critic = Critic(beta, input_dims, n_actions, 'Critic', self.sess)
    self.target_actor = Actor(alpha, input_dims, n_actions, 'TargetActor',
                              self.sess, env.action_space.high)
    self.target_critic = Critic(beta, input_dims, n_actions, 'TargetCritic',
                                self.sess)
    self.noise = OUActionNoise(mu=np.zeros(n_actions))

    # Polyak-averaging ops: target <- tau * online + (1 - tau) * target.
    self.update_critic = [
        self.target_critic.params[i].assign(
            tf.multiply(self.critic.params[i], self.tau)
            + tf.multiply(self.target_critic.params[i], 1. - self.tau))
        for i in range(len(self.target_critic.params))]
    self.update_actor = [
        self.target_actor.params[i].assign(
            tf.multiply(self.actor.params[i], self.tau)
            + tf.multiply(self.target_actor.params[i], 1. - self.tau))
        for i in range(len(self.target_actor.params))]

    self.sess.run(tf.global_variables_initializer())
    self.update_network_parameters(first=True)
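
# The TF1 agent above precomputes its Polyak-averaging ops, but its
# update_network_parameters is not shown. A sketch under the assumption that
# first=True should hard-copy the online weights into the targets; since tau
# is baked into the precomputed ops at graph-construction time, the initial
# copy builds plain assign ops instead of reusing them.
def update_network_parameters(self, first=False):
    if first:
        self.sess.run([t.assign(o) for t, o in
                       zip(self.target_actor.params, self.actor.params)])
        self.sess.run([t.assign(o) for t, o in
                       zip(self.target_critic.params, self.critic.params)])
    else:
        self.sess.run(self.update_actor)
        self.sess.run(self.update_critic)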

def __init__(self, alpha, beta, input_dims, tau, env, brain_name, gamma=.99,
             n_actions=2, mem_capacity=1e6, layer1_size=400, layer2_size=300,
             batch_size=64, multiagent=False, n_agents=None,
             game_name='Rollerball'):
    # Initialize memory.
    self.batch_size = batch_size
    self.memory = ReplayBuffer(int(mem_capacity))  # cast: default is a float

    # Initialize noise.
    self.noise = OUActionNoise(np.zeros(n_actions))

    # Set up the device used for torch computations.
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Create the actor, critic, and target networks.
    self.actor = ActorNet(alpha, input_dims, layer1_size, layer2_size,
                          n_actions,
                          name='actor_' + game_name + '_ddpg_model').to(self.device)
    self.target_actor = ActorNet(alpha, input_dims, layer1_size, layer2_size,
                                 n_actions).to(self.device)
    self.critic = CriticNet(beta, input_dims, layer1_size, layer2_size,
                            n_actions,
                            name='critic_' + game_name + '_ddpg_model').to(self.device)
    self.target_critic = CriticNet(beta, input_dims, layer1_size, layer2_size,
                                   n_actions).to(self.device)

    # Initialize the target nets to be identical to the actor and critic networks.
    self.init_networks()

    # Target networks are set to eval mode, since they are never trained
    # directly but only updated with the target_network_update function.
    self.target_actor.eval()
    self.target_critic.eval()

    # Set global parameters.
    self.gamma = gamma
    self.env = env
    self.tau = tau
    self.state_space = input_dims
    self.action_space = n_actions
    self.multiagent = multiagent
    self.brain_name = brain_name
    if self.multiagent:
        self.n_agents = n_agents

    # Plotter object for showing live training graphs and saving them.
    self.plotter = RLPlots('ddpg_training')
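
# init_networks is referenced above but not shown. A plausible implementation,
# assuming the standard PyTorch idiom for making the target networks start
# identical to the online ones:
def init_networks(self):
    self.target_actor.load_state_dict(self.actor.state_dict())
    self.target_critic.load_state_dict(self.critic.state_dict())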

def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
             mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
             critic_l2_size, batch_size):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(mem_size, n_states, n_actions)
    self.batch_size = batch_size

    self.actor = Actor(lr_actor, n_states, n_actions,
                       actor_l1_size, actor_l2_size)
    self.critic = Critic(lr_critic, n_states, n_actions,
                         critic_l1_size, critic_l2_size)
    self.target_actor = Actor(lr_actor, n_states, n_actions,
                              actor_l1_size, actor_l2_size)
    self.target_critic = Critic(lr_critic, n_states, n_actions,
                                critic_l1_size, critic_l2_size)

    self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

    # tau=1 hard-copies the online weights into the freshly created targets.
    self.update_network_parameters(tau=1)
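
# For completeness, a hedged sketch of how an agent like this one typically
# selects actions: the deterministic policy output plus a sample from the OU
# process above. It assumes a PyTorch implementation; the method name, the
# forward() signature, and the CPU tensors are assumptions, not original code.
def choose_action(self, observation):
    self.actor.eval()  # avoid updating e.g. batch-norm statistics
    state = torch.tensor(np.array([observation]), dtype=torch.float32)
    with torch.no_grad():
        mu = self.actor.forward(state).numpy()[0]
    self.actor.train()
    return mu + self.noise()  # exploration noise (sigma=0.005 above)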

def __init__(self, alpha, beta, tau, gamma, state_space, l1_size, l2_size,
             l3_size, l4_size, action_space, env, brain_name, multibrain,
             version, mem_capacity=1e6, batch_size=128, multiagent=False,
             n_agents=None, eval=False):
    # Initialize memory.
    self.batch_size = batch_size
    self.memory = ReplayBuffer(int(mem_capacity))  # cast: default is a float

    # Initialize noise. In a multiagent environment, create a separate
    # noise object for each agent.
    self.noise = ([OUActionNoise(np.zeros(action_space)) for _ in range(n_agents)]
                  if multiagent else OUActionNoise(np.zeros(action_space)))

    # Set up the device used for torch computations.
    self.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')

    # Create the actor, critic, and target networks.
    self.actor = ActorNet(alpha, state_space, l1_size, l2_size, l3_size,
                          l4_size, action_space,
                          name='actor_' + version + '_ddpg_model').to(self.device)
    self.target_actor = ActorNet(alpha, state_space, l1_size, l2_size,
                                 l3_size, l4_size, action_space).to(self.device)
    self.critic = CriticNet(beta, state_space, l1_size, l2_size, l3_size,
                            l4_size, action_space,
                            name='critic_' + version + '_ddpg_model').to(self.device)
    self.target_critic = CriticNet(beta, state_space, l1_size, l2_size,
                                   l3_size, l4_size, action_space).to(self.device)

    # Initialize the target nets to be identical to the actor and critic networks.
    self.init_networks()

    # Target networks are set to eval mode, since they are never trained
    # directly but only updated with the target_network_update function.
    self.target_actor.eval()
    self.target_critic.eval()

    # Set global parameters.
    self.gamma = gamma
    self.env = env
    self.tau = tau
    self.eval = eval
    self.state_space = state_space
    self.action_space = action_space
    self.multiagent = multiagent
    self.multibrain = multibrain
    self.brain_name = brain_name
    self.n_agents = n_agents if self.multiagent else None

    # Initialize the plotter for showing live training graphs and saving them.
    self.plotter = RLPlots('ddpg_training')
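
# With a list of per-agent noise processes, sampling presumably happens once
# per agent during action selection. A small hedged helper (the name
# sample_noise is assumed, not from the original code) illustrating how
# self.noise would be used in both configurations:
def sample_noise(self):
    if self.multiagent:
        return np.stack([noise() for noise in self.noise])
    return self.noise()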

def training(file_name):
    # Create folders.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if not os.path.isdir(CSV_DIR):
        os.makedirs(CSV_DIR)
    if not os.path.isdir(FIGURE_TRAINING_DIR):
        os.makedirs(FIGURE_TRAINING_DIR)

    # Build the models.
    actor = Actor(name="actor")
    actor_target = Actor(name="actor_target")
    actor_initial_update_op = target_update_op(
        actor.trainable_variables, actor_target.trainable_variables, 1.0)
    actor_target_update_op = target_update_op(
        actor.trainable_variables, actor_target.trainable_variables,
        TARGET_UPDATE_RATE)

    critic = Critic(name="critic")
    critic.build_training()
    critic_target = Critic(name="critic_target")
    critic_initial_update_op = target_update_op(
        critic.trainable_variables, critic_target.trainable_variables, 1.0)
    critic_target_update_op = target_update_op(
        critic.trainable_variables, critic_target.trainable_variables,
        TARGET_UPDATE_RATE)

    # Reuse the critic weights (same name scope) so the actor loss can be
    # differentiated through Q(s, pi(s)).
    critic_with_actor = Critic(name="critic", A=actor.pi)
    actor.build_training(critic_with_actor.actor_loss)

    env = PendulumEnv()
    replay_buffer = ReplayBuffer(BUFFER_SIZE)
    action_noise = OUActionNoise(np.zeros(A_LENGTH))

    with tf.Session() as sess:
        # Initialize the actor and critic networks.
        sess.run(tf.global_variables_initializer())
        sess.run([actor_initial_update_op, critic_initial_update_op])

        list_final_reward = []
        # Warm-up episodes to fill the replay buffer before training starts.
        additional_episode = int(np.ceil(MIN_BUFFER_SIZE / MAX_FRAME))
        for episode in range(-additional_episode, MAX_EPISODE):
            list_actor_loss = []
            list_critic_loss = []

            # Reset the environment and the noise.
            s = env.reset()
            action_noise.reset()

            for step in range(MAX_FRAME):
                env.render()

                # Get the action.
                a = sess.run(actor.pi,
                             feed_dict={actor.S: np.reshape(s, (1, -1))})
                noise = action_noise.get_noise()
                a = a[0] + ACTION_SCALING * noise
                a = np.clip(a, -ACTION_SCALING, ACTION_SCALING)

                # Interact with the game engine.
                s1, r, _, _ = env.step(a)

                # Add the data to the replay buffer.
                data = [s, a, [r], s1]
                replay_buffer.append(data)

                if episode >= 0:
                    for _ in range(BATCHES_PER_STEP):
                        # Sample data from the replay buffer.
                        batch_data = replay_buffer.sample(BATCH_SIZE)
                        batch_s, batch_a, batch_r, batch_s1 = [
                            np.array([batch_data[j][i]
                                      for j in range(BATCH_SIZE)])
                            for i in range(len(batch_data[0]))]

                        # Compute the next action.
                        a1 = sess.run(actor_target.pi,
                                      feed_dict={actor_target.S: batch_s1})

                        # Compute the target Q.
                        q1 = sess.run(critic_target.q,
                                      feed_dict={critic_target.S: batch_s1,
                                                 critic_target.A: a1})
                        q_target = batch_r + DISCOUNT * q1

                        # Update the actor and critic.
                        _, _, actor_loss, critic_loss = sess.run(
                            [actor.train_op, critic.train_op,
                             actor.actor_loss, critic.critic_loss],
                            feed_dict={actor.S: batch_s,
                                       critic_with_actor.S: batch_s,
                                       actor.LR: LR_ACTOR,
                                       critic.S: batch_s,
                                       critic.A: batch_a,
                                       critic.QTarget: q_target,
                                       critic.LR: LR_CRITIC})
                        list_actor_loss.append(actor_loss)
                        list_critic_loss.append(critic_loss)

                    # Update the target networks.
                    sess.run([actor_target_update_op, critic_target_update_op])

                s = s1

            # Postprocessing after each episode.
            if episode >= 0:
                list_final_reward.append(r)
                avg_actor_loss = np.mean(list_actor_loss)
                avg_critic_loss = np.mean(list_critic_loss)
                print("Episode ", format(episode, "03d"), ":", sep="")
                print("  Final Reward = ", format(r, ".6f"),
                      ", Actor Loss = ", format(avg_actor_loss, ".6f"),
                      ", Critic Loss = ", format(avg_critic_loss, ".6f"),
                      sep="")

        # Testing.
        avg_reward = 0
        for i in range(TEST_EPISODE):
            # Reset the environment and the noise.
            s = env.reset()
            action_noise.reset()

            for step in range(MAX_FRAME):
                env.render()

                # Get the action (no exploration noise during testing).
                a = sess.run(actor.pi,
                             feed_dict={actor.S: np.reshape(s, (1, -1))})
                a = a[0]

                # Interact with the game engine.
                s, r, _, _ = env.step(a)

            # Postprocessing after each episode.
            avg_reward += r
        avg_reward /= TEST_EPISODE

        # Save the parameters.
        saver = tf.train.Saver([*actor.trainable_variables,
                                *critic.trainable_variables])
        saver.save(sess, SAVE_DIR + file_name)

    tf.contrib.keras.backend.clear_session()
    env.close()

    # Store the data in the csv file.
    with open(CSV_DIR + file_name + ".csv", "w") as f:
        fieldnames = ["Episode", "Final Reward", "Average Reward"]
        writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n")
        writer.writeheader()
        for episode in range(MAX_EPISODE):
            content = {"Episode": episode,
                       "Final Reward": list_final_reward[episode]}
            if episode == MAX_EPISODE - 1:
                content.update({"Average Reward": avg_reward})
            writer.writerow(content)

    # Plot the training process.
    list_episode = list(range(MAX_EPISODE))
    f, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
    ax.plot(list_episode, list_final_reward, "r-", label="Final Reward")
    ax.plot([MAX_EPISODE - 1], [avg_reward], "b.", label="Average Reward")
    ax.set_title("Final Reward")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Reward")
    ax.legend(loc="lower right")
    ax.grid()
    f.savefig(FIGURE_TRAINING_DIR + file_name + ".png")
    plt.close(f)
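
# target_update_op is used throughout the pendulum script above but not shown.
# A minimal TF1 sketch consistent with its call sites (import tensorflow as tf
# assumed): rate=1.0 yields the initial hard copy, rate=TARGET_UPDATE_RATE the
# soft update.
def target_update_op(online_vars, target_vars, rate):
    # target <- rate * online + (1 - rate) * target, grouped into a single op.
    return tf.group(*[t.assign(rate * o + (1.0 - rate) * t)
                      for o, t in zip(online_vars, target_vars)])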