def train(sess, env, args, actors, critics, noise, ave_n, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6): summary_ops, summary_vars = build_summaries(env.n) init = tf.global_variables_initializer() sess.run(init) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) # callbacks = [] # train_names = ['train_loss', 'train_mae'] # callback = TensorBoard(args['summary_dir']) for actor in actors: actor.update_target() for critic in critics: # callback = TensorBoard(args['summary_dir']) # callback.set_model(critic.mainModel) # callbacks.append(callback) critic.update_target() replayMemory = None replayMemory_ddpg = None # prioritized_replay_beta_iters = None if args["prioritized"]: replayMemory = PrioritizedReplayMemory(args['buffer_size'], args["prioritized_alpha"]) replayMemory_ddpg = ReplayMemory(int(args['buffer_size']), int(args['random_seed'])) else: replayMemory_ddpg = replayMemory = ReplayMemory( int(args['buffer_size']), int(args['random_seed'])) # Prioritized Replay # PrioritizedReplayMemory = PrioritizedReplayMemory(args['buffer_size']) for ep in range(int(args['max_episodes'])): start = time.time() s = env.reset() episode_reward = np.zeros((env.n, )) #episode_av_max_q = 0 for stp in range(int(args['max_episode_len'])): action_dims_done = 0 if args['render_env']: env.render() a = [] for i in range(env.n): actor = actors[i] state_input = np.reshape(s[i], (-1, actor.state_dim)) a.append( actor.act(state_input, noise[i]()).reshape(actor.action_dim, )) s2, r, done, _ = env.step( a) # a is a list with each element being an array #replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,))) replayMemory.add(s, a, r, done, s2) replayMemory_ddpg.add(s, a, r, done, s2) # Prioritized Replay Memory # replayMemory.store(s, a, r, done, s2) # replayMemory.sample(int(args["minibatch_size"])) # update priority with loss s = s2 # MADDPG Adversary Agent for i in range(ave_n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args['minibatch_size']): s_batch, a_batch, r_batch, d_batch, s2_batch, batch_idxes = None, None, None, None, None, None if args["prioritized"]: experience = replayMemory.sample( args['minibatch_size']) (s_batch, a_batch, r_batch, d_batch, s2_batch, batch_idxes) = experience print(len(batch_idxes)) else: s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch( int(args['minibatch_size'])) a = [] for j in range(ave_n): state_batch_j = np.asarray( [x for x in s_batch[:, j]] ) #batch processing will be much more efficient even though reshaping will have to be done a.append(actors[j].predict_target(state_batch_j)) #print(np.asarray(a).shape) a_temp = np.transpose(np.asarray(a), (1, 0, 2)) #print("a_for_critic", a_temp.shape) a_for_critic = np.asarray([x.flatten() for x in a_temp]) s2_batch_i = np.asarray([ x for x in s2_batch[:, i] ]) # Checked till this point, should be fine. 
# print("s2_batch_i", s2_batch_i.shape) targetQ = critic.predict_target( s2_batch_i, a_for_critic) # Should work, probably yi = [] for k in range(int(args['minibatch_size'])): if d_batch[:, i][k]: yi.append(r_batch[:, i][k]) else: yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) s_batch_i = np.asarray([x for x in s_batch[:, i]]) td_errors = critic.train( s_batch_i, np.asarray( [x.flatten() for x in a_batch[:, 0:ave_n, :]]), np.asarray(yi)) if args["prioritized"]: print(td_errors) new_priorities = np.abs( td_errors) + prioritized_replay_eps print(len(new_priorities)) replayMemory.update_priorities(batch_idxes, new_priorities) actions_pred = [] # for j in range(ave_n): for j in range(ave_n): state_batch_j = np.asarray([x for x in s2_batch[:, j]]) actions_pred.append( actors[j].predict(state_batch_j) ) # Should work till here, roughly, probably a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2)) a_for_critic_pred = np.asarray( [x.flatten() for x in a_temp]) s_batch_i = np.asarray([x for x in s_batch[:, i]]) grads = critic.action_gradients( s_batch_i, a_for_critic_pred)[:, action_dims_done:action_dims_done + actor.action_dim] actor.train(s_batch_i, grads) action_dims_done = action_dims_done + actor.action_dim # Only DDPG agent for i in range(ave_n, env.n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args["minibatch_size"]): s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory_ddpg.miniBatch( int(args["minibatch_size"])) s_batch_i = np.asarray([x for x in s_batch[:, i]]) action = np.asarray(actor.predict_target(s_batch_i)) action_for_critic = np.asarray( [x.flatten() for x in action]) s2_batch_i = np.asarray([x for x in s2_batch[:, i]]) # critic.predict_target(next state batch, actor_target(next state batch)) targetQ = critic.predict_target(s2_batch_i, action_for_critic) y_i = [] for k in range(int(args['minibatch_size'])): # If ep is end if d_batch[:, i][k]: y_i.append(r_batch[:, i][k]) else: y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) # state batch for agent i s_batch_i = np.asarray([x for x in s_batch[:, i]]) critic.train( s_batch_i, np.asarray([x.flatten() for x in a_batch[:, i]]), np.asarray(y_i)) action_for_critic_pred = actor.predict(s2_batch_i) gradients = critic.action_gradients( s_batch_i, action_for_critic_pred)[:, :] actor.train(s_batch_i, gradients) for i in range(0, env.n): actor = actors[i] critic = critics[i] actor.update_target() critic.update_target() episode_reward += r #print(done) if stp == int(args["max_episode_len"]) - 1 or np.all(done): ave_reward = 0.0 good_reward = 0.0 for i in range(env.n): if i < ave_n - 1: ave_reward += episode_reward[i] else: good_reward += episode_reward[i] #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)}) summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ave_reward, summary_vars[1]: good_reward }) # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))}) writer.add_summary(summary_str, ep) writer.flush() # print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp)))) showReward(episode_reward, env.n, ep, start) break #if stp == int(args['max_episode_len'])-1: #showReward(episode_reward, env.n, ep) # save model if ep % 50 == 0 and ep != 0: print("Starting saving model weights every 50 episodes") for i in range(env.n): # saveModel(actors[i], i, args["modelFolder"]) saveWeights(actors[i], i, 
args["modelFolder"]) print("Model weights saved") if ep % 200 == 0 and ep != 0: directory = args["modelFolder"] + "ep" + str(ep) + "/" if not os.path.exists(directory): os.makedirs(directory) print("Starting saving model weights to folder every 200 episodes") for i in range(env.n): # saveModel(actors[i], i, args["modelFolder"]) saveWeights(actors[i], i, directory) print("Model weights saved to folder")
def train(sess, env, args, actors, critics, noise, ave_n): summary_ops, summary_vars = build_summaries(env.n) init = tf.global_variables_initializer() sess.run(init) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) # callbacks = [] # train_names = ['train_loss', 'train_mae'] # callback = TensorBoard(args['summary_dir']) for actor in actors: actor.update_target() for critic in critics: critic.update_target() #for i in range(20): # print([noise[i]()for i in range(env.n)]) replayMemory = ReplayMemory(int(args['buffer_size']), int(args['random_seed'])) for ep in range(int(args['max_episodes'])): start = time.time() s = env.reset() episode_reward = np.zeros((env.n, )) #episode_av_max_q = 0 for stp in range(int(args['max_episode_len'])): action_dims_done = 0 if args['render_env']: env.render() a = [] for i in range(env.n): actor = actors[i] state_input = np.reshape(s[i], (-1, actor.state_dim)) a.append( actor.act(state_input, noise[i]()).reshape(actor.action_dim, )) # print(a) #time.sleep(10) s2, r, done, _ = env.step( a) # a is a list with each element being an array #replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,))) #if ep % 50 == 0: # env.render() replayMemory.add(s, a, r, done, s2) s = s2 # MADDPG Adversary Agent for i in range(ave_n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args['m_size']): s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch( int(args['m_size'])) a = [] for j in range(ave_n): state_batch_j = np.asarray( [x for x in s_batch[:, j]] ) #batch processing will be much more efficient even though reshaping will have to be done a.append(actors[j].predict_target(state_batch_j)) a_temp = np.transpose(np.asarray(a), (1, 0, 2)) a_for_critic = np.asarray([x.flatten() for x in a_temp]) s2_batch_i = np.asarray([x for x in s2_batch[:, i]]) targetQ = critic.predict_target(s2_batch_i, a_for_critic) yi = [] for k in range(int(args['m_size'])): if d_batch[:, i][k]: yi.append(r_batch[:, i][k]) else: yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) # a2 = actor.predict_target(s_batch) # Q_target = critic.predict_target(s2_batch, a2) # y = r + gamma * Q_target # TD loss = yi - critic.predict(s_batch, a_batch) s_batch_i = np.asarray([x for x in s_batch[:, i]]) a_batch_data = np.asarray( [x.flatten() for x in a_batch[:, 0:ave_n, :]]) target_q = np.asarray(yi) # loss = batch losses = [] # clip index = 0 # number of losses loss_num = int(int(args['m_size']) / int(args['n_size'])) for i in range(loss_num): loss = critic.get_loss( s_batch_i[index:index + int(args["n_size"])], a_batch_data[index:index + int(args["n_size"])], target_q[index:index + int(args["n_size"])]) losses.append(loss) index += int(args["n_size"]) # which has max loss sorted_index = np.argsort(losses).tolist() max_index = sorted_index[-1] # clip index head = max_index * int(args["n_size"]) tail = head + int(args["n_size"]) # clipped batch data with higher losses prioritized_a_batch = a_batch_data[head:tail] prioritized_s_batch = s_batch_i[head:tail] prioritized_target_q = target_q[head:tail] # critic train critic.train(prioritized_s_batch, prioritized_a_batch, prioritized_target_q) actions_pred = [] # for j in range(ave_n): for j in range(ave_n): state_batch_j = np.asarray([x for x in s2_batch[:, j]]) actions_pred.append(actors[j].predict( state_batch_j[head:tail])) a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2)) a_for_critic_pred = np.asarray( [x.flatten() for x in a_temp]) grads = 
critic.action_gradients( prioritized_s_batch, a_for_critic_pred)[:, action_dims_done:action_dims_done + actor.action_dim] # actor train actor.train(prioritized_s_batch, grads) action_dims_done = action_dims_done + actor.action_dim # Only DDPG agent for i in range(ave_n, env.n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args["minibatch_size"]): s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch( int(args["minibatch_size"])) s_batch_i = np.asarray([x for x in s_batch[:, i]]) action = np.asarray(actor.predict_target(s_batch_i)) action_for_critic = np.asarray( [x.flatten() for x in action]) s2_batch_i = np.asarray([x for x in s2_batch[:, i]]) targetQ = critic.predict_target(s2_batch_i, action_for_critic) y_i = [] for k in range(int(args['minibatch_size'])): if d_batch[:, i][k]: y_i.append(r_batch[:, i][k]) else: y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) s_batch_i = np.asarray([x for x in s_batch[:, i]]) critic.train( s_batch_i, np.asarray([x.flatten() for x in a_batch[:, i]]), np.asarray(y_i)) action_for_critic_pred = actor.predict(s2_batch_i) gradients = critic.action_gradients( s_batch_i, action_for_critic_pred)[:, :] actor.train(s_batch_i, gradients) for i in range(0, env.n): actor = actors[i] critic = critics[i] actor.update_target() critic.update_target() episode_reward += r #print(done) if stp == int(args["max_episode_len"]) - 1 or np.all(done): ave_reward = 0.0 good_reward = 0.0 for i in range(env.n): if i < ave_n: ave_reward += episode_reward[i] else: good_reward += episode_reward[i] #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)}) summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ave_reward, summary_vars[1]: good_reward }) # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))}) writer.add_summary(summary_str, ep) writer.flush() # print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp)))) showReward(episode_reward, env.n, ep, start) break #if stp == int(args['max_episode_len'])-1: #showReward(episode_reward, env.n, ep) # save model if ep % 50 == 0 and ep != 0: print("Starting saving model weights every 50 episodes") for i in range(env.n): # saveModel(actors[i], i, args["modelFolder"]) saveWeights(actors[i], i, args["modelFolder"]) print("Model weights saved") if ep % 200 == 0 and ep != 0: directory = args["modelFolder"] + "ep" + str(ep) + "/" if not os.path.exists(directory): os.makedirs(directory) print("Starting saving model weights to folder every 200 episodes") for i in range(env.n): # saveModel(actors[i], i, args["modelFolder"]) saveWeights(actors[i], i, directory) print("Model weights saved to folder")
def train(sess, env, args, actors, critics, noise):
    summary_ops, summary_vars = build_summaries(env.n)
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    for actor in actors:
        actor.update_target()
    for critic in critics:
        critic.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']), int(args['random_seed']))

    for ep in range(int(args['max_episodes'])):
        s = env.reset()
        episode_reward = np.zeros((env.n,))
        # episode_av_max_q = 0

        for stp in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            a = []
            action_dims_done = 0
            for i in range(env.n):
                actor = actors[i]
                a.append(actor.act(np.reshape(s[i], (-1, actor.state_dim)),
                                   noise[i]()).reshape(actor.action_dim,))

            # a is a list with each element being an array
            s2, r, done, _ = env.step(a)
            # replayMemory.add(np.reshape(s, (actor.input_dim,)), np.reshape(a, (actor.output_dim,)), r, done, np.reshape(s2, (actor.input_dim,)))
            replayMemory.add(s, a, r, done, s2)
            s = s2

            for i in range(env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['minibatch_size']):
                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args['minibatch_size']))

                    # Batch processing is much more efficient even though reshaping has to be done.
                    a = []
                    for j in range(env.n):
                        state_batch_j = np.asarray([x for x in s_batch[:, j]])
                        a.append(actors[j].predict_target(state_batch_j))
                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])

                    s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
                    targetQ = critic.predict_target(s2_batch_i, a_for_critic)

                    yi = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k])

                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    critic.train(s_batch_i,
                                 np.asarray([x.flatten() for x in a_batch]),
                                 np.asarray(yi))
                    # predictedQValue = critic.train(s_batch, np.asarray([x.flatten() for x in a_batch]), yi)
                    # episode_av_max_q += np.amax(predictedQValue)

                    actions_pred = []
                    for j in range(env.n):
                        state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                        actions_pred.append(actors[j].predict(state_batch_j))
                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray([x.flatten() for x in a_temp])

                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    grads = critic.action_gradients(s_batch_i, a_for_critic_pred)[
                        :, action_dims_done:action_dims_done + actor.action_dim]
                    actor.train(s_batch_i, grads)
                    # print("Training agent {}".format(i))
                    actor.update_target()
                    critic.update_target()
                    action_dims_done = action_dims_done + actor.action_dim

            episode_reward += r

            if np.all(done):
                summary_str = sess.run(summary_ops,
                                       feed_dict={summary_vars[0]: np.sum(episode_reward)})
                writer.add_summary(summary_str, ep)
                writer.flush()
                print('|Reward: {:d},{:d},{:d},{:d} | Episode: {:d}'.format(
                    int(episode_reward[0]), int(episode_reward[1]),
                    int(episode_reward[2]), int(episode_reward[3]), ep))
                break
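# distributed_train() below performs the same MADDPG/DDPG updates with loss-based mini-batch
# prioritization, but runs as the learner in an MPI setup: at the end of every episode it
# receives transition batches from the worker processes via comm.recv() and broadcasts the
# updated actor weights back to them via comm.send().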
def distributed_train(sess, env, args, actors, critics, noise, ave_n): """ 1. replay memory - for each timestep 2. async batch data 3. """ summary_ops, summary_vars = build_summaries(env.n) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) replayMemory = ReplayMemory(int(args['buffer_size']), int(args['random_seed'])) start_time = 0.0 end_time = 0.0 for ep in range(int(args['max_episodes'])): # collecting reward s = env.reset() episode_reward = np.zeros((env.n, )) start = time.time() for step in range(int(args['max_episode_len'])): action_dims_done = 0 a = [] for i in range(env.n): actor = actors[i] state_input = np.reshape(s[i], (-1, actor.state_dim)) a.append( actor.act(state_input, noise[i]()).reshape(actor.action_dim, )) s2, r, done, _ = env.step( a) # a is a list with each element being an array episode_reward += r if replayMemory.size() > int(args["minibatch_size"]): # MADDPG Adversary Agent for i in range(ave_n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args['m_size']): s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch( int(args['m_size'])) a = [] for j in range(ave_n): state_batch_j = np.asarray( [x for x in s_batch[:, j]] ) #batch processing will be much more efficient even though reshaping will have to be done a.append(actors[j].predict_target(state_batch_j)) a_temp = np.transpose(np.asarray(a), (1, 0, 2)) a_for_critic = np.asarray( [x.flatten() for x in a_temp]) s2_batch_i = np.asarray([x for x in s2_batch[:, i]]) targetQ = critic.predict_target( s2_batch_i, a_for_critic) yi = [] for k in range(int(args['m_size'])): if d_batch[:, i][k]: yi.append(r_batch[:, i][k]) else: yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) # a2 = actor.predict_target(s_batch) # Q_target = critic.predict_target(s2_batch, a2) # y = r + gamma * Q_target # TD loss = yi - critic.predict(s_batch, a_batch) s_batch_i = np.asarray([x for x in s_batch[:, i]]) a_batch_data = np.asarray( [x.flatten() for x in a_batch[:, 0:ave_n, :]]) target_q = np.asarray(yi) ############################################# ## prioritized_batch ############################################# # loss = batch losses = [] # clip index = 0 # number of losses loss_num = int( int(args['m_size']) / int(args['n_size'])) for i in range(loss_num): loss = critic.get_loss( s_batch_i[index:index + int(args["n_size"])], a_batch_data[index:index + int(args["n_size"])], target_q[index:index + int(args["n_size"])]) losses.append(loss) index += int(args["n_size"]) # which has max loss sorted_index = np.argsort(losses).tolist() max_index = sorted_index[-1] # clip index head = max_index * int(args["n_size"]) tail = head + int(args["n_size"]) # clipped batch data with higher losses prioritized_a_batch = a_batch_data[head:tail] prioritized_s_batch = s_batch_i[head:tail] prioritized_target_q = target_q[head:tail] ############################################# ## prioritized_batch ############################################# # critic train critic.train(prioritized_s_batch, prioritized_a_batch, prioritized_target_q) actions_pred = [] # for j in range(ave_n): for j in range(ave_n): state_batch_j = np.asarray( [x for x in s2_batch[:, j]]) actions_pred.append(actors[j].predict( state_batch_j[head:tail])) a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2)) a_for_critic_pred = np.asarray( [x.flatten() for x in a_temp]) grads = critic.action_gradients( prioritized_s_batch, a_for_critic_pred )[:, action_dims_done:action_dims_done + actor.action_dim] # actor train 
actor.train(prioritized_s_batch, grads) action_dims_done = action_dims_done + actor.action_dim # Only DDPG agent for i in range(ave_n, env.n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args["minibatch_size"]): s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch( int(args["minibatch_size"])) s_batch_i = np.asarray([x for x in s_batch[:, i]]) action = np.asarray(actor.predict_target(s_batch_i)) action_for_critic = np.asarray( [x.flatten() for x in action]) s2_batch_i = np.asarray([x for x in s2_batch[:, i]]) targetQ = critic.predict_target( s2_batch_i, action_for_critic) y_i = [] for k in range(int(args['minibatch_size'])): if d_batch[:, i][k]: y_i.append(r_batch[:, i][k]) else: y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) s_batch_i = np.asarray([x for x in s_batch[:, i]]) critic.train( s_batch_i, np.asarray([x.flatten() for x in a_batch[:, i]]), np.asarray(y_i)) action_for_critic_pred = actor.predict(s2_batch_i) gradients = critic.action_gradients( s_batch_i, action_for_critic_pred)[:, :] actor.train(s_batch_i, gradients) for i in range(0, env.n): actor = actors[i] critic = critics[i] actor.update_target() critic.update_target() if step == int(args["max_episode_len"]) - 1 or np.all(done): ############################################# ## Record reward data into tensorboard ############################################# ave_reward = 0.0 good_reward = 0.0 for i in range(env.n): if i < ave_n: ave_reward += episode_reward[i] else: good_reward += episode_reward[i] #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)}) summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ave_reward, summary_vars[1]: good_reward }) # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))}) writer.add_summary(summary_str, ep) writer.flush() showReward(episode_reward, env.n, ep, start) break if ep % 50 == 0 and ep != 0: print("Starting saving model weights every 50 episodes") for i in range(env.n): saveWeights(actors[i], i, args["modelFolder"]) print("Model weights saved") if ep % 100 == 0 and ep != 0: directory = args["modelFolder"] + "ep" + str(ep) + "/" if not os.path.exists(directory): os.makedirs(directory) print("Starting saving model weights to folder every 100 episodes") for i in range(env.n): saveWeights(actors[i], i, directory) print("Model weights saved to folder") # recieve batch data from workers batch_data = [comm.recv(source=i, tag=i) for i in range(1, size)] for batch in batch_data: for item in batch: (s, a, r, d, s2) = item replayMemory.add(s, a, r, d, s2) # send weights to workers actor_weights = [actor.mainModel.get_weights() for actor in actors] for i in range(1, size): comm.send(actor_weights, dest=i, tag=i)
print('Training for ' + str(frames) + ' frames.')
print('Batch size = ', batch_size)
print('Initial memory size = ', len(memory.data))
print('Update Q target frequency = ', update_frequency)
print('Evaluation frequency = ', evaluation_frequency)

n = 0
j = 0
while n in range(frames):
    done = False
    initial_state = env.reset()
    action = agent.getAction(LazyFrame2Torch(initial_state))
    state, reward, done, _ = env.step(action)
    memory.add(initial_state, action, reward, state, done)
    agent.decrease_epsilon()
    n += 1  # 5367
    while not done:
        action = agent.getAction(LazyFrame2Torch(state))
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)
        state = next_state
        agent.decrease_epsilon()
        n += 1  # 5368
        if memory.current_size >= batch_size:
            # get batch of size 32 from replay memory
def train(sess,env,args,actors,critics,noise, ave_n): summary_ops,summary_vars = build_summaries(env.n) init = tf.global_variables_initializer() sess.run(init) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) # callbacks = [] # train_names = ['train_loss', 'train_mae'] # callback = TensorBoard(args['summary_dir']) for actor in actors: actor.update_target() for critic in critics: # callback = TensorBoard(args['summary_dir']) # callback.set_model(critic.mainModel) # callbacks.append(callback) critic.update_target() replayMemory = ReplayMemory(int(args['buffer_size']),int(args['random_seed'])) for ep in range(int(args['max_episodes'])): start = time.time() s = env.reset() episode_reward = np.zeros((env.n,)) #episode_av_max_q = 0 for stp in range(int(args['max_episode_len'])): losses = [] # action_dims_done = 0 if args['render_env']: env.render() a = [] for i in range(env.n): actor = actors[i] state_input = np.reshape(s[i],(-1,actor.state_dim)) a.append(actor.act(state_input, noise[i]()).reshape(actor.action_dim,)) s2,r,done,_ = env.step(a) # a is a list with each element being an array #replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,))) replayMemory.add(s,a,r,done,s2) s = s2 # Only DDPG agent for i in range(env.n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args["minibatch_size"]): s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(int(args["minibatch_size"])) # action for critic s_batch_i = np.asarray([x for x in s_batch[:,i]]) action = np.asarray(actor.predict_target(s_batch_i)) action_for_critic = np.asarray([x.flatten() for x in action]) s2_batch_i = np.asarray([x for x in s2_batch[:, i]]) # critic.predict_target(next state batch, actor_target(next state batch)) targetQ = critic.predict_target(s2_batch_i, action_for_critic) y_i = [] for k in range(int(args['minibatch_size'])): # If ep is end if d_batch[:, i][k]: y_i.append(r_batch[:, i][k]) else: y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) # state batch for agent i s_batch_i= np.asarray([x for x in s_batch[:, i]]) critic.train(s_batch_i, np.asarray([x.flatten() for x in a_batch[:, i]]), np.asarray(y_i)) action_for_critic_pred = actor.predict(s2_batch_i) gradients = critic.action_gradients(s_batch_i, action_for_critic_pred)[:, :] actor.train(s_batch_i, gradients) for i in range(0, env.n): actor = actors[i] critic = critics[i] actor.update_target() critic.update_target() episode_reward += r #print(done) if stp == int(args["max_episode_len"])-1 or np.all(done) : ave_reward = 0.0 good_reward = 0.0 """ for i in range(env.n): if i < ave_n: ave_reward += episode_reward[i] else: good_reward += episode_reward[i] """ ave_reward = episode_reward[0] good_reward = episode_reward[2] #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)}) summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: ave_reward, summary_vars[1]: good_reward}) # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))}) writer.add_summary(summary_str, ep) writer.flush() # print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp)))) showReward(episode_reward, env.n, ep, start) break #if stp == int(args['max_episode_len'])-1: #showReward(episode_reward, env.n, ep) # save model if ep % 50 == 0 and ep != 0: print("Starting saving model weights every 50 
episodes") for i in range(env.n): # saveModel(actors[i], i, args["modelFolder"]) saveWeights(actors[i], i, args["modelFolder"]) print("Model weights saved") if ep % 200 == 0 and ep != 0: directory = args["modelFolder"] + "ep" + str(ep) + "/" if not os.path.exists(directory): os.makedirs(directory) print("Starting saving model weights to folder every 200 episodes") for i in range(env.n): # saveModel(actors[i], i, args["modelFolder"]) saveWeights(actors[i], i, directory) print("Model weights saved to folder")
def train_ddpg(env, num_steps, replay_size, batch_size, gamma, noise, num_saves=5, replay_prepopulate_steps=0, lr_critic=1e-2, lr_actor=1e-4, tau=0.001, max_action=1.0, min_action=-1.0, rand_face=False): # get the state_size from the environment state_size = env.observation_space.shape[0] action_size = env.action_space.shape[0] # initialize the Critic and target Critic models critic_model = Critic(state_size, action_size).to(device) critic_target = copy.deepcopy(critic_model) # initialize the Actor and target Actor models actor_model = Actor(state_size, action_size, 1.0).to(device) actor_target = copy.deepcopy(actor_model) # initialize the optimizer critic_optimizer = torch.optim.Adam(critic_model.parameters(), weight_decay=lr_critic) actor_optimizer = torch.optim.Adam(actor_model.parameters(), lr=lr_actor) # initialize the replay memory and prepopulate it memory = ReplayMemory(replay_size, state_size, action_size) memory.populate(env, replay_prepopulate_steps, rand_face) # initiate lists to store returns, lengths and losses returns = [] lengths = [] losses = [] # initiate structures to store the models at different stages of training t_saves = np.linspace(0, num_steps, num_saves - 1, endpoint=False) saved_models = {} saved_policies = {} i_episode = 0 # use this to indicate the index of the current episode t_episode = 0 # use this to indicate the time-step inside current episode G = 0 # initializing return variable (incremental tally during each episode) state = env.reset(rand_face) # initialize state of first episode # iterate for a total of `num_steps` steps pbar = tqdm.trange(num_steps) for t_total in pbar: # use t_total to indicate the time-step from the beginning of training # save model if t_total in t_saves: model_name = '%04.1f' % (100 * t_total / num_steps) model_name = model_name.replace('.', '_') saved_models[model_name] = copy.deepcopy(critic_model) saved_policies[model_name] = copy.deepcopy(actor_model) action_arr = (actor_model( torch.FloatTensor(state).to(device)).cpu().data.numpy()) + noise() action_arr = (((action_arr + 1.0) / 2.0) * (max_action - min_action)) + min_action action = np.clip(action_arr, a_min=min_action, a_max=max_action) ss, r, done, info = env.step(action) memory.add(state=state, action=action, reward=r, next_state=ss, done=done) batch = memory.sample(batch_size) loss = train_ddpg_batch(critic_optimizer, actor_optimizer, batch, critic_model, critic_target, actor_model, actor_target, gamma, tau) losses.append(loss) if done: # When episode is done, collect return, update counters, and reset env G += (gamma**t_episode) * r returns.append(G) pbar.set_description('Episode: %d | Steps: %d | Return: %5.2f' % (i_episode, t_episode + 1, G)) lengths.append(t_episode + 1) t_episode = 0 i_episode += 1 G = 0 state = env.reset(rand_face) else: # While episode is not done, move state pointer forward and update return state = ss G += (gamma**t_episode) * r t_episode += 1 saved_models['100_0'] = copy.deepcopy(critic_model) saved_policies['100_0'] = copy.deepcopy(actor_model) return ( saved_models, saved_policies, np.array(returns), np.array(lengths), np.array(losses), )
terminal = 0
reward = 0
if done:
    terminal = 1
    if not step >= 195:
        reward = -1
sum_reward += reward

obs_queue.put(obs)
reward_queue.put(reward)
action_queue.put(action)

# running n-step return
step_reward = step_reward / gamma + reward * (gamma**N_STEP)
if step >= N_STEP - 1:
    memory.add(obs_queue.get(), action_queue.get(), step_reward, next_obs, terminal)
    step_reward -= reward_queue.get()
if done:
    while not action_queue.empty():
        step_reward = step_reward / gamma
        memory.add(obs_queue.get(), action_queue.get(), step_reward, next_obs, terminal)
        step_reward -= reward_queue.get()

obs = next_obs.copy()
step += 1
total_step += 1
if total_step < initial_exploration:
    continue
writer = tf.summary.FileWriter("./log", tf.Session().graph) episode_reward = 0 step = 1 while True: #env.render() state1 = state[np.newaxis, :] action, action_matrix, prob = actor.act(state1) next_state, reward, done, info = env.step(action) replayMemory.add(state, action, reward, done, next_state, prob) state = next_state episode_reward += reward ##############################train###################### if replayMemory.size() >= 128: state_b, action_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniBatch( int(64)) next_state_b_value = actor.predict(next_state_b) state_b_value = actor.predict(state_b) length = state_b.shape[0] for i in range(length): target_next = reward_b[i] if not done_b[i]:
step_reward = step_reward / gamma + reward * (gamma**N_STEP)
if step >= N_STEP - 1:
    with torch.no_grad():
        max_next_q_value_index = qf(torch.Tensor([next_obs])).max(
            dim=1, keepdim=True)[1].numpy().squeeze()
        max_next_q_value = target_qf(torch.Tensor([next_obs]))[0][max_next_q_value_index].numpy()
        current_state = obs_queue.get()
        current_action = action_queue.get()
        q_value = qf(torch.Tensor([current_state]))[0][current_action].numpy()
        td_error = abs(step_reward + max_next_q_value * (gamma**N_STEP) - q_value)
        priority = td_error
        memory.add(current_state, current_action, step_reward, next_obs, priority, terminal)
        step_reward -= reward_queue.get()
if done:
    while not action_queue.empty():
        with torch.no_grad():
            step_reward = step_reward / gamma
            max_next_q_value_index = qf(torch.Tensor([next_obs])).max(
                dim=1, keepdim=True)[1].numpy().squeeze()
            max_next_q_value = target_qf(torch.Tensor([next_obs]))[0][max_next_q_value_index].numpy()
            current_state = obs_queue.get()
            current_action = action_queue.get()
            q_value = qf(torch.Tensor([current_state]))[0][current_action].numpy()
            td_error = abs(step_reward + max_next_q_value * (gamma**N_STEP) - q_value)
def train(sess, env, args, actors, critics, noise, ave_n): summary_ops, summary_vars = build_summaries(env.n) init = tf.global_variables_initializer() sess.run(init) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) # callbacks = [] # train_names = ['train_loss', 'train_mae'] # callback = TensorBoard(args['summary_dir']) for actor in actors: actor.update_target() for critic in critics: # callback = TensorBoard(args['summary_dir']) # callback.set_model(critic.mainModel) # callbacks.append(callback) critic.update_target() replayMemory = ReplayMemory(int(args['buffer_size']), int(args['random_seed'])) for ep in range(int(args['max_episodes'])): start = time.time() s = env.reset() episode_reward = np.zeros((env.n, )) #episode_av_max_q = 0 for stp in range(int(args['max_episode_len'])): losses = [] action_dims_done = 0 if args['render_env']: env.render() a = [] for i in range(env.n): actor = actors[i] state_input = np.reshape(s[i], (-1, actor.state_dim)) a.append( actor.act(state_input, noise[i]()).reshape(actor.action_dim, )) s2, r, done, _ = env.step( a) # a is a list with each element being an array #replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,))) replayMemory.add(s, a, r, done, s2) s = s2 # MADDPG Adversary Agent for i in range(ave_n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args['minibatch_size']): s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch( int(args['minibatch_size'])) a = [] for j in range(ave_n): state_batch_j = np.asarray( [x for x in s_batch[:, j]] ) #batch processing will be much more efficient even though reshaping will have to be done a.append(actors[j].predict_target(state_batch_j)) #print(np.asarray(a).shape) a_temp = np.transpose(np.asarray(a), (1, 0, 2)) #print("a_for_critic", a_temp.shape) a_for_critic = np.asarray([x.flatten() for x in a_temp]) s2_batch_i = np.asarray([ x for x in s2_batch[:, i] ]) # Checked till this point, should be fine. 
# print("s2_batch_i", s2_batch_i.shape) targetQ = critic.predict_target( s2_batch_i, a_for_critic) # Should work, probably yi = [] for k in range(int(args['minibatch_size'])): if d_batch[:, i][k]: yi.append(r_batch[:, i][k]) else: yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) s_batch_i = np.asarray([x for x in s_batch[:, i]]) # critic.train() #critic.train(s_batch_i,np.asarray([x.flatten() for x in a_batch]),np.asarray(yi)) loss = critic.train( s_batch_i, np.asarray( [x.flatten() for x in a_batch[:, 0:ave_n, :]]), np.asarray(yi)) losses.append(loss) # callback.set_model(critic.mainModel) # write_log(callback, train_names, logs, ep) #predictedQValue = critic.train(s_batch,np.asarray([x.flatten() for x in a_batch]),yi) #episode_av_max_q += np.amax(predictedQValue) actions_pred = [] # for j in range(ave_n): for j in range(ave_n): state_batch_j = np.asarray([x for x in s2_batch[:, j]]) actions_pred.append( actors[j].predict(state_batch_j) ) # Should work till here, roughly, probably a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2)) a_for_critic_pred = np.asarray( [x.flatten() for x in a_temp]) s_batch_i = np.asarray([x for x in s_batch[:, i]]) grads = critic.action_gradients( s_batch_i, a_for_critic_pred)[:, action_dims_done:action_dims_done + actor.action_dim] actor.train(s_batch_i, grads) #print("Training agent {}".format(i)) #actor.update_target() #critic.update_target() action_dims_done = action_dims_done + actor.action_dim # Only DDPG agent for i in range(ave_n, env.n): actor = actors[i] critic = critics[i] if replayMemory.size() > int(args["minibatch_size"]): s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch( int(args["minibatch_size"])) # action for critic s_batch_i = np.asarray([x for x in s_batch[:, i]]) action = np.asarray(actor.predict_target(s_batch_i)) #print("action", action.shape) # a_temp = np.transpose(np.asarray(a),(1,0,2)) # a_for_critic = np.asarray([x.flatten() for x in a_temp]) # for j in range(env.n): # print(np.asarray([x for x in s_batch[:,j]]).shape) action_for_critic = np.asarray( [x.flatten() for x in action]) s2_batch_i = np.asarray([x for x in s2_batch[:, i]]) # critic.predict_target(next state batch, actor_target(next state batch)) targetQ = critic.predict_target(s2_batch_i, action_for_critic) #print("length: ", len(targetQ)) #print(targetQ) #time.sleep(10) # loss = meanSquare(y - Critic(batch state, batch action) # y = batch_r + gamma * targetQ y_i = [] for k in range(int(args['minibatch_size'])): # If ep is end if d_batch[:, i][k]: y_i.append(r_batch[:, i][k]) else: y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k]) # state batch for agent i s_batch_i = np.asarray([x for x in s_batch[:, i]]) loss = critic.train( s_batch_i, np.asarray([x.flatten() for x in a_batch[:, i]]), np.asarray(y_i)) losses.append(loss) # callback.set_model(critic.mainModel) # write_log(callback, train_names, logs, ep) action_for_critic_pred = actor.predict(s2_batch_i) gradients = critic.action_gradients( s_batch_i, action_for_critic_pred)[:, :] # check gradients """ grad_check = tf.check_numerics(gradients, "something wrong with gradients") with tf.control_dependencies([grad_check]): actor.train(s_batch_i, gradients) """ actor.train(s_batch_i, gradients) # actor.update_target() # critic.update_target() for i in range(0, env.n): actor = actors[i] critic = critics[i] actor.update_target() critic.update_target() episode_reward += r #print(done) if stp == int(args["max_episode_len"]) - 1 or np.all(done): """ ave_reward = 0.0 good_reward = 0.0 for i 
in range(env.n): if i < ave_n - 1: ave_reward += episode_reward[i] else: good_reward += episode_reward[i] """ # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)}) # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: ave_reward, summary_vars[1]: good_reward}) #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))}) #writer.add_summary(summary_str, ep) #writer.flush() # print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp)))) showReward(episode_reward, env.n, ep, start) break #if stp == int(args['max_episode_len'])-1: #showReward(episode_reward, env.n, ep) # save model if ep % 50 == 0 and ep != 0: print("Starting saving model weights every 50 episodes") for i in range(env.n): # saveModel(actors[i], i, args["modelFolder"]) saveWeights(actors[i], i, args["modelFolder"]) print("Model weights saved") if ep % 200 == 0 and ep != 0: directory = args["modelFolder"] + "ep" + str(ep) + "/" if not os.path.exists(directory): os.makedirs(directory) print("Starting saving model weights to folder every 200 episodes") for i in range(env.n): # saveModel(actors[i], i, args["modelFolder"]) saveWeights(actors[i], i, directory) print("Model weights saved to folder")
def train(sess, env, actor, critic, actor_noise): # Set up summary Ops summary_ops, summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter('./testsummaries/', sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayMemory(10000) episodes = 200000 for i in range(episodes): s = env.reset() ep_reward = 0 for j in range(1): # Added exploration noise #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i)) # print('state ', s) # print('prediction ', actor.predict(s.reshape((1,5,1)))) a = actor.predict(s.reshape((1, 5, 1))) #+ actor_noise() # print('action ', a) # print('noise', actor_noise()) s2, r, terminal = env.step(a) replay_buffer.add(s, a.reshape((env.players, 1)), r, s2, terminal) # Keep adding experience to the memory until # there are at least minibatch size samples # print('buffer_size', replay_buffer.size()) if replay_buffer.count() > 32: batch = replay_buffer.getBatch(32) states = np.asarray([seq[0] for seq in batch]) actions = np.asarray([seq[1] for seq in batch]) rewards = np.asarray([seq[2] for seq in batch]) new_states = np.asarray([seq[3] for seq in batch]) dones = np.asarray([seq[4] for seq in batch]) y_t = rewards.copy() #Compute the target values target_q_values = critic.target_model.predict( [new_states, actor.target_model.predict(new_states)]) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k] else: gamma = 0.98 # print(rewards[k].shape, target_q_values[k].shape, y_t[k].shape) y_t[k] = rewards[k] + gamma * target_q_values[k] if (1): # self.loss += self.critic.model.train_on_batch([states, actions], y_t) actions_for_grad = actor.model.predict(states) grads = critic.gradients(states, actions_for_grad) # print('shapes' ,states.shape, actions.shape, y_t.shape ) critic.train(states, actions, y_t) actor.train(states, grads) actor.update_target_network() critic.update_target_network() """ states = [np.expand_dims(seq[0], axis=0) for seq in batch] actions = [np.expand_dims(seq[1], axis=0) for seq in batch] rewards = [seq[2] for seq in batch] new_states = [np.expand_dims(seq[3], axis=0) for seq in batch] dones = [seq[4] for seq in batch] target_q_values = critic.predict_target_separate(new_states, actor.predict_target_separate(new_states)) y_t = deepcopy(target_q_values) for k in range(len(batch)): if dones[k]: y_t[k] = rewards[k].reshape((rewards[k].shape[0], 1)) else: gamma = 0.98 y_t[k] = rewards[k].reshape((rewards[k].shape[0], 1)) + gamma * target_q_values[k] actions_for_grads = actor.predict_separate(states) grads = critic.gradients_separate(states, actions_for_grads) actor.train_separate(states, grads) critic.train_separate(states, actions, y_t) actor.update_target_network() critic.update_target_network() """ ep_reward += r # if terminal: # summary_str = sess.run(summary_ops, feed_dict={ # summary_vars[0]: np.sum(ep_reward), # }) # writer.add_summary(summary_str, i) # writer.flush() # break if i % 100 == 0: print("episode {} reward: {}".format(i, np.sum(ep_reward))) summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: np.sum(ep_reward), }) writer.add_summary(summary_str, i) writer.flush() break
class Actor:
    def __init__(self, path, model_path, target_model_path, actor_index):
        self.path = path
        self.model_path = model_path
        self.target_model_path = target_model_path
        self.actor_index = actor_index
        self.lr = 1e-3
        self.gamma = 0.95
        self.epsilon = 0.3
        self.batch_size = 32
        self.initial_exploration = 500
        self.N_STEP = 3
        self.step_reward = 0
        self.qf = DuelingQFunc()
        self.target_qf = DuelingQFunc()
        # model.state_dict(): fetches the model's learned parameters
        self.target_qf.load_state_dict(self.qf.state_dict())
        self.optimizer = optim.Adam(self.qf.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()
        self.env = gym.make('CartPole-v0')
        self.obs_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.obs_queue = queue.Queue()
        self.reward_queue = queue.Queue()
        self.action_queue = queue.Queue()
        self.total_step = 0
        self.ten_step = 0
        self.temporal_memory = ReplayMemory()

    def run(self):
        for episode in range(1000):
            done = False
            obs = self.env.reset()
            sum_reward = 0
            step = 0
            self.step_reward = 0
            self.obs_queue = queue.Queue()
            self.reward_queue = queue.Queue()
            self.action_queue = queue.Queue()

            while not done:
                if random.random() < self.epsilon:
                    action = self.env.action_space.sample()
                else:
                    action = self.qf.select_action(obs)
                self.epsilon -= 1e-4
                if self.epsilon < 0:
                    self.epsilon = 0

                next_obs, reward, done, _ = self.env.step(action)
                terminal = 0
                reward = 0
                if done:
                    terminal = 1
                    if not step >= 195:
                        reward = -1
                sum_reward += reward

                self.obs_queue.put(obs)
                self.reward_queue.put(reward)
                self.action_queue.put(action)

                self.step_reward = self.step_reward / self.gamma + reward * (self.gamma**self.N_STEP)
                if step >= self.N_STEP - 1:
                    with torch.no_grad():
                        max_next_q_value_index = self.qf(torch.Tensor([next_obs])).max(
                            dim=1, keepdim=True)[1].numpy().squeeze()
                        max_next_q_value = self.target_qf(
                            torch.Tensor([next_obs]))[0][max_next_q_value_index].numpy()
                        current_state = self.obs_queue.get()
                        current_action = self.action_queue.get()
                        q_value = self.qf(torch.Tensor([current_state]))[0][current_action].numpy()
                        td_error = abs(self.step_reward + max_next_q_value *
                                       (self.gamma**self.N_STEP) - q_value)
                        priority = td_error
                        self.temporal_memory.add(current_state, current_action, self.step_reward,
                                                 next_obs, priority, terminal)
                        self.step_reward -= self.reward_queue.get()
                if done:
                    while not self.action_queue.empty():
                        with torch.no_grad():
                            self.step_reward = self.step_reward / self.gamma
                            max_next_q_value_index = self.qf(torch.Tensor([next_obs])).max(
                                dim=1, keepdim=True)[1].numpy().squeeze()
                            max_next_q_value = self.target_qf(
                                torch.Tensor([next_obs]))[0][max_next_q_value_index].numpy()
                            current_state = self.obs_queue.get()
                            current_action = self.action_queue.get()
                            q_value = self.qf(torch.Tensor([current_state]))[0][current_action].numpy()
                            td_error = abs(self.step_reward + max_next_q_value *
                                           (self.gamma**self.N_STEP) - q_value)
                            priority = td_error
                            self.temporal_memory.add(current_state, current_action, self.step_reward,
                                                     next_obs, priority, terminal)
                            self.step_reward -= self.reward_queue.get()

                while True and self.total_step % 50 == 0:
                    try:
                        if os.path.isfile(self.path):
                            # load the shared memory file
                            trans_memory = torch.load(self.path)
                            # delete the memory file
                            os.remove(self.path)
                            # append the local transitions
                            # vstack joins the innermost elements (http://ailaby.com/vstack_hstack/)
                            # vstack = concatenate(axis=0)
                            # hstack = concatenate(axis=1)
                            temporal_memory_size = self.temporal_memory.get_memory_size()
                            trans_memory['obs'] = np.vstack(
                                (trans_memory['obs'], self.temporal_memory.obs[:temporal_memory_size]))
                            trans_memory['action'] = np.vstack(
                                (trans_memory['action'], self.temporal_memory.actions[:temporal_memory_size]))
                            trans_memory['reward'] = np.vstack(
                                (trans_memory['reward'], self.temporal_memory.rewards[:temporal_memory_size]))
                            trans_memory['next_obs'] = np.vstack(
                                (trans_memory['next_obs'], self.temporal_memory.next_obs[:temporal_memory_size]))
                            trans_memory['priority'] = np.hstack(
                                (trans_memory['priority'], self.temporal_memory.priorities[:temporal_memory_size]))
                            trans_memory['terminate'] = np.vstack(
                                (trans_memory['terminate'], self.temporal_memory.terminates[:temporal_memory_size]))
                            # save the memory back to disk
                            torch.save(trans_memory, self.path)
                            self.temporal_memory = ReplayMemory()
                            break
                        else:
                            trans_memory = dict()
                            temporal_memory_size = self.temporal_memory.get_memory_size()
                            trans_memory['obs'] = self.temporal_memory.obs[:temporal_memory_size]
                            trans_memory['action'] = self.temporal_memory.actions[:temporal_memory_size]
                            trans_memory['reward'] = self.temporal_memory.rewards[:temporal_memory_size]
                            trans_memory['next_obs'] = self.temporal_memory.next_obs[:temporal_memory_size]
                            trans_memory['priority'] = self.temporal_memory.priorities[:temporal_memory_size]
                            trans_memory['terminate'] = self.temporal_memory.terminates[:temporal_memory_size]
                            torch.save(trans_memory, self.path)
                            self.temporal_memory = ReplayMemory()
                            break
                    except:
                        # if another process has the file open, wait a moment and try again
                        sleep(np.random.random() * 2 + 2)

                obs = next_obs.copy()
                step += 1
                self.total_step += 1
                if self.total_step < self.initial_exploration:
                    continue
                if self.total_step % 50 == 0:
                    # refresh the networks with the learner's latest weights
                    while True:
                        if os.path.isfile(self.model_path):
                            try:
                                self.qf.load_state_dict(torch.load(self.model_path))
                                self.target_qf.load_state_dict(torch.load(self.target_model_path))
                                break
                            except (FileNotFoundError, EOFError, RuntimeError):
                                sleep(np.random.random() * 2 + 2)

            self.ten_step += step
            if episode % 10 == 0:
                print('ID:', self.actor_index, ' episode:', episode,
                      'return:', self.ten_step / 10.0, 'epsilon:', self.epsilon)
                self.ten_step = 0
class DriverAgent:
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DriverAgent'  # name for uploading results
        self.env_name = env_name

        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # TensorFlow session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        # Actor & critic networks
        self.actor = ActorNetwork(self.sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
        self.critic = CriticNetwork(self.sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)

        # Replay memory
        self.memory = ReplayMemory(MEMORY_SIZE)

        # Loss value
        self.loss = 0

        # Load saved networks. Modify as you want.
        self.saver = tf.train.Saver()
        if not os.path.exists(ckp_dir):
            print("Could not find old network weights")
        else:
            self.saver.restore(self.sess, os.path.join(ckp_dir, ckp_name))
            print("Successfully loaded:", ckp_name)

    # Train code
    def train(self, state, action, reward, next_state, done):
        # Add the transition to the replay memory
        if not math.isnan(reward):
            self.memory.add(state, action, reward, next_state, done)
        if self.memory.count() <= START_REPLAY:
            return

        # Get a batch from the replay memory
        batch = self.memory.getBatch(BATCH_SIZE)
        states = np.asarray([e[0] for e in batch])
        actions = np.asarray([e[1] for e in batch])
        rewards = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        dones = np.asarray([e[4] for e in batch])

        # Get the target Q value from the critic network
        target_Q = self.critic.target_predict(
            [new_states, self.actor.target_predict(new_states)])

        # Compute the TD targets: y_t = r + gamma * target_Q (just r at terminal states)
        y_t = []
        for i in range(len(batch)):
            if dones[i]:
                y_t.append(rewards[i])
            else:
                y_t.append(rewards[i] + GAMMA * target_Q[i])
        y_t = np.resize(y_t, [BATCH_SIZE, 1])

        # Compute loss and gradients for each network, and train both
        _, loss = self.critic.train([states, actions], y_t)
        a_for_grad = self.actor.predict(states)
        grads = self.critic.gradients(states, a_for_grad)
        self.actor.train(states, grads)
        self.actor.target_train()
        self.critic.target_train()

    # Save your own network
    def saveNetwork(self, episode):
        if not os.path.exists(ckp_dir):
            os.mkdir(ckp_dir)
        ckp_name_real = ckp_name + '_' + str(episode)
        self.saver.save(self.sess, os.path.join(ckp_dir, ckp_name_real))

    def action(self, state):
        # Return an action for the given state (without exploration noise).
        action = np.zeros([self.action_dim])
        action_pre = self.actor.predict([state])
        action[0] = np.clip(action_pre[0][0], -1, 1)
        action[1] = np.clip(action_pre[0][1], 0, 1)
        action[2] = np.clip(action_pre[0][2], 0, 1)
        return action

    def noise_action(self, state, epsilon):
        # Return an action from the current policy plus OU exploration noise.
        action = np.zeros([self.action_dim])
        noise = np.zeros([self.action_dim])
        action_pre = self.actor.predict([state])
        noise[0] = epsilon * OU.function(action_pre[0][0], 0.0, 0.80, 0.60)
        noise[1] = epsilon * OU.function(action_pre[0][1], 0.7, 1.00, 0.10)
        noise[2] = epsilon * OU.function(action_pre[0][2], -0.1, 1.00, 0.05)
        action[0] = np.clip(action_pre[0][0] + noise[0], -1, 1)
        action[1] = np.clip(action_pre[0][1] + noise[1], 0, 1)
        action[2] = np.clip(action_pre[0][2] + noise[2], 0, 1)
        return action
def train(sess, env, args, actor, critic, actor_noise):
    summary_ops, summary_vars = build_summaries()
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    actor.update_target()
    critic.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']), int(args['random_seed']))

    for i in range(int(args['max_episodes'])):
        s = env.reset()
        episode_reward = 0
        episode_av_max_q = 0
        # if i % 50 == 0:
        #     actor.mainModel.save('results/mountainCar' + str(i) + '.h5')
        #     print("Saving Model now")

        for j in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            a = actor.act(np.reshape(s, (-1, actor.state_dim)), actor_noise())
            s2, r, done, _ = env.step(a[0])
            replayMemory.add(np.reshape(s, (actor.state_dim,)),
                             np.reshape(a, (actor.action_dim,)),
                             r, done,
                             np.reshape(s2, (actor.state_dim,)))

            if replayMemory.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                    int(args['minibatch_size']))
                targetQ = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                yi = []
                for k in range(int(args['minibatch_size'])):
                    if d_batch[k]:
                        yi.append(r_batch[k])
                    else:
                        yi.append(r_batch[k] + critic.gamma * targetQ[k])

                critic.train(s_batch, a_batch,
                             np.reshape(yi, (int(args['minibatch_size']), 1)))
                actions_pred = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, actions_pred)
                actor.train(s_batch, grads)
                actor.update_target()
                critic.update_target()

            s = s2
            episode_reward += r

            if done:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: episode_reward,
                                           summary_vars[1]: episode_av_max_q / float(j)
                                       })
                writer.add_summary(summary_str, i)
                writer.flush()
                print('|Reward: {:d}| Episode: {:d}'.format(int(episode_reward), i))
                break
action = qf.select_action(obs)
epsilon -= 1e-4
if epsilon < 0:
    epsilon = 0

next_obs, reward, done, _ = env.step(action)
terminal = 0
reward = 0
if done:
    terminal = 1
    if not step >= 195:
        reward = -1
sum_reward += reward

memory.add(obs, action, reward, next_obs, terminal)
obs = next_obs.copy()
step += 1
total_step += 1
if total_step < initial_exploration:
    continue

batch = memory.sample()
# fetch the state-action value for each sample
q_value = qf(batch['obs']).gather(1, batch['actions'])
# process every sample in the batch at once
with torch.no_grad():
    # take the index of the maximum value from the Q-network
writer = tf.summary.FileWriter("./log", tf.Session().graph) episode_reward = 0 step = 0 while True: #env.render() state1 = state[np.newaxis, :] action, action_matrix = actor.predict(state1) next_state, reward, done, info = env.step(action) replayMemory.add(state, action_matrix, reward, done, next_state) state = next_state episode_reward += reward #train if replayMemory.size() % 128 == 0 or done == True: state_b, action_matrix_b, reward_b, done_b, next_state_b = replayMemory.miniAll() reward_b = reward_b[:, np.newaxis] c_pre = critic.predict(next_state_b) state_pre_value = reward_b + c_pre*0.6
def train(): env = gym.make('LunarLander-v2') state = env.reset() actor = Actor(env.action_space, env.observation_space) critic = Critic(env.action_space, env.observation_space) actor.load() critic.load() replayMemory = ReplayMemory() summary_ops, summary_vars = build_summaries() writer = tf.summary.FileWriter("./log", tf.Session().graph) episode_reward = 0 step = 1 while True: #env.render() state1 = state[np.newaxis, :] action, action_matrix, prob = actor.predict(state1) next_state, reward, done, info = env.step(action) replayMemory.add(state, action_matrix, reward, done, next_state, prob) state = next_state episode_reward += reward #train if replayMemory.size() % 128 == 0 or done == True: state_b, action_matrix_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniAll( ) reward_b = reward_b[:, np.newaxis] c_pre = critic.predict(next_state_b) state_pre_value = reward_b + c_pre * 0.7 state_value = critic.predict(state_b) count = 5000 // step if count > 500: count = 500 if count < 1: count = 1 count = 10 for _ in range(count): critic.train(state_b, state_pre_value) for _ in range(count): actor.train(state_b, state_value, state_pre_value, action_matrix_b, prob_b) replayMemory.clear() ######################## if done: summary_str = tf.Session().run( summary_ops, feed_dict={summary_vars[0]: episode_reward}) writer.add_summary(summary_str, step) writer.flush() ##print("step = ", step, "episode_reward = ", episode_reward) state = env.reset() episode_reward = 0 step += 1 if step % 25 == 0: actor.save() critic.save()