def run():
    env = make_env.make_env('simple_tag')
    n = env.n
    exploration_noise = []
    actors = []
    for i in range(n):
        # load the trained actor model for agent i
        actors.append(load_model(args["modelFolder"] + str(i) + ".h5"))
        exploration_noise.append(OUNoise(mu=np.zeros(env.action_space[i].n)))

    # evaluate for 50 episodes
    noise = OUNoise(mu=np.zeros(5))
    import time
    for ep in range(50):
        s = env.reset()
        reward = 0.0
        for step in range(100):
            # time.sleep(0.05)
            env.render()
            actions = []
            for i in range(env.n):
                state_input = np.reshape(s[i], (-1, env.observation_space[i].shape[0]))
                predict_action = actors[i].predict(state_input)  # + noise()
                actions.append(predict_action.reshape(env.action_space[i].n,))
            s, r, d, s2 = env.step(actions)
            for i in range(env.n):
                reward += r[i]
            if np.all(d):
                break
        print("Episode: {:d} | Reward: {:f}".format(ep, reward))
def main(args):
    with tf.Session() as sess:
        env = make_env.make_env('simple_tag')
        n = env.n
        actors = []
        critics = []
        exploration_noise = []
        observation_dim = []
        action_dim = []

        # joint action dimension over all agents (input to the centralized critics)
        total_action_dim = 0
        for i in range(n):
            total_action_dim = total_action_dim + env.action_space[i].n

        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            # assuming a discrete action space here -> otherwise use env.action_space[i].shape[0]
            action_dim.append(env.action_space[i].n)
            actors.append(ActorNetwork(sess, observation_dim[i], action_dim[i],
                                       float(args['actor_lr']), float(args['tau'])))
            critics.append(CriticNetwork(sess, n, observation_dim[i], total_action_dim,
                                         float(args['critic_lr']), float(args['tau']),
                                         float(args['gamma'])))
            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        # if args['use_gym_monitor']:
        #     if not args['render_env']:
        #         envMonitor = wrappers.Monitor(env, args['monitor_dir'], video_callable=False, force=True)
        #     else:
        #         envMonitor = wrappers.Monitor(env, args['monitor_dir'], force=True)

        train(sess, env, args, actors, critics, exploration_noise)
def main(args):
    with tf.Session() as sess:
        env = gym.make('MountainCarContinuous-v0')
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        # single-agent DDPG: one actor, one critic, one exploration-noise process
        actors = []
        critics = []
        exploration_noise = []
        actors.append(ActorNetwork(sess, env.observation_space.shape[0],
                                   env.action_space.shape[0],
                                   float(args['actor_lr']), float(args['tau']),
                                   env.action_space.high))
        critics.append(CriticNetwork(sess, 1, env.observation_space.shape[0],
                                     env.action_space.shape[0],
                                     float(args['critic_lr']), float(args['tau']),
                                     float(args['gamma'])))
        exploration_noise.append(OUNoise(mu=np.zeros(env.action_space.shape[0])))

        train(sess, env, args, actors[0], critics[0], exploration_noise[0])
def main(args):
    # Master
    if rank == 0:
        #######################
        # Setting up:
        #  - environment, random seed
        #  - tensorflow options
        #  - networks
        #  - replay memory
        #######################
        if not os.path.exists(args["modelFolder"]):
            os.makedirs(args["modelFolder"])
        if not os.path.exists(args["summary_dir"]):
            os.makedirs(args["summary_dir"])

        # environment and random seed
        env = make_env.make_env('simple_tag')
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        # tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.35)
        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                              log_device_placement=False)) as sess:
            # count adversary and good agents
            n = env.n
            ave_n = 0
            good_n = 0
            for agent in env.agents:
                if agent.adversary:
                    ave_n += 1
                else:
                    good_n += 1

            # actor-critic networks
            actors = []
            critics = []
            exploration_noise = []
            observation_dim = []
            action_dim = []
            total_action_dim = 0

            # joint action dimension of the adversary (cooperative) agents
            for i in range(ave_n):
                total_action_dim = total_action_dim + env.action_space[i].n

            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                # assuming a discrete action space here -> otherwise use env.action_space[i].shape[0]
                action_dim.append(env.action_space[i].n)
                actors.append(ActorNetwork(sess, observation_dim[i], action_dim[i],
                                           float(args['actor_lr']), float(args['tau'])))
                if i < ave_n:
                    # MADDPG adversary: centralized critic over the joint adversary action
                    critics.append(CriticNetwork(sess, n, observation_dim[i], total_action_dim,
                                                 float(args['critic_lr']), float(args['tau']),
                                                 float(args['gamma'])))
                else:
                    # DDPG good agent: critic sees only its own action
                    critics.append(CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                                 float(args['critic_lr']), float(args['tau']),
                                                 float(args['gamma'])))
                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

            distributed_train_every_step(sess, env, args, actors, critics,
                                         exploration_noise, ave_n)
    # Worker
    else:
        #######################
        # Setting up:
        #  - tensorflow options
        #  - actor networks only
        #######################
        env = make_env.make_env('simple_tag')
        np.random.seed(int(args['random_seed']) + rank)
        tf.set_random_seed(int(args['random_seed']) + rank)
        env.seed(int(args['random_seed']) + rank)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.08)
        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                              log_device_placement=False)) as sess:
            # count adversary and good agents
            n = env.n
            ave_n = 0
            good_n = 0
            for agent in env.agents:
                if agent.adversary:
                    ave_n += 1
                else:
                    good_n += 1

            # workers only need the actors to collect experience
            actors = []
            exploration_noise = []
            observation_dim = []
            action_dim = []
            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(env.action_space[i].n)
                actors.append(ActorNetwork(sess, observation_dim[i], action_dim[i],
                                           float(args['actor_lr']), float(args['tau'])))
                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

            collect_batch(env, args, actors, exploration_noise, ave_n)
def test(args):
    # environment and random seed
    env = make_env.make_env('simple_tag')
    np.random.seed(int(args['random_seed']))
    tf.set_random_seed(int(args['random_seed']))
    # env.seed(int(args['random_seed']))

    with tf.Session() as sess:
        # count adversary and good agents
        n = env.n
        ave_n = 0
        good_n = 0
        for agent in env.agents:
            if agent.adversary:
                ave_n += 1
            else:
                good_n += 1

        # actor-critic networks
        actors = []
        critics = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n

        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            # assuming a discrete action space here -> otherwise use env.action_space[i].shape[0]
            action_dim.append(env.action_space[i].n)
            actors.append(ActorNetwork(sess, observation_dim[i], action_dim[i],
                                       float(args['actor_lr']), float(args['tau'])))
            if i < ave_n:
                # MADDPG adversary: centralized critic over the joint adversary action
                critics.append(CriticNetwork(sess, n, observation_dim[i], total_action_dim,
                                             float(args['critic_lr']), float(args['tau']),
                                             float(args['gamma'])))
            else:
                # DDPG good agent: critic sees only its own action
                critics.append(CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                             float(args['critic_lr']), float(args['tau']),
                                             float(args['gamma'])))
            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        # load trained actor weights
        for i in range(n):
            actors[i].mainModel.load_weights(args["modelFolder"] + str(i) + '_weights' + '.h5')

        # roll out 10 evaluation episodes
        for ep in range(10):
            s = env.reset()
            reward = 0.0
            for step in range(200):
                time.sleep(0.03)
                env.render()
                actions = []
                for i in range(env.n):
                    actions.append(actors[i].predict(
                        np.reshape(s[i], (-1, actors[i].mainModel.input_shape[1]))
                    ).reshape(actors[i].mainModel.output_shape[1],))
                s, r, d, s2 = env.step(actions)
                for i in range(env.n):
                    reward += r[i]
                if np.all(d):
                    break
            print("Episode: {:d} | Reward: {:f}".format(ep, reward))
        env.close()

    import sys
    sys.exit("test over!")
def main(args):
    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])

    # MADDPG for the adversary agents, DDPG for the good agents
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=False)) as sess:
        env = make_env.make_env('simple_tag')
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        # count adversary and good agents
        ave_n = 0
        good_n = 0
        for agent in env.agents:
            if agent.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)

        n = env.n
        actors = []
        critics = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # joint action dimension of the adversary agents
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n
        print("total_action_dim", total_action_dim)

        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            # assuming a discrete action space here -> otherwise use env.action_space[i].shape[0]
            action_dim.append(env.action_space[i].n)
            actors.append(ActorNetwork(sess, observation_dim[i], action_dim[i],
                                       float(args['actor_lr']), float(args['tau'])))
            if i < ave_n:
                # MADDPG adversary: centralized critic over the joint adversary action
                critics.append(CriticNetwork(sess, n, observation_dim[i], total_action_dim,
                                             float(args['critic_lr']), float(args['tau']),
                                             float(args['gamma'])))
            else:
                # DDPG good agent: critic sees only its own action
                critics.append(CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                             float(args['critic_lr']), float(args['tau']),
                                             float(args['gamma'])))
            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        train(sess, env, args, actors, critics, exploration_noise, ave_n)
def main(args):
    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])

    # MADDPG for the adversary agents, DDPG for the good agents
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.85)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=True)) as sess:
        env = make_env.make_env('simple_tag')
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        # count adversary and good agents
        ave_n = 0
        good_n = 0
        for agent in env.agents:
            if agent.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)

        n = env.n
        actors = []
        critics = []
        brains = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # joint action dimension of the adversary agents
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n
        print("total_action_dim", total_action_dim)

        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            # assuming a discrete action space here -> otherwise use env.action_space[i].shape[0]
            action_dim.append(env.action_space[i].n)
            actors.append(ActorNetwork(sess, observation_dim[i], action_dim[i],
                                       float(args['actor_lr']), float(args['tau'])))
            if i < ave_n:
                # MADDPG adversary: centralized critic over the joint adversary action
                critics.append(CriticNetwork(sess, n, observation_dim[i], total_action_dim,
                                             float(args['critic_lr']), float(args['tau']),
                                             float(args['gamma'])))
            else:
                # DDPG good agent: critic sees only its own action
                critics.append(CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                             float(args['critic_lr']), float(args['tau']),
                                             float(args['gamma'])))
            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        """
        print("Test predict")
        s = env.reset()
        actions = []
        for index in range(len(actors)):
            state_input = np.reshape(s[index], (-1, actors[index].state_dim))
            actions.append(actors[index].predict(state_input))
            actors[index].predict_target(state_input)
        actions1 = actions[:ave_n]
        actions2 = actions[ave_n:]
        a_temp1 = np.transpose(np.asarray(actions1), (1, 0, 2))
        a_for_critic1 = np.asarray([x.flatten() for x in a_temp1])
        a_temp2 = np.transpose(np.asarray(actions2), (1, 0, 2))
        a_for_critic2 = np.asarray([x.flatten() for x in a_temp2])
        for index in range(len(critics)):
            state_input = np.reshape(s[index], (-1, actors[index].state_dim))
            if index < ave_n:
                critics[index].predict_target(state_input, a_for_critic1)
            else:
                critics[index].predict_target(state_input, a_for_critic2)
        """

        # alternative: one combined actor-critic "Brain" per agent (unused)
        if False:
            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(env.action_space[i].n)
                brains.append(Brain(sess, observation_dim[i], action_dim[i],
                                    float(args['actor_lr']), float(args['tau']),
                                    observation_dim[i], total_action_dim,
                                    float(args['critic_lr']), float(args['tau']),
                                    float(args['gamma'])))
                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        if args["runTest"]:
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)
            for i in range(n):
                # load trained actor weights
                actors[i].mainModel.load_weights(args["modelFolder"] + str(i) + '_weights' + '.h5')
            import time
            for ep in range(10):
                s = env.reset()
                reward = 0.0
                for step in range(200):
                    time.sleep(0.01)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        actions.append(actors[i].predict(
                            np.reshape(s[i], (-1, actors[i].mainModel.input_shape[1]))
                        ).reshape(actors[i].mainModel.output_shape[1],))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d} | Reward: {:f}".format(ep, reward))
            env.close()
            import sys
            sys.exit("test over!")

        if False:
            # alternative evaluation loop (unused)
            import time
            for ep in range(10):
                s = env.reset()
                for j in range(env.n):
                    actors[j].mainModel.load_weights(args["modelFolder"] + str(j) + '_weights' + '.h5')
                for step in range(300):
                    reward = 0.0
                    env.render()
                    actions = []
                    for i in range(env.n):
                        actions.append(actors[i].predict(
                            np.reshape(s[i], (-1, actors[i].mainModel.input_shape[1]))
                        ).reshape(actors[i].mainModel.output_shape[1],))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d} | Reward: {:f}".format(ep, reward))
        else:
            if True:
                train(sess, env, args, actors, critics, exploration_noise, ave_n)
            else:
                global graph, global_queue, update_event, rolling_event, global_step_max, global_step, coord, brain
                graph = tf.get_default_graph()
                global_queue = queue.Queue()
                update_event, rolling_event = threading.Event(), threading.Event()
                global_step_max, global_step = 200 * 1000, 0
                coord = tf.train.Coordinator()
                brain = Brain(args["modelFolder"])
                distributed_train(sess, env, args, actors, critics, exploration_noise, ave_n)
def main(args):
    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])

    # MADDPG for the adversary agents, DDPG for the good agents
    with tf.Session() as sess:
        env = make_env.make_env('simple_tag')
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        # count adversary and good agents
        ave_n = 0
        good_n = 0
        for agent in env.agents:
            if agent.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)

        n = env.n
        actors = []
        critics = []
        brains = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # joint action dimension of the adversary agents
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n
        print("total_action_dim", total_action_dim)

        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            # assuming a discrete action space here -> otherwise use env.action_space[i].shape[0]
            action_dim.append(env.action_space[i].n)
            actors.append(ActorNetwork(sess, observation_dim[i], action_dim[i],
                                       float(args['actor_lr']), float(args['tau'])))
            if i < ave_n:
                # MADDPG adversary: centralized critic over the joint adversary action
                critics.append(CriticNetwork(sess, n, observation_dim[i], total_action_dim,
                                             float(args['critic_lr']), float(args['tau']),
                                             float(args['gamma'])))
            else:
                # DDPG good agent: critic sees only its own action
                critics.append(CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                             float(args['critic_lr']), float(args['tau']),
                                             float(args['gamma'])))
            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        # alternative: one combined actor-critic "Brain" per agent (unused)
        if False:
            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(env.action_space[i].n)
                brains.append(Brain(sess, observation_dim[i], action_dim[i],
                                    float(args['actor_lr']), float(args['tau']),
                                    observation_dim[i], total_action_dim,
                                    float(args['critic_lr']), float(args['tau']),
                                    float(args['gamma'])))
                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        if args["runTest"]:
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)
            for i in range(n):
                # load trained actor weights (checkpoint from episode 10000)
                actors[i].mainModel.load_weights(args["modelFolder"] + "ep10000/" + str(i) + '_weights' + '.h5')
            import time
            for ep in range(10):
                s = env.reset()
                reward = 0.0
                for step in range(200):
                    time.sleep(0.01)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        actions.append(actors[i].predict(
                            np.reshape(s[i], (-1, actors[i].mainModel.input_shape[1]))
                        ).reshape(actors[i].mainModel.output_shape[1],))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d} | Reward: {:f}".format(ep, reward))
            env.close()
            import sys
            sys.exit("test over!")

        if False:
            # alternative evaluation loop (unused)
            import time
            for ep in range(10):
                s = env.reset()
                for j in range(env.n):
                    actors[j].mainModel.load_weights(args["modelFolder"] + str(j) + '_weights' + '.h5')
                for step in range(300):
                    reward = 0.0
                    env.render()
                    actions = []
                    for i in range(env.n):
                        actions.append(actors[i].predict(
                            np.reshape(s[i], (-1, actors[i].mainModel.input_shape[1]))
                        ).reshape(actors[i].mainModel.output_shape[1],))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d} | Reward: {:f}".format(ep, reward))
        else:
            if False:
                train(sess, env, args, actors, critics, exploration_noise, ave_n)
            else:
                distributed_train(sess, env, args, actors, critics, exploration_noise, ave_n)
import tensorflow as tf
import numpy as np
import make_env
import gym
from keras.models import load_model
from ExplorationNoise import OrnsteinUhlenbeckActionNoise as OUNoise
import time

# load the trained actor models for the three agents
actors = []
actors.append(load_model('results/actor0/main16000.h5'))
actors.append(load_model('results/actor1/main16000.h5'))
actors.append(load_model('results/actor2/main16000.h5'))

env = make_env.make_env('simple_spread')
s = env.reset()

while True:
    a = []
    for i in range(env.n):
        actor = actors[i]
        noise = OUNoise(mu=np.zeros(5))
        a.append((actor.predict(np.reshape(s[i], (-1, actor.input_shape[1]))) + noise())
                 .reshape(actor.output_shape[1],))
    s2, r, done, _ = env.step(a)  # a is a list with one action array per agent
    env.render()
    s = s2
    if np.all(done):
        print("next episode")
        s = env.reset()
        time.sleep(0.2)
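The scripts above import OUNoise from ExplorationNoise, which is not shown in this section. For reference, a minimal sketch of a standard Ornstein-Uhlenbeck action-noise class is given below; the class name matches the import, but the internals and the default parameters (theta, sigma, dt) are assumptions and may differ from the repository's actual implementation.

# Minimal Ornstein-Uhlenbeck noise sketch (assumed defaults; the repository's
# ExplorationNoise module may use different parameters or reset behaviour).
import numpy as np

class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        # start from x0 if given, otherwise from zeros
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)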
def distributed_train(sess, env, args, actors, critics, noise, ave_n):
    worker_num = 4

    #########
    # Worker session and per-worker actor copies
    #########
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.05)
    worker_sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                                   log_device_placement=False))
    global workers
    workers = [[] for _ in range(worker_num)]
    for actor in actors:
        for worker in workers:
            worker.append(ActorNetwork(worker_sess, actor.state_dim, actor.action_dim,
                                       actor.lr, actor.tau))
    print(len(workers), len(workers[0]))

    global exploration_noise
    exploration_noise = []
    for actor in actors:
        exploration_noise.append(OUNoise(mu=np.zeros(actor.action_dim)))
        actor.update_target()
    for critic in critics:
        critic.update_target()

    pool = mp.Pool(processes=mp.cpu_count() - 1)
    replayMemory = ReplayMemory(int(args['buffer_size']), int(args['random_seed']))

    for timestep in range(int(args['max_episodes']) * int(args['max_episode_len'])):
        start = time.time()

        # collect experience from the workers in parallel and add it to the replay memory
        jobs = [pool.apply_async(work, args=(j,)) for j in range(len(workers))]
        for job in jobs:
            data = job.get()
            for item in data:
                (s, a, r, d, s2) = item
                replayMemory.add(s, a, r, d, s2)

        action_dims_done = 0

        # MADDPG update for the adversary agents (centralized critic)
        for i in range(ave_n):
            actor = actors[i]
            critic = critics[i]
            s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                int(args['minibatch_size']))

            # target actions of all adversary agents on the next states
            a = []
            for j in range(ave_n):
                # batch processing is much more efficient even though reshaping is needed
                state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                a.append(actors[j].predict_target(state_batch_j))
            a_temp = np.transpose(np.asarray(a), (1, 0, 2))
            a_for_critic = np.asarray([x.flatten() for x in a_temp])
            s2_batch_i = np.asarray([x for x in s2_batch[:, i]])

            targetQ = critic.predict_target(s2_batch_i, a_for_critic)
            yi = []
            for k in range(int(args['minibatch_size'])):
                if d_batch[:, i][k]:
                    yi.append(r_batch[:, i][k])
                else:
                    yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k])
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            critic.train(s_batch_i,
                         np.asarray([x.flatten() for x in a_batch[:, 0:ave_n, :]]),
                         np.asarray(yi))

            # policy gradient through the centralized critic
            actions_pred = []
            for j in range(ave_n):
                state_batch_j = np.asarray([x for x in s_batch[:, j]])
                actions_pred.append(actors[j].predict(state_batch_j))
            a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
            a_for_critic_pred = np.asarray([x.flatten() for x in a_temp])
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            grads = critic.action_gradients(s_batch_i, a_for_critic_pred)[
                :, action_dims_done:action_dims_done + actor.action_dim]
            actor.train(s_batch_i, grads)
            action_dims_done = action_dims_done + actor.action_dim

        # DDPG update for the good agents
        for i in range(ave_n, env.n):
            actor = actors[i]
            critic = critics[i]
            s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                int(args["minibatch_size"]))
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            s2_batch_i = np.asarray([x for x in s2_batch[:, i]])

            # target action on the next state for the Bellman target
            action = np.asarray(actor.predict_target(s2_batch_i))
            action_for_critic = np.asarray([x.flatten() for x in action])
            targetQ = critic.predict_target(s2_batch_i, action_for_critic)

            y_i = []
            for k in range(int(args['minibatch_size'])):
                # if the episode ended, use the reward only
                if d_batch[:, i][k]:
                    y_i.append(r_batch[:, i][k])
                else:
                    y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k])

            critic.train(s_batch_i,
                         np.asarray([x.flatten() for x in a_batch[:, i]]),
                         np.asarray(y_i))

            # policy gradient on the current states
            action_for_critic_pred = actor.predict(s_batch_i)
            gradients = critic.action_gradients(s_batch_i, action_for_critic_pred)[:, :]
            actor.train(s_batch_i, gradients)

        # soft-update all target networks
        for i in range(0, env.n):
            actors[i].update_target()
            critics[i].update_target()

        if timestep % int(args["max_episode_len"]) == 0:
            print("timestep: ", timestep)
            print("time: ", time.time() - start)
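The main, test, and training functions above all read their hyperparameters from an args mapping. A minimal sketch of how such a mapping might be assembled and passed to one of the main(args) entry points is shown below; the keys mirror those used in the code above, but the concrete values and the plain-dict wiring are assumed placeholders, not the repository's actual defaults or argparse setup.

# Hypothetical entry point: keys match those referenced above, values are assumed.
if __name__ == '__main__':
    args = {
        'actor_lr': 1e-4,          # actor learning rate
        'critic_lr': 1e-3,         # critic learning rate
        'gamma': 0.95,             # discount factor
        'tau': 0.01,               # soft target-update rate
        'buffer_size': 1000000,    # replay memory capacity
        'minibatch_size': 1024,    # replay minibatch size
        'max_episodes': 10000,
        'max_episode_len': 100,
        'random_seed': 1234,
        'modelFolder': 'results/models/',
        'summary_dir': 'results/summaries/',
        'runTest': False,
    }
    main(args)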