class SampleA(object):
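    # Toy worker wrapper: stores a worker id and builds its own ActorNetwork on the shared session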
    def __init__(self, i, sess):
        self.wid = i
        self.model = ActorNetwork(sess, 1, 2, 0.1, 0.9)

    def out(self, i):
        print(self.wid)
        self.model.update_target()
Example #2
def main(args):

    with tf.Session() as sess:
        env  = make_env.make_env('simple_tag')
        n = env.n
        actors = []
        critics = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0
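        # Joint action dimension over all agents: the centralized critics take every agent's action as input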
        for i in range(n):
            total_action_dim = total_action_dim + env.action_space[i].n
        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(env.action_space[i].n) # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(
                ActorNetwork(sess, observation_dim[i], action_dim[i],
                             float(args['actor_lr']), float(args['tau'])))
            critics.append(
                CriticNetwork(sess, n, observation_dim[i], total_action_dim,
                              float(args['actor_lr']), float(args['tau']),
                              float(args['gamma'])))
            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        #if args['use_gym_monitor']:
        #    if not args['render_env']:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], video_callable=False, force=True)
        #    else:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], force=True)

        train(sess,env,args,actors,critics,exploration_noise)
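
Each of these main(args) examples indexes args like a dictionary and casts the values itself, so a plain argparse wrapper is enough to drive them. A minimal sketch, assuming the hyper-parameter keys read by the examples (the default values here are illustrative assumptions, not the originals):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='MADDPG/DDPG runner (sketch)')
    parser.add_argument('--actor-lr', dest='actor_lr', default=1e-4)
    parser.add_argument('--critic-lr', dest='critic_lr', default=1e-3)
    parser.add_argument('--tau', default=0.01)
    parser.add_argument('--gamma', default=0.95)
    parser.add_argument('--random-seed', dest='random_seed', default=1234)
    parser.add_argument('--buffer-size', dest='buffer_size', default=1000000)
    parser.add_argument('--minibatch-size', dest='minibatch_size', default=64)
    parser.add_argument('--max-episodes', dest='max_episodes', default=10000)
    parser.add_argument('--max-episode-len', dest='max_episode_len', default=100)
    args = vars(parser.parse_args())  # main() expects dict-style access, e.g. args['actor_lr']
    main(args)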
Example #3
def main(args):

    with tf.Session() as sess:
        env = gym.make('MountainCarContinuous-v0')
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        n = 1
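        # Single continuous-control agent, so one actor/critic/noise triple is enough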
        actors = []
        critics = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0
        """       
        for i in range(n):
            total_action_dim = total_action_dim + env.action_space[i].n
        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(env.action_space[i].n) # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(ActorNetwork(sess,observation_dim[i],action_dim[i],float(args['actor_lr']),float(args['tau'])))
            critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['actor_lr']),float(args['tau']),float(args['gamma'])))
            exploration_noise.append(OUNoise(mu = np.zeros(action_dim[i])))
        """
        actors.append(
            ActorNetwork(sess, env.observation_space.shape[0],
                         env.action_space.shape[0], float(args['actor_lr']),
                         float(args['tau']), env.action_space.high))
        critics.append(
            CriticNetwork(sess, 1, env.observation_space.shape[0],
                          env.action_space.shape[0], float(args['actor_lr']),
                          float(args['tau']), float(args['gamma'])))
        exploration_noise.append(
            OUNoise(mu=np.zeros(env.action_space.shape[0])))
        #if args['use_gym_monitor']:
        #    if not args['render_env']:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], video_callable=False, force=True)
        #    else:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], force=True)

        train(sess, env, args, actors[0], critics[0], exploration_noise[0])
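
The examples build exploration noise as OUNoise(mu=np.zeros(action_dim)) and sample it by calling the object (see the commented-out exploration_noise[i]() further down). A typical Ornstein-Uhlenbeck process compatible with that interface looks like the sketch below; the theta/sigma/dt values are assumptions, not necessarily those of the original OUNoise class:

import numpy as np

class OUNoiseSketch(object):
    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.reset()

    def __call__(self):
        # Mean-reverting random walk: drift toward mu plus Gaussian noise
        self.x_prev = (self.x_prev
                       + self.theta * (self.mu - self.x_prev) * self.dt
                       + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        return self.x_prev

    def reset(self):
        self.x_prev = np.zeros_like(self.mu)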
Example #4
def main(args):

    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])

    #with tf.device("/gpu:0"):
    # MADDPG for the adversary ('ave') agents
    # DDPG for the good agents
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.15)
    config = tf.ConfigProto(device_count={'CPU': 0})
    # config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=False)) as sess:
        # with tf.Session(config=config) as sess:

        env = make_env.make_env('simple_tag')

        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        #with tf.device('/cpu:0'):
        #if args["runTest"]:
        #run()
        #import sys
        #sys.exit("test over!")

        # Count the adversary ('ave') and good agents
        ave_n = 0
        good_n = 0
        for i in env.agents:
            if i.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)
        # print("ave_n", ave_n)
        n = env.n
        actors = []
        critics = []
        brains = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # Adversary agents' joint action dimension
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n

        print("total_action_dim", total_action_dim)

        for i in range(n):

            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(
                env.action_space[i].n
            )  # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(
                ActorNetwork(sess, observation_dim[i], action_dim[i],
                             float(args['actor_lr']), float(args['tau'])))
            critics.append(
                CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                              float(args['critic_lr']), float(args['tau']),
                              float(args['gamma'])))
            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        train(sess, env, args, actors, critics, exploration_noise, ave_n)
Example #5
def main(args):

    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])

    #with tf.device("/gpu:0"):
    # MADDPG for the adversary ('ave') agents
    # DDPG for the good agents
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.85)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=True)) as sess:

        env = make_env.make_env('simple_tag')

        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        #with tf.device('/cpu:0'):
        #if args["runTest"]:
        #run()
        #import sys
        #sys.exit("test over!")

        # Count the adversary ('ave') and good agents
        ave_n = 0
        good_n = 0
        for i in env.agents:
            if i.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)
        # print("ave_n", ave_n)
        n = env.n
        actors = []
        critics = []
        brains = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # Adversary agents' joint action dimension
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n

        print("total_action_dim", total_action_dim)

        for i in range(n):

            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(
                env.action_space[i].n
            )  # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(
                ActorNetwork(sess, observation_dim[i], action_dim[i],
                             float(args['actor_lr']), float(args['tau'])))
            # critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['critic_lr']),float(args['tau']),float(args['gamma'])))

            if i < ave_n:
                #MADDPG - centralized Critic
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i],
                                  total_action_dim, float(args['critic_lr']),
                                  float(args['tau']), float(args['gamma'])))
            else:
                # DDPG
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                  float(args['critic_lr']), float(args['tau']),
                                  float(args['gamma'])))

            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))
        """
        print("Test predict")
        s = env.reset()
        # print(s[0])
        actions = []
        for index in range(len(actors)):
            state_input = np.reshape(s[index],(-1,actors[index].state_dim))
            
            actions.append(actors[index].predict(state_input))

            actors[index].predict_target(state_input)


        actions1 = actions[:ave_n]
        actions2 = actions[ave_n:]
        a_temp1 = np.transpose(np.asarray(actions1),(1,0,2))
        a_for_critic1 = np.asarray([x.flatten() for x in a_temp1])
        a_temp2 = np.transpose(np.asarray(actions2),(1,0,2))
        a_for_critic2 = np.asarray([x.flatten() for x in a_temp2])
        for index in range(len(critics)):
            state_input = np.reshape(s[index],(-1,actors[index].state_dim))
            if index < ave_n:
                critics[index].predict_target(state_input, a_for_critic1)
                #critics[index].predict(state_input, a_for_critic1)
            else:
                critics[index].predict_target(state_input, a_for_critic2)
                #critics[index].predict(state_input, a_for_critic2)
        """

        # if args['use_gym_monitor']:
        #    if not args['render_env']:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], video_callable=False, force=True)
        #    else:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], force=True)

        # n brains
        if False:
            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(env.action_space[i].n)
                brains.append(Brain(sess, observation_dim[i], action_dim[i], float(args['actor_lr']), float(args['tau']), \
                                   observation_dim[i], total_action_dim, float(args['critic_lr']), float(args['tau']),float(args['gamma'])))
                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

            # learn()

        if args["runTest"]:

            # , force=True
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)

            for i in range(n):
                # load model
                actors[i].mainModel.load_weights(args["modelFolder"] + str(i) +
                                                 '_weights' + '.h5')
                # episode 4754
            import time
            #   time.sleep(3)
            for ep in range(10):
                s = env.reset()
                reward = 0.0
                for step in range(200):

                    time.sleep(0.01)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        state_input = np.reshape(
                            s[i], (-1, env.observation_space[i].shape[0]))
                        noise = OUNoise(mu=np.zeros(5))
                        # predict_action = actors[i].predict(state_input) #+ exploration_noise[i]()
                        # actions.append(predict_action.reshape(env.action_space[i].n,))
                        # +noise()
                        actions.append(
                            (actors[i].predict(
                                np.reshape(
                                    s[i],
                                    (-1, actors[i].mainModel.input_shape[1])))
                             ).reshape(actors[i].mainModel.output_shape[1], ))
                    #print("{}".format(actions))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d}  | Reward: {:f}".format(ep, reward))
            env.close()
            import sys
            sys.exit("test over!")

        if False:
            import time
            # , force=True
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)
            for ep in range(10):
                # load model
                s = env.reset()
                for j in range(env.n):
                    actors[j].mainModel.load_weights(args["modelFolder"] +
                                                     str(j) + '_weights' +
                                                     '.h5')
                for step in range(300):

                    reward = 0.0
                    # time.sleep(0.05)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        state_input = np.reshape(
                            s[i], (-1, env.observation_space[i].shape[0]))
                        noise = OUNoise(mu=np.zeros(5))
                        # predict_action = actors[i].predict(state_input) #+ exploration_noise[i]()
                        # actions.append(predict_action.reshape(env.action_space[i].n,))
                        # +noise()
                        actions.append(
                            (actors[i].predict(
                                np.reshape(
                                    s[i],
                                    (-1, actors[i].mainModel.input_shape[1])))
                             ).reshape(actors[i].mainModel.output_shape[1], ))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d}  | Reward: {:f}".format(ep, reward))

        else:
            if True:
                train(sess, env, args, actors, critics, exploration_noise,
                      ave_n)
            else:
                global graph, global_queue, update_event, rolling_event, global_step_max, global_step, coord, brain
                graph = tf.get_default_graph()
                global_queue = queue.Queue()
                update_event, rolling_event = threading.Event(), threading.Event()
                global_step_max, global_step = 200 * 1000, 0
                coord = tf.train.Coordinator()
                brain = Brain(args["modelFolder"])

                distributed_train(sess, env, args, actors, critics,
                                  exploration_noise, ave_n)
Example #6
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
# set_session(tf.Session(config=config))

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

gpu_options = tf.GPUOptions(allow_growth=True,
                            per_process_gpu_memory_fraction=0.5)

sess = tf.Session(
    config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))

model2 = ActorNetwork(
    tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                     allow_soft_placement=True,
                                     log_device_placement=True)), 3, 4, 0.1,
    0.88)


class Worker():
    def __init__(self):
        self.output = "tensorflow"

    def train(self, weights):
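        # Refresh model2's target network, then overwrite its main-network weights with the ones passed in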
        print("training")
        #with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True, log_device_placement=True)) as sess:

        model2.update_target()
        model2.mainModel.set_weights(weights)
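

distributed_train below dispatches pool jobs to a module-level work(j) function that is not shown in this snippet; each job is expected to return an iterable of (s, a, r, d, s2) transitions (see the job.get() loop). A minimal sketch of what such a function could look like, assuming one environment per worker process and the global `workers` list built in distributed_train, and ignoring how TF sessions are shared across processes; the rollout length of 25 is an arbitrary placeholder:

def work(j):
    env = make_env.make_env('simple_tag')
    transitions = []
    s = env.reset()
    for _ in range(25):
        # One action per agent, produced by this worker's copy of each actor
        actions = [workers[j][i].predict(
                       np.reshape(s[i], (-1, workers[j][i].state_dim))).flatten()
                   for i in range(env.n)]
        s2, r, d, _ = env.step(actions)
        transitions.append((s, actions, r, d, s2))
        s = s2
        if np.all(d):
            s = env.reset()
    return transitions
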
def distributed_train(sess, env, args, actors, critics, noise, ave_n):

    worker_num = 4
    #########
    # Worker session
    #
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.05)
    worker_sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                                   log_device_placement=False))

    global workers

    workers = [[] for i in range(worker_num)]
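    # One full copy of every actor per worker, built on the separate worker session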
    for actor in actors:
        for worker in workers:
            worker.append(
                ActorNetwork(worker_sess, actor.state_dim, actor.action_dim,
                             actor.lr, actor.tau))
    #######################
    print(len(workers), len(workers[0]))

    global exploration_noise
    exploration_noise = []

    for actor in actors:
        exploration_noise.append(OUNoise(mu=np.zeros(actor.action_dim)))
        actor.update_target()
    for critic in critics:
        critic.update_target()

    pool = mp.Pool(processes=mp.cpu_count() - 1)
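    # pool: one process per CPU core (minus one) to run the rollout workers in parallel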

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    # Per-agent reward accumulator (updated near the end of the training loop)
    episode_reward = np.zeros(env.n)

    for timestep in range(int(args['max_episodes']) * int(args['max_episode_len'])):

        start = time.time()

        # print(workers[0].work())
        # jobs = [pool.apply_async(sample.out, ()) for sample in samples]

        jobs = [
            pool.apply_async(work, args=(j, )) for j in range(len(workers))
        ]

        # res = pool.map(samples[0].out, [1,2,3])
        #time.sleep(10)

        for job in jobs:
            data = job.get()
            for item in data:
                (s, a, r, d, s2) = item
                # print(item)
                replayMemory.add(s, a, r, d, s2)

        time.sleep(10)
        #losses = []
        action_dims_done = 0
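        # Offset of agent i's actions inside the joint action vector; used to slice this actor's gradient below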

        # MADDPG Adversary Agent
        for i in range(ave_n):
            actor = actors[i]
            critic = critics[i]

            s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                int(args['minibatch_size']))
            a = []
            for j in range(ave_n):
                state_batch_j = np.asarray(
                    [x for x in s_batch[:, j]]
                )  #batch processing will be much more efficient even though reshaping will have to be done
                a.append(actors[j].predict_target(state_batch_j))

            a_temp = np.transpose(np.asarray(a), (1, 0, 2))

            a_for_critic = np.asarray([x.flatten() for x in a_temp])
            s2_batch_i = np.asarray([
                x for x in s2_batch[:, i]
            ])  # Checked till this point, should be fine.

            targetQ = critic.predict_target(
                s2_batch_i, a_for_critic)  # Should  work, probably
            yi = []
            for k in range(int(args['minibatch_size'])):
                if d_batch[:, i][k]:
                    yi.append(r_batch[:, i][k])
                else:
                    yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k])
            s_batch_i = np.asarray([x for x in s_batch[:, i]])

            critic.train(
                s_batch_i,
                np.asarray([x.flatten() for x in a_batch[:, 0:ave_n, :]]),
                np.asarray(yi))
            #losses.append(loss)

            actions_pred = []
            for j in range(ave_n):
                state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                actions_pred.append(actors[j].predict(
                    state_batch_j))  # Should work till here, roughly, probably
            a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
            a_for_critic_pred = np.asarray([x.flatten() for x in a_temp])
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            grads = critic.action_gradients(
                s_batch_i,
                a_for_critic_pred)[:, action_dims_done:action_dims_done +
                                   actor.action_dim]
            actor.train(s_batch_i, grads)
            action_dims_done = action_dims_done + actor.action_dim
        # Only DDPG agent

        for i in range(ave_n, env.n):
            actor = actors[i]
            critic = critics[i]
            s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                int(args["minibatch_size"]))
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            action = np.asarray(actor.predict_target(s_batch_i))

            action_for_critic = np.asarray([x.flatten() for x in action])
            s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
            targetQ = critic.predict_target(s2_batch_i, action_for_critic)
            y_i = []
            for k in range(int(args['minibatch_size'])):
                # If ep is end
                if d_batch[:, i][k]:
                    y_i.append(r_batch[:, i][k])
                else:
                    y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k])
            # state batch for agent i
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            critic.train(s_batch_i,
                         np.asarray([x.flatten() for x in a_batch[:, i]]),
                         np.asarray(y_i))
            #losses.append(loss)
            action_for_critic_pred = actor.predict(s2_batch_i)
            gradients = critic.action_gradients(s_batch_i,
                                                action_for_critic_pred)[:, :]
            actor.train(s_batch_i, gradients)

        for i in range(0, env.n):
            actor = actors[i]
            critic = critics[i]
            actor.update_target()
            critic.update_target()

        episode_reward += r

        if timestep % int(args["max_episode_len"]) == 0:
            print("timestep: ", timestep)
            print("time: ", time.time() - start)
            # showReward(episode_reward, env.n, ep, start)
        """
 def __init__(self, i, sess):
     self.wid = i
     self.model = ActorNetwork(sess, 1, 2, 0.1, 0.9)
Example #9
def main(args):

    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])

    #with tf.device("/gpu:0"):
    # MADDPG for the adversary ('ave') agents
    # DDPG for the good agents
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
    config = tf.ConfigProto(device_count={'CPU': 0})
    # config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=False)) as sess:
        # with tf.Session(config=config) as sess:

        env = make_env.make_env('simple_tag')

        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        #with tf.device('/cpu:0'):
        #if args["runTest"]:
        #run()
        #import sys
        #sys.exit("test over!")

        # Count the adversary ('ave') and good agents
        ave_n = 0
        good_n = 0
        for i in env.agents:
            if i.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)
        # print("ave_n", ave_n)
        n = env.n
        actors = []
        critics = []
        brains = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # Adversary agents' joint action dimension
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n

        print("total_action_dim", total_action_dim)

        for i in range(n):

            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(
                env.action_space[i].n
            )  # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(
                ActorNetwork(sess, observation_dim[i], action_dim[i],
                             float(args['actor_lr']), float(args['tau'])))
            # critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['critic_lr']),float(args['tau']),float(args['gamma'])))

            if i < ave_n:
                # MADDPG - centralized Critic
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i],
                                  total_action_dim, float(args['critic_lr']),
                                  float(args['tau']), float(args['gamma'])))
            else:
                # DDPG
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                  float(args['critic_lr']), float(args['tau']),
                                  float(args['gamma'])))

            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        # n brains
        if False:
            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(env.action_space[i].n)
                brains.append(Brain(sess, observation_dim[i], action_dim[i], float(args['actor_lr']), float(args['tau']), \
                                   observation_dim[i], total_action_dim, float(args['critic_lr']), float(args['tau']),float(args['gamma'])))
                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

            # learn()

        if args["runTest"]:

            for i in range(n):
                # load model
                # + "../../good_weights/actor"
                actors[i].mainModel.load_weights(args["modelFolder"] + str(i) +
                                                 '_weights' + '.h5')
                # episode 4754
            import time
            #   time.sleep(3)
            for ep in range(10):
                s = env.reset()
                reward = 0.0
                for step in range(200):

                    time.sleep(0.01)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        state_input = np.reshape(
                            s[i], (-1, env.observation_space[i].shape[0]))
                        noise = OUNoise(mu=np.zeros(5))
                        # predict_action = actors[i].predict(state_input) #+ exploration_noise[i]()
                        # actions.append(predict_action.reshape(env.action_space[i].n,))
                        # +noise()
                        actions.append(
                            (actors[i].predict(
                                np.reshape(
                                    s[i],
                                    (-1, actors[i].mainModel.input_shape[1])))
                             ).reshape(actors[i].mainModel.output_shape[1], ))
                    #print("{}".format(actions))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d}  | Reward: {:f}".format(ep, reward))
            env.close()
            import sys
            sys.exit("test over!")

        if False:
            import time
            # , force=True
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)
            for ep in range(10):
                # load model
                s = env.reset()
                for j in range(env.n):
                    actors[j].mainModel.load_weights(args["modelFolder"] +
                                                     str(j) + '_weights' +
                                                     '.h5')
                for step in range(300):

                    reward = 0.0
                    # time.sleep(0.05)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        state_input = np.reshape(
                            s[i], (-1, env.observation_space[i].shape[0]))
                        noise = OUNoise(mu=np.zeros(5))
                        # predict_action = actors[i].predict(state_input) #+ exploration_noise[i]()
                        # actions.append(predict_action.reshape(env.action_space[i].n,))
                        # +noise()
                        actions.append(
                            (actors[i].predict(
                                np.reshape(
                                    s[i],
                                    (-1, actors[i].mainModel.input_shape[1])))
                             ).reshape(actors[i].mainModel.output_shape[1], ))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d}  | Reward: {:f}".format(ep, reward))

        else:
            if False:
                train(sess, env, args, actors, critics, exploration_noise,
                      ave_n)
            else:
                distributed_train(sess, env, args, actors, critics,
                                  exploration_noise, ave_n)