Example #1
def run():
    env  = make_env.make_env('simple_tag')
    n = env.n
    exploration_noise = []
    actors = []
    for i in range(n):
        # load model
        actors.append(load_model(args["modelFolder"] + str(i) + ".h5"))
        
        exploration_noise.append(OUNoise(mu = np.zeros(env.action_space[i].n)))

    # test for 50 episodes, 100 steps each
    noise = OUNoise(mu = np.zeros(5))
    import time
    for ep in range(50):
        s = env.reset()
        #if ep == 0:
            #print([i.state.p_pos for i in env.world.borders])
        reward = 0.0
        for step in range(100):
            # time.sleep(0.05)
            env.render()
            actions = []
            for i in range(env.n):
                state_input = np.reshape(s[i],(-1,env.observation_space[i].shape[0]))
                predict_action = actors[i].predict(state_input) #+ noise()
                actions.append(predict_action.reshape(env.action_space[i].n,))
            s, r, d, s2 = env.step(actions)
            for i in range(env.n):
                reward += r[i]
            if np.all(d):
                break

        print("Episode: {:d}  | Reward: {:f}".format(ep, reward))
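
All of these examples draw exploration noise from an OUNoise class imported from an ExplorationNoise module (Example #9 imports it as OrnsteinUhlenbeckActionNoise). A minimal sketch of a compatible, callable class is given below; the theta/sigma/dt defaults are assumptions, not the repository's actual settings.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process; callable like the noise objects above."""

    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.copy(mu)

    def reset(self):
        self.x = np.copy(self.mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        self.x = self.x + dx
        return self.x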
Example #2
def main(args):

    with tf.Session() as sess:
        env  = make_env.make_env('simple_tag')
        n = env.n
        actors = []
        critics = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0
        for i in range(n):
            total_action_dim = total_action_dim + env.action_space[i].n
        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(env.action_space[i].n) # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(ActorNetwork(sess,observation_dim[i],action_dim[i],float(args['actor_lr']),float(args['tau'])))
            critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['actor_lr']),float(args['tau']),float(args['gamma'])))
            exploration_noise.append(OUNoise(mu = np.zeros(action_dim[i])))

        #if args['use_gym_monitor']:
        #    if not args['render_env']:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], video_callable=False, force=True)
        #    else:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], force=True)

        train(sess,env,args,actors,critics,exploration_noise)
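
Every main() and test() in these examples indexes into an args dictionary (args['actor_lr'], args['tau'], args['modelFolder'], ...). A hedged sketch of how such a dictionary could be built with argparse follows; the keys mirror the lookups in these snippets, but the default values are placeholders rather than the original project's settings.

import argparse

def parse_args():
    # Keys mirror the args[...] lookups in the examples; defaults are placeholders.
    p = argparse.ArgumentParser(description='MADDPG/DDPG on simple_tag')
    p.add_argument('--actor_lr', type=float, default=1e-4)
    p.add_argument('--critic_lr', type=float, default=1e-3)
    p.add_argument('--gamma', type=float, default=0.99)
    p.add_argument('--tau', type=float, default=0.01)
    p.add_argument('--buffer_size', type=int, default=1000000)
    p.add_argument('--minibatch_size', type=int, default=64)
    p.add_argument('--random_seed', type=int, default=1234)
    p.add_argument('--max_episodes', type=int, default=10000)
    p.add_argument('--max_episode_len', type=int, default=100)
    p.add_argument('--modelFolder', default='results/model/')
    p.add_argument('--summary_dir', default='results/summary/')
    p.add_argument('--monitor_dir', default='results/monitor/')
    p.add_argument('--runTest', action='store_true')
    return vars(p.parse_args())

# usage: main(parse_args())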
Example #3
def main(args):

    with tf.Session() as sess:
        env = gym.make('MountainCarContinuous-v0')
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        n = 1
        actors = []
        critics = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0
        """       
        for i in range(n):
            total_action_dim = total_action_dim + env.action_space[i].n
        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(env.action_space[i].n) # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(ActorNetwork(sess,observation_dim[i],action_dim[i],float(args['actor_lr']),float(args['tau'])))
            critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['actor_lr']),float(args['tau']),float(args['gamma'])))
            exploration_noise.append(OUNoise(mu = np.zeros(action_dim[i])))
        """
        actors.append(
            ActorNetwork(sess, env.observation_space.shape[0],
                         env.action_space.shape[0], float(args['actor_lr']),
                         float(args['tau']), env.action_space.high))
        critics.append(
            CriticNetwork(sess, 1, env.observation_space.shape[0],
                          env.action_space.shape[0], float(args['actor_lr']),
                          float(args['tau']), float(args['gamma'])))
        exploration_noise.append(
            OUNoise(mu=np.zeros(env.action_space.shape[0])))
        #if args['use_gym_monitor']:
        #    if not args['render_env']:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], video_callable=False, force=True)
        #    else:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], force=True)

        train(sess, env, args, actors[0], critics[0], exploration_noise[0])
Example #4
def main(args):
    # Master
    if rank == 0:
        #######################
        # Setting up:
        # - environment, random seed
        # - tensorflow option
        # - network
        # - replay
        #########################
        if not os.path.exists(args["modelFolder"]):
            os.makedirs(args["modelFolder"])
        if not os.path.exists(args["summary_dir"]):
            os.makedirs(args["summary_dir"])
        # env and random seed
        env = make_env.make_env('simple_tag')
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        # tensorflow
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.35)
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options, log_device_placement=False)) as sess:
            # agent number
            n = env.n
            ave_n = 0
            good_n = 0
            for i in env.agents:
                if i.adversary:
                    ave_n += 1
                else:
                    good_n += 1
            # Actor Critic
            n = env.n
            actors = []
            critics = []
            exploration_noise = []
            observation_dim = []
            action_dim = []
            total_action_dim = 0

            # Adversary agents' action spaces
            for i in range(ave_n):
                total_action_dim = total_action_dim + env.action_space[i].n
            # print("total_action_dim {} for cooperative agents".format(total_action_dim))
            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(
                    env.action_space[i].n
                )  # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
                actors.append(
                    ActorNetwork(sess, observation_dim[i], action_dim[i],
                                 float(args['actor_lr']), float(args['tau'])))
                # critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['critic_lr']),float(args['tau']),float(args['gamma'])))
                if i < ave_n:
                    # MADDPG - centralized Critic
                    critics.append(
                        CriticNetwork(sess, n,
                                      observation_dim[i], total_action_dim,
                                      float(args['critic_lr']),
                                      float(args['tau']),
                                      float(args['gamma'])))
                else:
                    # DDPG
                    critics.append(
                        CriticNetwork(sess, n, observation_dim[i],
                                      action_dim[i], float(args['critic_lr']),
                                      float(args['tau']),
                                      float(args['gamma'])))

                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

            distributed_train_every_step(sess, env, args, actors, critics,
                                         exploration_noise, ave_n)
    # Worker
    else:
        #######################
        # Setting up:
        # - tensorflow option
        # - network
        #
        #
        env = make_env.make_env('simple_tag')
        np.random.seed(int(args['random_seed']) + rank)
        tf.set_random_seed(int(args['random_seed']) + rank)
        env.seed(int(args['random_seed']) + rank)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.08)
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options, log_device_placement=False)) as sess:
            # agent number
            n = env.n
            ave_n = 0
            good_n = 0
            for i in env.agents:
                if i.adversary:
                    ave_n += 1
                else:
                    good_n += 1
            # Actor Critic
            n = env.n
            actors = []
            exploration_noise = []
            observation_dim = []
            action_dim = []

            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(env.action_space[i].n)
                actors.append(
                    ActorNetwork(sess, observation_dim[i], action_dim[i],
                                 float(args['actor_lr']), float(args['tau'])))
                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))
            collect_batch(env, args, actors, exploration_noise, ave_n)
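
Example #4 branches on a global rank that does not appear in the snippet. One common way to obtain it is via MPI; the mpi4py setup below is an assumption for illustration, since the original source may define rank differently.

# Hypothetical setup for the global `rank` used in Example #4.
# Launched with something like: mpirun -np 5 python main.py
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()   # rank 0 is the learner/master, ranks > 0 collect rollouts
size = comm.Get_size()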
Example #5
def test(args):
    # env and random seed
    env = make_env.make_env('simple_tag')
    np.random.seed(int(args['random_seed']))
    tf.set_random_seed(int(args['random_seed']))
    # env.seed(int(args['random_seed']))
    # tensorflow
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    # config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)
    with tf.Session() as sess:
        # agent number
        n = env.n
        ave_n = 0
        good_n = 0
        for i in env.agents:
            if i.adversary:
                ave_n += 1
            else:
                good_n += 1
        # Actor Critic
        n = env.n
        actors = []
        critics = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n
        for i in range(n):
            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(
                env.action_space[i].n
            )  # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(
                ActorNetwork(sess, observation_dim[i], action_dim[i],
                             float(args['actor_lr']), float(args['tau'])))
            if i < ave_n:
                # MADDPG - centralized Critic
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i],
                                  total_action_dim, float(args['critic_lr']),
                                  float(args['tau']), float(args['gamma'])))
            else:
                # DDPG
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                  float(args['critic_lr']), float(args['tau']),
                                  float(args['gamma'])))
            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))
        for i in range(n):
            actors[i].mainModel.load_weights(args["modelFolder"] + str(i) +
                                             '_weights' + '.h5')
        for ep in range(10):
            s = env.reset()
            reward = 0.0
            for step in range(200):
                time.sleep(0.03)
                env.render()
                actions = []
                for i in range(env.n):
                    state_input = np.reshape(
                        s[i], (-1, env.observation_space[i].shape[0]))
                    noise = OUNoise(mu=np.zeros(5))
                    actions.append((actors[i].predict(
                        np.reshape(
                            s[i],
                            (-1,
                             actors[i].mainModel.input_shape[1])))).reshape(
                                 actors[i].mainModel.output_shape[1], ))
                s, r, d, s2 = env.step(actions)
                for i in range(env.n):
                    reward += r[i]
                if np.all(d):
                    break
            print("Episode: {:d}  | Reward: {:f}".format(ep, reward))
        env.close()
        import sys
        sys.exit("test over!")
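
The total_action_dim computed above sizes the joint-action input of the centralized (MADDPG) critics, while the DDPG critics only see their own action. The short sketch below shows how a batch of per-adversary actions is stacked and flattened into that joint shape, following the transpose-and-flatten pattern used in distributed_train (Example #9); the dimensions are illustrative only.

import numpy as np

batch_size = 32
adversary_action_dims = [5, 5, 5]   # illustrative stand-in for action_dim[:ave_n]
actions = [np.random.rand(batch_size, d) for d in adversary_action_dims]

a_temp = np.transpose(np.asarray(actions), (1, 0, 2))       # (batch, ave_n, dim)
a_for_critic = np.asarray([x.flatten() for x in a_temp])    # (batch, total_action_dim)
assert a_for_critic.shape == (batch_size, sum(adversary_action_dims))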
Example #6
def main(args):
    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])

    #with tf.device("/gpu:0"):
    # MADDPG for adversary ("ave") agents
    # DDPG for good agents
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    config = tf.ConfigProto(device_count={'CPU': 0})
    # config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=False)) as sess:
        env = make_env.make_env('simple_tag')

        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))
        ave_n = 0
        good_n = 0
        for i in env.agents:
            if i.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)
        # print("ave_n", ave_n)
        n = env.n
        actors = []
        critics = []
        brains = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # Adversary agents' action spaces
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n

        print("total_action_dim", total_action_dim)

        for i in range(n):

            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(
                env.action_space[i].n
            )  # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(
                ActorNetwork(sess, observation_dim[i], action_dim[i],
                             float(args['actor_lr']), float(args['tau'])))
            # critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['critic_lr']),float(args['tau']),float(args['gamma'])))

            if i < ave_n:
                # MADDPG - centralized Critic
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i],
                                  total_action_dim, float(args['critic_lr']),
                                  float(args['tau']), float(args['gamma'])))
            else:
                # DDPG
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                  float(args['critic_lr']), float(args['tau']),
                                  float(args['gamma'])))

            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

        train(sess, env, args, actors, critics, exploration_noise, ave_n)
Example #7
def main(args):

    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])

    #with tf.device("/gpu:0"):
    # MADDPG for adversary ("ave") agents
    # DDPG for good agents
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.85)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=True)) as sess:

        env = make_env.make_env('simple_tag')

        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        #with tf.device('/cpu:0'):
        #if args["runTest"]:
        #run()
        #import sys
        #sys.exit("test over!")

        # Count adversary ("ave") and good agents
        ave_n = 0
        good_n = 0
        for i in env.agents:
            if i.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)
        # print("ave_n", ave_n)
        n = env.n
        actors = []
        critics = []
        brains = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # Adversary agents' action spaces
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n

        print("total_action_dim", total_action_dim)

        for i in range(n):

            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(
                env.action_space[i].n
            )  # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(
                ActorNetwork(sess, observation_dim[i], action_dim[i],
                             float(args['actor_lr']), float(args['tau'])))
            # critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['critic_lr']),float(args['tau']),float(args['gamma'])))

            if i < ave_n:
                #MADDPG - centralized Critic
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i],
                                  total_action_dim, float(args['critic_lr']),
                                  float(args['tau']), float(args['gamma'])))
            else:
                # DDPG
                critics.append(
                    CriticNetwork(sess, n, observation_dim[i], action_dim[i],
                                  float(args['critic_lr']), float(args['tau']),
                                  float(args['gamma'])))

            exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))
        """
        print("Test predict")
        s = env.reset()
        # print(s[0])
        actions = []
        for index in range(len(actors)):
            state_input = np.reshape(s[index],(-1,actors[index].state_dim))
            
            actions.append(actors[index].predict(state_input))

            actors[index].predict_target(state_input)


        actions1 = actions[:ave_n]
        actions2 = actions[ave_n:]
        a_temp1 = np.transpose(np.asarray(actions1),(1,0,2))
        a_for_critic1 = np.asarray([x.flatten() for x in a_temp1])
        a_temp2 = np.transpose(np.asarray(actions2),(1,0,2))
        a_for_critic2 = np.asarray([x.flatten() for x in a_temp2])
        for index in range(len(critics)):
            state_input = np.reshape(s[index],(-1,actors[index].state_dim))
            if index < ave_n:
                critics[index].predict_target(state_input, a_for_critic1)
                #critics[index].predict(state_input, a_for_critic1)
            else:
                critics[index].predict_target(state_input, a_for_critic2)
                #critics[index].predict(state_input, a_for_critic2)
        """

        # if args['use_gym_monitor']:
        #    if not args['render_env']:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], video_callable=False, force=True)
        #    else:
        #        envMonitor = wrappers.Monitor(env, args['monitor_dir'], force=True)

        # n brains
        if False:
            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(env.action_space[i].n)
                brains.append(Brain(sess, observation_dim[i], action_dim[i], float(args['actor_lr']), float(args['tau']), \
                                   observation_dim[i], total_action_dim, float(args['critic_lr']), float(args['tau']),float(args['gamma'])))
                exploration_noise.append(OUNoise(mu=np.zeros(action_dim[i])))

            # learn()

        if args["runTest"]:

            # , force=True
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)

            for i in range(n):
                # load model
                actors[i].mainModel.load_weights(args["modelFolder"] + str(i) +
                                                 '_weights' + '.h5')
                # episode 4754
            import time
            #   time.sleep(3)
            for ep in range(10):
                s = env.reset()
                reward = 0.0
                for step in range(200):

                    time.sleep(0.01)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        state_input = np.reshape(
                            s[i], (-1, env.observation_space[i].shape[0]))
                        noise = OUNoise(mu=np.zeros(5))
                        # predict_action = actors[i].predict(state_input) #+ exploration_noise[i]()
                        # actions.append(predict_action.reshape(env.action_space[i].n,))
                        # +noise()
                        actions.append(
                            (actors[i].predict(
                                np.reshape(
                                    s[i],
                                    (-1, actors[i].mainModel.input_shape[1])))
                             ).reshape(actors[i].mainModel.output_shape[1], ))
                    #print("{}".format(actions))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d}  | Reward: {:f}".format(ep, reward))
            env.close()
            import sys
            sys.exit("test over!")

        if False:
            import time
            # , force=True
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)
            for ep in range(10):
                # load model
                s = env.reset()
                for j in range(env.n):
                    actors[j].mainModel.load_weights(args["modelFolder"] +
                                                     str(j) + '_weights' +
                                                     '.h5')
                for step in range(300):

                    reward = 0.0
                    # time.sleep(0.05)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        state_input = np.reshape(
                            s[i], (-1, env.observation_space[i].shape[0]))
                        noise = OUNoise(mu=np.zeros(5))
                        # predict_action = actors[i].predict(state_input) #+ exploration_noise[i]()
                        # actions.append(predict_action.reshape(env.action_space[i].n,))
                        # +noise()
                        actions.append(
                            (actors[i].predict(
                                np.reshape(
                                    s[i],
                                    (-1, actors[i].mainModel.input_shape[1])))
                             ).reshape(actors[i].mainModel.output_shape[1], ))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d}  | Reward: {:f}".format(ep, reward))

        else:
            if True:
                train(sess, env, args, actors, critics, exploration_noise,
                      ave_n)
            else:
                global graph, global_queue, update_event, rolling_event, global_step_max, global_step, coord, brain
                graph = tf.get_default_graph()
                global_queue = queue.Queue()
                update_event, rolling_event = threading.Event(
                ), threading.Event()
                global_step_max, global_step = 200 * 1000, 0
                coord = tf.train.Coordinator()
                brain = Brain(args["modelFolder"])

                distributed_train(sess, env, args, actors, critics,
                                  exploration_noise, ave_n)
Example #8
def main(args):

    if not os.path.exists(args["modelFolder"]):
        os.makedirs(args["modelFolder"])
    if not os.path.exists(args["summary_dir"]):
        os.makedirs(args["summary_dir"])


    #with tf.device("/gpu:0"):
    # MADDPG for adversary ("ave") agents
    # DDPG for good agents
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.85)
    config = tf.ConfigProto(
        device_count = {'CPU': 0}
    )

    #config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=True) 
    
    with tf.Session() as sess:
    # with tf.Session(config=config) as sess:

        env  = make_env.make_env('simple_tag')

        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        #with tf.device('/cpu:0'):
            #if args["runTest"]:
                #run()
                #import sys
                #sys.exit("test over!")

        # Count adversary ("ave") and good agents
        ave_n = 0
        good_n = 0
        for i in env.agents:
            if i.adversary:
                ave_n += 1
            else:
                good_n += 1
        print("adversary ", ave_n, "target ", good_n)
        # print("ave_n", ave_n)
        n = env.n
        actors = []
        critics = []
        brains = []
        exploration_noise = []
        observation_dim = []
        action_dim = []
        total_action_dim = 0

        # Adversary agents' action spaces
        for i in range(ave_n):
            total_action_dim = total_action_dim + env.action_space[i].n

        print("total_action_dim", total_action_dim)

        for i in range(n):

            observation_dim.append(env.observation_space[i].shape[0])
            action_dim.append(env.action_space[i].n) # assuming discrete action space here -> otherwise change to something like env.action_space[i].shape[0]
            actors.append(ActorNetwork(sess,observation_dim[i],action_dim[i],float(args['actor_lr']),float(args['tau'])))
            # critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['critic_lr']),float(args['tau']),float(args['gamma'])))
            
            
            if i < ave_n:
                #MADDPG - centralized Critic
                critics.append(CriticNetwork(sess,n,observation_dim[i],total_action_dim,float(args['critic_lr']),float(args['tau']),float(args['gamma'])))
            else:
                # DDPG
                critics.append(CriticNetwork(sess,n,observation_dim[i],action_dim[i],float(args['critic_lr']),float(args['tau']),float(args['gamma'])))
            
            exploration_noise.append(OUNoise(mu = np.zeros(action_dim[i])))


        if False:
            for i in range(n):
                observation_dim.append(env.observation_space[i].shape[0])
                action_dim.append(env.action_space[i].n)
                brains.append(Brain(sess, observation_dim[i], action_dim[i], float(args['actor_lr']), float(args['tau']), \
                                   observation_dim[i], total_action_dim, float(args['critic_lr']), float(args['tau']),float(args['gamma'])))
                exploration_noise.append(OUNoise(mu = np.zeros(action_dim[i]))) 

            # learn()

        if args["runTest"]:

            # , force=True
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)

            for i in range(n):
                # load model
                actors[i].mainModel.load_weights(args["modelFolder"]+ "ep10000/" +str(i)+'_weights'+'.h5')
                # episode 4754
            import time
            #   time.sleep(3)
            for ep in range(10):
                s = env.reset()
                reward = 0.0
                for step in range(200):
                    
                    time.sleep(0.01)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        state_input = np.reshape(s[i],(-1,env.observation_space[i].shape[0]))
                        noise = OUNoise(mu = np.zeros(5))
                        # predict_action = actors[i].predict(state_input) #+ exploration_noise[i]()
                        # actions.append(predict_action.reshape(env.action_space[i].n,))
                        # +noise()
                        actions.append((actors[i].predict(np.reshape(s[i],(-1, actors[i].mainModel.input_shape[1])))).reshape(actors[i].mainModel.output_shape[1],))
                    #print("{}".format(actions))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d}  | Reward: {:f}".format(ep, reward))
            env.close()
            import sys
            sys.exit("test over!")

        if False:
            import time
            # , force=True
            # env = wrappers.Monitor(env, args["monitor_dir"], force=True)
            for ep in range(10):
                # load model
                s = env.reset()
                for j in range(env.n):
                    actors[j].mainModel.load_weights(args["modelFolder"]+ str(j) +'_weights'+'.h5')
                for step in range(300):
                    
                    reward = 0.0
                    # time.sleep(0.05)
                    env.render()
                    actions = []
                    for i in range(env.n):
                        state_input = np.reshape(s[i],(-1,env.observation_space[i].shape[0]))
                        noise = OUNoise(mu = np.zeros(5))
                        # predict_action = actors[i].predict(state_input) #+ exploration_noise[i]()
                        # actions.append(predict_action.reshape(env.action_space[i].n,))
                        # +noise()
                        actions.append((actors[i].predict(np.reshape(s[i],(-1, actors[i].mainModel.input_shape[1])))).reshape(actors[i].mainModel.output_shape[1],))
                    s, r, d, s2 = env.step(actions)
                    for i in range(env.n):
                        reward += r[i]
                    if np.all(d):
                        break
                print("Episode: {:d}  | Reward: {:f}".format(ep, reward))
            
        else:
            if False: 
                train(sess,env,args,actors,critics,exploration_noise, ave_n)
            else:
                distributed_train(sess, env, args, actors, critics, exploration_noise, ave_n)
Example #9
import tensorflow as tf
import numpy as np
import make_env
import gym
from keras.models import load_model
from ExplorationNoise import OrnsteinUhlenbeckActionNoise as OUNoise
import time

actors = []
actors.append(load_model('results/actor0/main16000.h5'))
actors.append(load_model('results/actor1/main16000.h5'))
actors.append(load_model('results/actor2/main16000.h5'))

env = make_env.make_env('simple_spread')
s = env.reset()
while (1):
    a = []
    for i in range(env.n):
        actor = actors[i]
        noise = OUNoise(mu=np.zeros(5))
        a.append((actor.predict(np.reshape(s[i], (-1, actor.input_shape[1]))) +
                  noise()).reshape(actor.output_shape[1], ))

    s2, r, done, _ = env.step(
        a)  # a is a list with each element being an array
    env.render()
    s = s2
    print("next episode")
    if np.all(done):
        s = env.reset()
    time.sleep(0.2)
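
distributed_train below expects a ReplayMemory object exposing add() and miniBatch(). A minimal sketch compatible with those calls is shown here, assuming uniform random sampling; the repository's actual buffer implementation may differ.

import random
from collections import deque
import numpy as np

class ReplayMemory:
    """FIFO experience buffer matching the add()/miniBatch() usage below."""

    def __init__(self, buffer_size, seed):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(seed)

    def add(self, s, a, r, d, s2):
        self.buffer.append((s, a, r, d, s2))

    def size(self):
        return len(self.buffer)

    def miniBatch(self, batch_size):
        batch = random.sample(list(self.buffer), min(batch_size, len(self.buffer)))
        s, a, r, d, s2 = zip(*batch)
        # object dtype keeps per-agent entries intact so indexing such as
        # s_batch[:, i] in distributed_train keeps working
        return (np.asarray(s, dtype=object), np.asarray(a, dtype=object),
                np.asarray(r), np.asarray(d), np.asarray(s2, dtype=object))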
def distributed_train(sess, env, args, actors, critics, noise, ave_n):

    worker_num = 4
    #########
    # Worker session
    #
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.05)
    worker_sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                                   log_device_placement=False))

    global workers

    workers = [[] for i in range(worker_num)]
    for actor in actors:
        for worker in workers:
            worker.append(
                ActorNetwork(worker_sess, actor.state_dim, actor.action_dim,
                             actor.lr, actor.tau))
    #######################
    print(len(workers), len(workers[0]))

    global exploration_noise
    exploration_noise = []

    for actor in actors:
        exploration_noise.append(OUNoise(mu=np.zeros(actor.action_dim)))
        actor.update_target()
    for critic in critics:
        critic.update_target()

    pool = mp.Pool(processes=mp.cpu_count() - 1)

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))
    episode_reward = np.zeros(env.n)  # per-agent reward accumulator updated each step

    for timestep in range(int(args['max_episodes'] * args['max_episode_len'])):

        start = time.time()

        # print(workers[0].work())
        # jobs = [pool.apply_async(sample.out, ()) for sample in samples]

        jobs = [
            pool.apply_async(work, args=(j, )) for j in range(len(workers))
        ]

        # res = pool.map(samples[0].out, [1,2,3])
        #time.sleep(10)

        for job in jobs:
            data = job.get()
            for item in data:
                (s, a, r, d, s2) = item
                # print(item)
                replayMemory.add(s, a, r, d, s2)

        time.sleep(10)
        #losses = []
        action_dims_done = 0

        # MADDPG Adversary Agent
        for i in range(ave_n):
            actor = actors[i]
            critic = critics[i]

            s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                int(args['minibatch_size']))
            a = []
            for j in range(ave_n):
                state_batch_j = np.asarray(
                    [x for x in s_batch[:, j]]
                )  #batch processing will be much more efficient even though reshaping will have to be done
                a.append(actors[j].predict_target(state_batch_j))

            a_temp = np.transpose(np.asarray(a), (1, 0, 2))

            a_for_critic = np.asarray([x.flatten() for x in a_temp])
            s2_batch_i = np.asarray([
                x for x in s2_batch[:, i]
            ])  # Checked till this point, should be fine.

            targetQ = critic.predict_target(
                s2_batch_i, a_for_critic)  # Should  work, probably
            yi = []
            for k in range(int(args['minibatch_size'])):
                if d_batch[:, i][k]:
                    yi.append(r_batch[:, i][k])
                else:
                    yi.append(r_batch[:, i][k] + critic.gamma * targetQ[k])
            s_batch_i = np.asarray([x for x in s_batch[:, i]])

            critic.train(
                s_batch_i,
                np.asarray([x.flatten() for x in a_batch[:, 0:ave_n, :]]),
                np.asarray(yi))
            #losses.append(loss)

            actions_pred = []
            for j in range(ave_n):
                state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                actions_pred.append(actors[j].predict(
                    state_batch_j))  # Should work till here, roughly, probably
            a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
            a_for_critic_pred = np.asarray([x.flatten() for x in a_temp])
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            grads = critic.action_gradients(
                s_batch_i,
                a_for_critic_pred)[:, action_dims_done:action_dims_done +
                                   actor.action_dim]
            actor.train(s_batch_i, grads)
            action_dims_done = action_dims_done + actor.action_dim
        # Only DDPG agent

        for i in range(ave_n, env.n):
            actor = actors[i]
            critic = critics[i]
            s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                int(args["minibatch_size"]))
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            action = np.asarray(actor.predict_target(s_batch_i))

            action_for_critic = np.asarray([x.flatten() for x in action])
            s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
            targetQ = critic.predict_target(s2_batch_i, action_for_critic)
            y_i = []
            for k in range(int(args['minibatch_size'])):
                # If ep is end
                if d_batch[:, i][k]:
                    y_i.append(r_batch[:, i][k])
                else:
                    y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k])
            # state batch for agent i
            s_batch_i = np.asarray([x for x in s_batch[:, i]])
            critic.train(s_batch_i,
                         np.asarray([x.flatten() for x in a_batch[:, i]]),
                         np.asarray(y_i))
            #losses.append(loss)
            action_for_critic_pred = actor.predict(s2_batch_i)
            gradients = critic.action_gradients(s_batch_i,
                                                action_for_critic_pred)[:, :]
            actor.train(s_batch_i, gradients)

        for i in range(0, env.n):
            actor = actors[i]
            critic = critics[i]
            actor.update_target()
            critic.update_target()

        episode_reward += r

        if timestep % int(args["max_episode_len"]) == 0:
            print("timestep: ", timestep)
            print("time: ", time.time() - start)
            # showReward(episode_reward, env.n, ep, start)
        """