Example #1
def test_marl(task, VectorEnv, obs_type='ram'):
    """ 
    Test env parallel DummyVectorEnv (no multiprocess) & SubprocVectorEnv (multiprocess) for multi-agent pettingzoo games.
    Use EnvVec Wrappers from Tianshou.
    """
    # env = eval(task).parallel_env(obs_type=obs_type)
    env_num = 2
    envs = VectorEnv(
        [lambda: make_env(task, obs_type=obs_type) for _ in range(env_num)])
    print(envs.action_space)

    assert len(envs) == env_num
    # envs.seed(2)  # a single int seeds all envs
    envs.seed(np.random.randint(1000, size=env_num).tolist())  # random seed per env
    # envs.seed([2, 3, 4, 5, 6, 7, 8, 9])  # or set a specific seed for each env
    obs = envs.reset()  # reset all environments
    # obs = envs.reset([0, 5, 7])  # reset 3 specific environments
    for i in range(30000):
        print(i)
        actions = [{
            'first_0': np.random.randint(18),
            'second_0': np.random.randint(18)
        } for _ in range(env_num)]
        obs, r, done, info = envs.step(actions)  # step synchronously
        envs.render()  # render all environments
        print(r)
    envs.close()  # close all environments
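
The make_env helper used throughout these examples is not shown. Below is a minimal sketch of what it presumably does, assuming PettingZoo Atari tasks such as boxing_v1; the module layout, the seeding call, and the handling of non-Atari tasks are assumptions, not the original helper.

import importlib

def make_env(task, seed=None, obs_type='ram'):
    """Hypothetical helper: build a PettingZoo parallel env from a task name."""
    # e.g. task='boxing_v1' -> pettingzoo.atari.boxing_v1.parallel_env(...)
    module = importlib.import_module('pettingzoo.atari.{}'.format(task))
    env = module.parallel_env(obs_type=obs_type)
    if seed is not None:
        env.seed(seed)  # older PettingZoo API; newer versions seed via env.reset(seed=seed)
    return env

Note that 'slimevolley_v0' (used in later examples) is not a PettingZoo Atari task, so the real helper presumably special-cases it; that branch is omitted here.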
Example #2
def test_marl_baseline(task, VectorEnv, obs_type='ram'):
    """ 
    Test env parallel DummyVectorEnv (no multiprocess) & SubprocVectorEnv (multiprocess) for multi-agent pettingzoo games.
    Use EnvVec Wrappers from stable-baseline.
    """
    # env = eval(task).parallel_env(obs_type=obs_type)
    env_num = 2
    envs = VectorEnv(
        [lambda: make_env(task, obs_type=obs_type) for _ in range(env_num)])
    envs.seed(2)  # seed all environments with a fixed seed
    obs = envs.reset()  # reset all environments
    for i in range(30000):
        print(i)
        actions = [{
            'first_0': np.random.randint(18),
            'second_0': np.random.randint(18)
        } for _ in range(env_num)]
        obs, r, done, info = envs.step(actions)  # step synchronously
        # envs.render()  # rendering is not supported for the stable-baselines VecEnv wrappers
    envs.close()  # close all environments
    parser.add_argument("-e",
                        "--env",
                        default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" +
                        DEFAULT_ENV_NAME)
    parser.add_argument("-r",
                        "--record",
                        help="Directory to store video recording")
    parser.add_argument("--no-visualize",
                        default=True,
                        action='store_false',
                        dest='visualize',
                        help="Disable visualization of the game play")
    args = parser.parse_args()

    env = wrappers.make_env(args.env)
    if args.record:
        env = gym.wrappers.Monitor(env, args.record)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)

    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    root = tf.train.Checkpoint(
        optimizer=optimizer,
        model=net,
        optimizer_step=tf.train.get_or_create_global_step())

    root.restore(tf.train.latest_checkpoint(args.model))

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()
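
This example is truncated here. For orientation, a hedged sketch of the evaluation loop such a play script typically continues with, assuming eager execution and that dqn_model.DQN is a callable Keras-style model; none of the following is from the original.

    while True:
        state_v = tf.convert_to_tensor([state], dtype=tf.float32)
        q_vals = net(state_v)  # Q-values for the current state
        action = int(tf.argmax(q_vals, axis=1).numpy()[0])
        c[action] += 1  # count how often each action is chosen
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if args.visualize:
            env.render()
        if done:
            break
    print("Total reward: %.2f" % total_reward)
    print("Action counts:", c)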
Example #4
def parallel_rollout(id, env_name, model, writer, max_eps, max_timesteps, selfplay_interval, render, \
    model_path, against_baseline=False, selfplay=False, fictitious=False, seed=0):
    """ 
    Paralllel rollout for multi-agent games, in contrast to the iterative rollout manner.
    Parallel: (multi-agent actions are executed in once call of env.step())
    observations_, rewards, dones, infos = env.step(actions)
    actions, observations_, rewards, dones, infos are all dictionaries, 
    with agent name as key and corresponding values.
    """
    env = make_env(env_name, seed, obs_type=obs_type)  # obs_type is presumably defined at module level in the original script
    env.reset()  # required by env.agents
    score = {a: 0.0 for a in env.agents}
    print_interval = 20
    save_interval = 100
    epi_len = []
    for n_epi in range(max_eps):
        observations = env.reset()

        for t in range(max_timesteps):
            actions, logprobs = model.choose_action(observations)
            observations_, rewards, dones, infos = env.step(
                actions)  # from discrete to multibinary action
            if render:
                env.render()

            model.put_data((observations, actions, rewards, observations_,
                            logprobs, dones))

            observations = observations_

            for agent_name in env.agents:
                score[agent_name] += rewards[agent_name]

            if np.any(np.array(list(dones.values()))
                      ):  # any agent has a done -> terminate episode
                break

            # if not env.agents: # according to the official docs (https://www.pettingzoo.ml/api), an agent is removed once it receives done, while the others remain
            #     break

        model.train_net()
        epi_len.append(t)
        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}".format(n_epi))
            record_score, record_length = {}, {}
            for agent_name in env.agents:
                avg_score = score[agent_name] / float(print_interval)
                avg_length = int(np.mean(epi_len))
                print(
                    "id : {}, agent :{}, avg score : {:.3f}, avg epi length : {}"
                    .format(id, agent_name, avg_score, avg_length))
                record_score[agent_name] = avg_score
                record_length[agent_name] = avg_length

            writer.add_scalars("ID {}/Scores".format(id), record_score, n_epi)
            writer.add_scalars("ID {}/Episode Length".format(id),
                               record_length, n_epi)

            score = {a: 0.0 for a in env.agents}
            epi_len = []
        if n_epi % save_interval == 0 and n_epi != 0:
            model.save_model('model/mappo_mp')
    model.save_model('model/mappo_mp')
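
The dictionary-keyed stepping convention described in this example's docstring can be reproduced directly on a PettingZoo parallel environment. A minimal sketch with random actions follows; boxing_v1 is just an assumed task, and the attribute names follow the older PettingZoo API used in these examples.

from pettingzoo.atari import boxing_v1

env = boxing_v1.parallel_env(obs_type='ram')
observations = env.reset()  # dict: agent name -> observation
for _ in range(100):
    # one action per agent, keyed by agent name
    actions = {agent: env.action_spaces[agent].sample() for agent in env.agents}
    observations, rewards, dones, infos = env.step(actions)  # all dicts keyed by agent name
    if any(dones.values()):  # any agent done -> terminate the episode
        break
env.close()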
Example #5
def main():
    args = get_args()
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(args.env, SEED, obs_type=obs_type)

    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ',  action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    env.reset()
    print(env.agents)
    agents = env.agents
    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: opponent is the first, the second agent is the learnable one

    if obs_type == 'ram':
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'MLP',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(args.device)
    else:
        # model = PPODiscrete(state_space, action_space, 'CNN', learner_args, **hyperparams).to(device)
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'CNN',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(args.device)

    load_model(model, args)

    for individual_model in model.agents.values():
        individual_model.policy.share_memory()
        individual_model.policy_old.share_memory()
        individual_model.value.share_memory()
        ShareParameters(individual_model.optimizer)

    path = 'model/' + args.env
    os.makedirs(path, exist_ok=True)

    if args.fictitious:
        path = path + '/fictitious_'

    processes = []
    for p in range(args.num_envs):
        process = Process(target=parallel_rollout, args=(p, args.env, model, writer, max_eps, \
            max_timesteps, selfplay_interval,\
            args.render, path, args.against_baseline, \
            args.selfplay, args.fictitious, SEED))  # the args contain shared and not shared
        process.daemon = True  # all processes closed when the main stops
        processes.append(process)

    for p in processes:
        p.start()

    for p in processes:
        p.join()  # wait until all worker processes finish

    env.close()
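
ShareParameters is not shown in this example. A common implementation (a hedged sketch, not necessarily the author's) pre-initializes the Adam state tensors and moves them into shared memory so that all worker processes update the same optimizer statistics:

import torch

def ShareParameters(adamoptim):
    """Hypothetical sketch: share a torch.optim.Adam optimizer's state across processes."""
    for group in adamoptim.param_groups:
        for p in group['params']:
            state = adamoptim.state[p]
            # the state must be initialized here, before the first optimizer step
            state['step'] = 0
            state['exp_avg'] = torch.zeros_like(p.data)
            state['exp_avg_sq'] = torch.zeros_like(p.data)
            # place the moment estimates in shared memory
            state['exp_avg'].share_memory_()
            state['exp_avg_sq'].share_memory_()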
Example #6
def main():
    args = get_args()
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    # env = make_env(args.env, SEED, obs_type=obs_type)
    VectorEnv = [
        DummyVectorEnv, SubprocVectorEnv
    ][1]  # https://github.com/thu-ml/tianshou/blob/master/tianshou/env/venvs.py
    envs = VectorEnv([
        lambda: make_env(args.env, obs_type=obs_type)
        for _ in range(args.num_envs)
    ])

    envs.seed(np.random.randint(1000,
                                size=args.num_envs).tolist())  # random seeding

    state_spaces = envs.observation_spaces[
        0]  # same for all env instances, so just take one
    action_spaces = envs.action_spaces[
        0]  # same for all env instances, so just take one
    print('state_spaces: ', state_spaces, ',  action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    envs.reset()
    agents = envs.agents[0]  # same for all env instances, so just take one
    print('agents: ', agents)

    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: opponent is the first, the second agent is the learnable one

    if obs_type == 'ram':
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'MLP', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)
    else:
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'CNN', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)

    load_model(model, args)

    path = f'model/{args.env}/'
    os.makedirs(path, exist_ok=True)

    if args.fictitious:
        path = path + 'fictitious_'

    parallel_rollout(envs, model, writer, max_eps=max_eps, max_timesteps=max_timesteps, selfplay_interval=selfplay_interval,\
        render=args.render, model_path=path, against_baseline=args.against_baseline, selfplay=args.selfplay, \
        fictitious=args.fictitious, test=args.test, args=args)

    envs.close()
Example #7
    next_state_values = tf.where(dones == 0, next_state_values,
                                 tf.zeros_like(next_state_values))
    expected_state_action_values = next_state_values * GAMMA + rewards_v

    return tf.losses.mean_squared_error(state_action_values,
                                        expected_state_action_values)


if __name__ == "__main__":

    writer = tf.contrib.summary.create_file_writer(
        logdir='runs',
        flush_millis=10000,
        filename_suffix="-dqn-turtlebot3-followline")

    env = wrappers.make_env('Turtlebot3FollowLineCameraEnv-v0')

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None

    state = env.reset()

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
Example #8
def main():
    parser = argparse.ArgumentParser(description='Train or test arguments.')
    parser.add_argument('--env', type=str, help='Environment', required=True)
    parser.add_argument('--ram',
                        dest='ram_obs',
                        action='store_true',
                        default=False)
    parser.add_argument('--render',
                        dest='render',
                        action='store_true',
                        help='Enable openai gym real-time rendering')
    parser.add_argument('--seed',
                        dest='seed',
                        type=int,
                        default=1234,
                        help='Random seed')
    parser.add_argument('--load_agent',
                        dest='load_agent',
                        type=str,
                        default=None,
                        help='Load agent models by specifying: 1, 2, or both')
    parser.add_argument('--against_baseline',
                        dest='against_baseline',
                        action='store_true',
                        default=False)
    parser.add_argument('--fictitious',
                        dest='fictitious',
                        action='store_true',
                        default=False)
    args = parser.parse_args()

    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(args.env, args.seed, obs_type=obs_type)
    exploit_eps = 1000  # episodes to train the exploiter
    evaluate_eps = 10  # evaluate the exploiter after training

    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ',  action_spaces: ', action_spaces)

    device_idx = 0
    device = torch.device(
        "cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
    learner_args = {'device': device}
    env.reset()
    print(env.agents)
    agents = env.agents

    fixed_agents = [
        'second_0'
    ]  # both the model and the exploiter fix the second agent, so the exploiter's first agent can learn

    if obs_type == 'ram':
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'MLP',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(device)
        exploiter_ = copy.deepcopy(model)
    else:
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'CNN',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(device)
        exploiter_ = copy.deepcopy(model)

    if args.fictitious:
        model_dir = 'model/{}/fictitious_selfplay/'.format(args.env)
        exploiter_dir = 'model/{}/fictitious_selfplay/exploiter/'.format(
            args.env)
    else:
        model_dir = 'model/{}/selfplay/noise/'.format(args.env)
        exploiter_dir = 'model/{}/selfplay/exploiter/noise/'.format(args.env)
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(exploiter_dir, exist_ok=True)

    # Parse all models saved during training in order
    filelist, epi_list = [], []
    for filename in os.listdir(model_dir):
        if filename.endswith("policy"):
            filelist.append('_'.join(
                filename.split('_')[:-1]))  # remove '_policy' at end
            epi_list.append(int(filename.split('mappo')[0]))
    sort_idx = np.argsort(epi_list).tolist()
    filelist = [x for _, x in sorted(zip(epi_list, filelist))
                ]  # sort filelist according to the sorting of epi_list
    epi_list.sort()  # filelist.sort() will not give correct answer
    print(epi_list)
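    # e.g. a (hypothetical) checkpoint saved as '2000mappo_first_0_policy' ends up
    # as 2000 in epi_list and '2000mappo_first_0' in filelist, so the saved
    # checkpoints are processed below in the order they were created during training.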

    # Evaluate/exploit all models saved during training in order
    eval_data = {}
    for f, i in zip(filelist, epi_list):
        print('load model: ', i, f)
        print(model_dir + f)
        exploiter = copy.deepcopy(exploiter_)
        model.load_model(agent_name=fixed_agents[0], path=model_dir + f)
        exploiter_path = exploiter_dir + f

        r, l = exploit(env, model, exploiter, exploit_eps=exploit_eps, eval_eps=evaluate_eps, max_timesteps=max_timesteps,\
            render=args.render, exploiter_path=exploiter_path, against_baseline=args.against_baseline)
        print(f"Evaluate Avg. Reward: {r}, Avg. Length: {l}")
        eval_data[str(i)] = [r, l]
    save_dir = 'data/{}/'.format(args.env)
    os.makedirs(save_dir, exist_ok=True)
    if args.fictitious:
        save_dir += 'fictitious_eval_data.npy'
    else:
        save_dir += 'eval_data.npy'
    np.save(save_dir, eval_data)

    env.close()
Example #9
def main():
    parser = argparse.ArgumentParser(description='Train or test arguments.')
    parser.add_argument('--train',
                        dest='train',
                        action='store_true',
                        default=False)
    parser.add_argument('--test',
                        dest='test',
                        action='store_true',
                        default=False)
    parser.add_argument('--env', type=str, help='Environment', required=True)
    parser.add_argument('--ram',
                        dest='ram_obs',
                        action='store_true',
                        default=False)
    parser.add_argument('--render',
                        dest='render',
                        action='store_true',
                        help='Enable openai gym real-time rendering')
    parser.add_argument('--seed',
                        dest='seed',
                        type=int,
                        default=1234,
                        help='Random seed')
    parser.add_argument('--load_agent',
                        dest='load_agent',
                        type=str,
                        default=None,
                        help='Load agent models by specifying: 1, 2, or both')
    parser.add_argument('--against_baseline',
                        dest='against_baseline',
                        action='store_true',
                        default=False)
    parser.add_argument('--fictitious',
                        dest='fictitious',
                        action='store_true',
                        default=False)
    args = parser.parse_args()

    SEED = np.random.randint(1000)
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(args.env, SEED, obs_type=obs_type)
    # max_eps = 500000
    # max_timesteps = 10000
    # selfplay_interval = 3000 # interval in a unit of episode to checkpoint a policy and replace its opponent in selfplay
    eval_eps = 100

    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ',  action_spaces: ', action_spaces)

    device_idx = 0
    device = torch.device(
        "cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
    learner_args = {'device': device}
    env.reset()
    print(env.agents)
    agents = env.agents

    fixed_agents = [
        'first_0', 'second_0'
    ]  # both agents are fixed for evaluation; the saved policies are loaded into 'second_0' below

    if obs_type == 'ram':
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'MLP',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(device)
    else:
        # model = PPODiscrete(state_space, action_space, 'CNN', learner_args, **hyperparams).to(device)
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'CNN',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(device)

    if args.fictitious:
        model_dir = 'model/{}/fictitious_selfplay/'.format(args.env)
    else:
        model_dir = 'model/{}/selfplay/'.format(args.env)
    os.makedirs(model_dir, exist_ok=True)

    filelist, epi_list = [], []
    for filename in os.listdir(model_dir):
        if filename.endswith("policy"):
            filelist.append('_'.join(
                filename.split('_')[:-1]))  # remove '_policy' at end
            epi_list.append(int(filename.split('mappo')[0]))
    sort_idx = np.argsort(epi_list).tolist()
    filelist = [x for _, x in sorted(zip(epi_list, filelist))
                ]  # sort filelist according to the sorting of epi_list
    epi_list.sort()  # filelist.sort() will not give correct answer
    print(epi_list)

    r_list, l_list = [], []
    eval_data = {}
    for f, i in zip(filelist, epi_list):
        print('episode: ', i, f)
        # if i>17000:
        print(model_dir + f)
        model.load_model(agent_name='second_0', path=model_dir + f)

        r, l = parallel_rollout(env, model, max_eps=eval_eps, max_timesteps=max_timesteps, selfplay_interval=selfplay_interval,\
            render=args.render, model_path=None, against_baseline=args.against_baseline)
        eval_data[str(i)] = [r, l]
    save_dir = 'data/{}'.format(args.env)
    os.makedirs(save_dir, exist_ok=True)
    if args.fictitious:
        save_dir += '/fictitious_eval_data.npy'
    else:
        save_dir += '/eval_data.npy'
    np.save(save_dir, eval_data)

    env.close()
Example #10
def main():
    args = get_args()
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(
        args.env, SEED, obs_type=obs_type
    )  # TODO: this single env is only used to provide spaces/agents info; alternatively the SubprocVecEnv wrapper could be modified to expose them
    # https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html?highlight=multiprocessing
    envs = SubprocVecEnv([
        lambda: make_env(args.env, obs_type=obs_type)
        for _ in range(args.num_envs)
    ],
                         start_method='spawn')

    # envs.seed(np.random.randint(1000, size=args.num_envs).tolist())  # random seeding
    envs.seed(SEED)  # fix seeding
    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ',  action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    env.reset()
    agents = env.agents
    print('agents: ', agents)

    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: opponent is the first, the second agent is the learnable one

    if obs_type == 'ram':
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'MLP', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)
    else:
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'CNN', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)

    load_model(model, args)

    path = f"model/{args.env}/"
    os.makedirs(path, exist_ok=True)

    if args.fictitious:
        path = path + 'fictitious_'

    parallel_rollout(envs, model, writer, max_eps=max_eps, max_timesteps=max_timesteps, selfplay_interval=selfplay_interval,\
        render=args.render, model_path=path, against_baseline=args.against_baseline, selfplay=args.selfplay, \
        fictitious=args.fictitious, test=args.test)

    envs.close()
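
Regarding the TODO near the top of this example: the extra single env is only kept around for space and agent info. With the stable-baselines VecEnv API, the same information could be queried from the workers directly. A hedged sketch, assuming the env wrapped by make_env exposes these attributes and they are picklable:

state_spaces = envs.get_attr('observation_spaces')[0]  # identical across workers, take the first
action_spaces = envs.get_attr('action_spaces')[0]
envs.reset()  # populate .agents on the worker envs
agents = envs.get_attr('agents')[0]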
Example #11
def main():
    args = get_args()
    print_args(args)
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(args.env, SEED, obs_type=obs_type)

    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ',  action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    env.reset()
    print(env.agents)
    agents = env.agents

    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: opponent is the first, the second agent is the learnable one
    path = f"model/{args.env}/"
    os.makedirs(path, exist_ok=True)
    data_path = f"data/{args.env}/"
    os.makedirs(data_path, exist_ok=True)

    if obs_type == 'ram':
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'MLP',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(args.device)
    else:
        # model = PPODiscrete(state_space, action_space, 'CNN', learner_args, **hyperparams).to(device)
        model = MultiPPODiscrete(agents, state_spaces, action_spaces, 'CNN',
                                 fixed_agents, learner_args,
                                 **hyperparams).to(args.device)
        path = path + 'cnn_'
    if args.selfplay:
        os.makedirs(path + 'selfplay/', exist_ok=True)
    load_model(model, args)

    if args.fictitious:
        path = path + 'fictitious_'

    eval_env = make_env(args.env, np.random.randint(0, 100), obs_type=obs_type)
    evaluater = Evaluater(eval_env, max_timesteps)

    parallel_rollout(env, model, writer, evaluater=evaluater, max_eps=max_eps, max_timesteps=max_timesteps, selfplay_interval=selfplay_interval,\
        render=args.render, model_path=path, against_baseline=args.against_baseline, selfplay=args.selfplay, \
        fictitious=args.fictitious, test=args.test)

    env.close()
Example #12
    next_state_values = tf.where(dones == 0, next_state_values,
                                 tf.zeros_like(next_state_values))
    expected_state_action_values = next_state_values * GAMMA + rewards_v

    return tf.losses.mean_squared_error(state_action_values,
                                        expected_state_action_values)


if __name__ == "__main__":

    writer = tf.contrib.summary.create_file_writer(
        logdir='runs',
        flush_millis=10000,
        filename_suffix="-dqn-f1-followline")

    env = wrappers.make_env('F1FollowLineCameraEnv-v0')
    env.load_checkpoints('adjusted_checkpoints.json')
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None

    state = env.reset()

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)