Example #1
def init_from_config(env, config, logger):
    """
    Initializes the algorithm from a config dict. Handles DDPG-like and SAC-like models,
    but not a mixture of them (i.e., some agents DDPG and others SAC).
    :param env:
    :param config:
    :param logger:
    :return: algorithm
    """
    sup_algos = config.agent_alg in SUPPORTED_ALGOS

    # Initializes agents
    if sup_algos:
        algorithm = MADDPG.init_from_env(env,
                                         agent_alg=config.agent_alg,
                                         adversary_alg=config.adversary_alg,
                                         tau=config.tau,
                                         gamma=config.gamma,
                                         lr=config.lr,
                                         lr_fe_coef=config.lr_fe_coef,
                                         lr_critic_coef=config.lr_critic_coef,
                                         grad_clip_value=config.grad_clip_value,
                                         hidden_dim=config.hidden_dim,
                                         weight_decay=config.weight_decay,
                                         discrete_exploration_scheme=config.discrete_exploration_scheme,
                                         boltzmann_temperature=config.boltzmann_temperature,
                                         feature_extractor=config.feature_extractor,
                                         logger=logger)
    else:
        raise ValueError('Algo is not supported')

    return algorithm
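
A minimal usage sketch for the helper above, assuming config is an argparse-style namespace and that env, logger, and SUPPORTED_ALGOS come from the surrounding training script (none of them are defined in this snippet):

from argparse import Namespace

# Hypothetical config; field names mirror the keyword arguments above,
# values are placeholders rather than the project's defaults.
config = Namespace(agent_alg='MADDPG', adversary_alg='MADDPG', tau=0.01, gamma=0.95,
                   lr=0.01, lr_fe_coef=1.0, lr_critic_coef=1.0, grad_clip_value=0.5,
                   hidden_dim=64, weight_decay=0.0,
                   discrete_exploration_scheme='epsilon_greedy',
                   boltzmann_temperature=1.0, feature_extractor=None)
algorithm = init_from_config(env, config, logger)  # env and logger created elsewhere
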
Example #2
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    # print(config.save_gifs)
    # print(model_path.parent)
    # print(type(model_path.parent))

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(str(model_path))
    env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            # print(len(env.render('rgb_array', close=False)))
            # print(type(env.render('rgb_array', close=False)))
            # print(env.render('rgb_array', close=False))
            frames.append(env.render('rgb_array', close=False)[0])
        env.render('human', close=False)
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array', close=False)[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human', close=False)
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames,
                            duration=ifi)

    env.close()
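
The playback loop above keeps rendering near config.fps by sleeping off the rest of each inter-frame interval. A small standalone sketch of that pacing pattern (the helper name is ours, not from the snippet):

import time

def paced_loop(step_fn, fps, n_steps):
    # Call step_fn n_steps times at roughly fps calls per second.
    ifi = 1.0 / fps  # inter-frame interval, as in the snippet
    for _ in range(n_steps):
        start = time.time()
        step_fn()
        elapsed = time.time() - start
        if elapsed < ifi:
            time.sleep(ifi - elapsed)
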
Example #3
def run(args):
    """ main entry func """
    config = setup_evaluation(args)
    logger = ExperimentLogger(config.save_dir,
                              log_std_out=True,
                              use_tensorboard=False)

    # load agent from checkpoint
    if config.checkpoint > 0:
        model_path = "checkpoints/model_ep{}.ckpt".format(config.checkpoint)
    else:
        model_path = "model.ckpt"
    model_path = os.path.join(config.restore, model_path)
    maddpg = MADDPG.init_from_save(model_path)
    if config.copy_checkpoint:
        maddpg.save(config.save_dir + "/model.ckpt")

    # make env runner
    env_func = ENV_MAP[config.env]
    env = env_func(config.scenario,
                   benchmark=False,
                   show_visual_range=config.show_visual_range,
                   **config.env_config)

    # evaluate
    rollouts = maddpg_rollouts(maddpg,
                               env,
                               config.n_episodes,
                               config.episode_length,
                               logger=logger,
                               render=True,
                               save_gifs=True,
                               fps=20)

    # save rollouts
    if config.save_dir is not None:
        with open(os.path.join(config.save_dir, "eval_rollouts.pkl"), "wb") as f:
            pickle.dump(rollouts, f)

        if config.save_gifs:
            if config.save_gifs_num < 0:
                gif_num = config.n_episodes
            else:
                gif_num = min(config.save_gifs_num, config.n_episodes)
            ifi = 1 / 20  # inter-frame interval; matches the fps passed to maddpg_rollouts
            imageio.mimsave(os.path.join(config.save_dir, "eval_frames.gif"),
                            rollouts["frames"][:gif_num],
                            duration=ifi)

    env.close()
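
This example relies on a setup_evaluation helper that is not shown. A hypothetical stand-in that only lists the fields the snippet actually reads (everything here is inferred from the code above, not the project's real implementation):

from argparse import Namespace

def setup_evaluation(args):
    # Placeholder: the real helper presumably parses args and creates save_dir on disk.
    return Namespace(save_dir="results/eval", checkpoint=0, copy_checkpoint=False,
                     restore="results/train/run1", env="mpe", scenario="simple_spread",
                     show_visual_range=False, env_config={},
                     n_episodes=10, episode_length=25,
                     save_gifs=True, save_gifs_num=-1)
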
예제 #4
0
def init_from_save(filepath):
    """
    Initializes the algorithm from a saved model. Handles DDPG-like and SAC-like models,
    but not a mixture of them (i.e., some agents DDPG and others SAC).
    :param filepath: saved model
    :return: algorithm
    """
    save_dict = torch.load(filepath)
    alg_types = save_dict['init_dict']['alg_types']
    sup_algos = [alg in SUPPORTED_ALGOS for alg in alg_types]
    if all(sup_algos):
        algo = MADDPG.init_from_save_dict(save_dict)
    else:
        raise ValueError('Some algos are not supported')
    return algo
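
init_from_save only requires the checkpoint to be a dict with an 'init_dict' entry carrying 'alg_types'. A sketch of writing such a file with plain torch.save; a real checkpoint would also store agent parameters, so this only illustrates the keys read above:

import torch

def save_minimal_checkpoint(nagents, filepath, alg='MADDPG'):
    # Only the keys consumed by init_from_save above are included here.
    save_dict = {'init_dict': {'alg_types': [alg] * nagents}}
    torch.save(save_dict, filepath)
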
Example #5
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])

    agent_init_params = {
        'num_in_pol': env.observation_space[0].shape[0],
        'num_out_pol': num_out_pol,
        'num_vars': len(env.agent_types)
    }
    maddpg = MADDPG(agent_init_params,
                    nagents=len(env.agent_types),
                    tau=config.tau,
                    lr=config.lr,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ], config.hidden_dim * (maddpg.nagents - 1))
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        rnn_hidden = (torch.zeros(
            1,
            config.n_rollout_threads * (maddpg.nagents) * (maddpg.nagents - 1),
            config.hidden_dim),
                      torch.zeros(
                          1,
                          config.n_rollout_threads * (maddpg.nagents) *
                          (maddpg.nagents - 1), config.hidden_dim))

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions, new_rnn_hidden = maddpg.step(torch_obs,
                                                              rnn_hidden,
                                                              explore=True)
            hid_to_store = (rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents,
                -1), rnn_hidden[1].detach().contiguous().view(
                    config.n_rollout_threads, maddpg.nagents, -1))
            next_hid_to_store = (new_rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents,
                -1), new_rnn_hidden[1].detach().contiguous().view(
                    config.n_rollout_threads, maddpg.nagents, -1))

            # convert actions to numpy arrays
            agent_actions = [
                ac.cpu().data.numpy() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, hid_to_store, agent_actions, rewards,
                               next_obs, next_hid_to_store, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                sample = replay_buffer.sample(config.batch_size,
                                              to_gpu=USE_CUDA)
                maddpg.update(sample, ep_i)
                maddpg.update_all_targets()
            rnn_hidden = new_rnn_hidden
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
            print("Episode %i, reward for %i is " % (ep_i + 1, a_i), a_ep_rew)

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
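
The 'run1', 'run2', ... directory numbering at the top of this example recurs in several snippets below. The same logic as a small reusable helper (the function name is ours):

from pathlib import Path

def next_run_dir(model_dir: Path) -> Path:
    # Return model_dir / 'runN' where N is one past the largest existing run.
    if not model_dir.exists():
        return model_dir / 'run1'
    nums = [int(p.name[len('run'):]) for p in model_dir.iterdir()
            if p.name.startswith('run') and p.name[len('run'):].isdigit()]
    return model_dir / ('run%i' % (max(nums) + 1 if nums else 1))
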
Example #6
def run(config):
    # model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     curr_run = 'run1'
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         curr_run = 'run1'
    #     else:
    #         curr_run = 'run%i' % (max(exst_run_nums) + 1)
    # run_dir = model_dir / curr_run
    # log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))

    # torch.manual_seed(config.seed)
    # np.random.seed(config.seed)
    # if not USE_CUDA:
    #     torch.set_num_threads(config.n_training_threads)

    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,  # number of players
        'board_width': 11,  # map width
        'board_height': 11,  # map height
        'n_cell_type': 5,  # number of cell types
        'materials': 4,  # number of material depots
        'cars': 2,  # number of cars
        'planes': 0,  # number of planes
        'barriers': 12,  # number of fixed obstacles
        'max_step': 50,  # maximum number of steps
        'game_name': name,  # game name
        'K': 5,  # depot material amounts are refreshed every K rounds
        'map_path': 'env/map.txt',  # path to the initial map
        'cell_range': 6,  # value range per dimension within a cell (tuple; a single int is converted to a tuple)
        'ob_board_width': None,  # grid width observed by each agent (tuple); None means same as the actual grid
        'ob_board_height': None,  # grid height observed by each agent (tuple); None means same as the actual grid
        'ob_cell_range': None,  # per-cell value ranges observed by each agent (2-D tuple); None means same as the actual grid
    }

    env = make_parallel_env_transport(config.env_id, conf, config.seed,
                                      config.discrete_action)

    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))

    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    maddpg = MADDPG.init_from_save(model_path)
    maddpg.prep_rollouts(device='cpu')
    t = 0

    reward_epi = np.zeros(config.n_episodes)
    for ep_i in range(0, config.n_episodes):

        obs = env.reset()  # TODO: TO CHECK
        '''
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        '''
        reward_eti = 0
        pygame.init()
        screen = pygame.display.set_mode((440, 440))
        # pygame.display.set_caption(g.game_name)

        clock = pygame.time.Clock()
        for et_i in range(config.episode_length):

            # env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            # print('step', et_i)

            torch_obs = [
                Variable(
                    torch.Tensor(np.vstack(obs[:, i])),  # stack the per-thread matrices vertically
                    requires_grad=False) for i in range(maddpg.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(1)]
            print(actions)
            ############################################
            # add
            # actions = actions.astype(int)
            ############################################
            # add: the first two actions (random one-hot placeholders for players 1 and 2)
            joint_action = []
            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    idx = np.random.randint(11)
                    each[idx] = 1
                    player.append(each)
                joint_action.append(player)
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])

            next_obs, rewards, dones, infos = env.step(joint_action)
            obs = next_obs

            reward_eti += rewards[0][0]

            pygame.surfarray.blit_array(screen,
                                        env.render().transpose(1, 0, 2))
            pygame.display.flip()
            clock.tick(1)
            fname = "./image/" + str(et_i) + ".png"  # save image
            pygame.image.save(screen, fname)
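
The loop above saves one PNG per step under ./image/ but never assembles them into an animation. A hedged sketch of stitching those files into a GIF with imageio (the helper is ours; the paths mirror the snippet):

import imageio

def pngs_to_gif(n_steps, out_path='./image/episode.gif', fps=1):
    # Reads ./image/0.png ... ./image/{n_steps-1}.png, as written above.
    frames = [imageio.imread('./image/%i.png' % i) for i in range(n_steps)]
    imageio.mimsave(out_path, frames, duration=1.0 / fps)
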
Example #7
def run(config):
    model_path = (Path('../models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    gif_path = model_path.parent / 'stats' if not config.mixed_policies else model_path.parent / 'stats_mixed'
    gif_path.mkdir(exist_ok=True)

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if config.mixed_policies:
        maddpg = MADDPG.init_from_directory(
            Path('../models') / config.env_id / config.model_name)
    else:
        maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id,
                   benchmark=True,
                   discrete_action=maddpg.discrete_action)
    env.seed(config.seed)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
    all_infos = np.empty(
        (config.n_episodes, config.episode_length, maddpg.nagents, 10))
    n_movable_agents = sum([1 if a.movable else 0 for a in env.agents])
    n_speaking_agents = sum([0 if a.silent else 1 for a in env.agents])
    all_positions = np.zeros((config.n_episodes, config.episode_length,
                              n_movable_agents, env.world.dim_p))
    all_communications = np.zeros((config.n_episodes, config.episode_length,
                                   n_speaking_agents, env.world.dim_c))
    all_actions = np.zeros((config.n_episodes, config.episode_length,
                            len(env.agents), env.world.dim_c))
    obs_space = sum([obsp.shape[0] for obsp in env.observation_space])
    all_obs = np.zeros((config.n_episodes, config.episode_length, obs_space))

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        # env.agents[1].state.p_pos = np.array([0., 0.])
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                if not obs[i].ndim == 4 else Variable(torch.Tensor(obs[i]),
                                                      requires_grad=False)
                for i in range(maddpg.nagents)
            ]

            all_positions[ep_i, t_i] = env.get_positions()
            all_communications[ep_i, t_i] = env.get_communications()
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            # actions[0] = np.array([0., 0., 0., 0., 0.], dtype=np.float32)
            # actions[0][ep_i] = 1.
            obs, rewards, dones, infos = env.step(actions)

            all_actions[ep_i, t_i, :, :] = actions
            all_obs[ep_i, t_i, :] = np.concatenate(np.asarray(obs))

            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            if len(np.array(infos['n']).shape) < 4:
                all_infos[ep_i,
                          t_i, :, :len(infos['n'][-1])] = np.array(infos['n'])

    env.close()

    if config.save_stats:
        stats_path = model_path.parent / 'stats' if not config.mixed_policies else model_path.parent / 'stats_mixed'
        stats_path.mkdir(exist_ok=True)
        save(f'{stats_path}/all_infos.npy', all_infos)
        save(f'{stats_path}/all_positions.npy', all_positions)
        save(f'{stats_path}/all_communications.npy', all_communications)
        save(f'{stats_path}/all_actions.npy', all_actions)
        save(f'{stats_path}/all_observations.npy', all_obs)
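
A short sketch of loading the arrays saved above back for analysis; the file names match the save(...) calls, and the stats path is a placeholder to point at the actual run directory:

import numpy as np

stats_path = './models/<env_id>/<model_name>/run1/stats'  # placeholder
all_positions = np.load(f'{stats_path}/all_positions.npy')
all_infos = np.load(f'{stats_path}/all_infos.npy')
# e.g. mean per-step displacement of the first movable agent
print(np.abs(np.diff(all_positions[:, :, 0, :], axis=1)).mean())
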
Example #8
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'


    print("\n"+str(model_path)+"\n\n\n")

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    #####################################################################################################
    #                                       START EPISODES                                              #
    #####################################################################################################

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        # For RNN history buffer
        obs_tminus_0 = copy(obs)
        obs_tminus_1 = copy(obs)
        obs_tminus_2 = copy(obs)

        obs_tminus_3 = copy(obs)
        obs_tminus_4 = copy(obs)
        obs_tminus_5 = copy(obs)

        # TODO: obs_history shape different from main.py, so parameterize it based on "obs"
        # It is different because main.py can run multiple threads, so has an extra dimension
        obs_history = np.empty([3,108])
        next_obs_history = np.empty([3,108])

        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')

        ##################################################################################################
        #                                       START TIME-STEPS                                         #
        ##################################################################################################

        for t_i in range(config.episode_length):

            # Populate current history for RNN
            for a in range(3):  # env.nagents
                # obs_history[a][:] = np.concatenate((obs_tminus_0[a][:], obs_tminus_1[a][:], obs_tminus_2[a][:]))
                obs_history[a][:] = np.concatenate(
                    (obs_tminus_0[a][:], obs_tminus_1[a][:], obs_tminus_2[a][:],
                     obs_tminus_3[a][:], obs_tminus_4[a][:], obs_tminus_5[a][:]))
                # obs_history now holds the last 6 timesteps for each agent

            calc_start = time.time()

            # rearrange observations to be per agent, and convert to torch Variable
            rnn_torch_obs = [Variable(torch.Tensor(obs_history[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(rnn_torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            next_obs, rewards, dones, infos = env.step(actions)

            # Update histories
            obs_tminus_5 = copy(obs_tminus_4)
            obs_tminus_4 = copy(obs_tminus_3)
            obs_tminus_3 = copy(obs_tminus_2)

            obs_tminus_2 = copy(obs_tminus_1)
            obs_tminus_1 = copy(obs_tminus_0)
            obs_tminus_0 = copy(next_obs)
            # --------------------------------------#

            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()
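
The six obs_tminus_* copies above maintain a fixed-length observation history by hand. A sketch of the same idea with collections.deque, assuming per-agent observations are 1-D numpy arrays; this is an alternative illustration, not the author's code:

from collections import deque
import numpy as np

def make_history(obs, length=6):
    # One deque per agent, pre-filled with the initial observation.
    return [deque([np.asarray(o)] * length, maxlen=length) for o in obs]

def stack_history(history):
    # Concatenate each agent's buffered observations, newest first.
    return [np.concatenate(list(h)) for h in history]

# Per step: history[a].appendleft(next_obs[a]) keeps the newest at index 0
# and drops the oldest automatically thanks to maxlen.
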
Example #9
def run(config):
    original_model_path = (Path('./models') / config.env_id /
                           config.model_name / ('run%i' % config.run_num))
    # if config.incremental is not None:
    #     model_path = model_path / 'incremental' / ('model_ep%i.pt' %
    #                                                config.incremental)
    # else:
    #     model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = original_model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    # Model numbers in folder for stat runs
    rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]
    stat_run_all_models = []

    for r in rrange:
        print("Model :" + str(r))
        model_path = original_model_path / 'incremental' / ('model_ep%i.pt' %
                                                            r)
        maddpg = MADDPG.init_from_save(model_path)
        env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
        maddpg.prep_rollouts(device='cpu')
        ifi = 1 / config.fps  # inter-frame interval

        stat_return_list = []
        for ep_i in range(config.n_episodes):
            print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
            obs = env.reset()
            if config.save_gifs:
                frames = []
                frames.append(env.render('rgb_array')[0])
            #env.render('human')

            episode_reward = 0
            for t_i in range(config.episode_length):
                calc_start = time.time()
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [
                    Variable(torch.Tensor(obs[i]).view(1, -1),
                             requires_grad=False)
                    for i in range(maddpg.nagents)
                ]
                # get actions as torch Variables
                torch_actions = maddpg.step(torch_obs, explore=False)
                # convert actions to numpy arrays
                actions = [ac.data.numpy().flatten() for ac in torch_actions]
                obs, rewards, dones, infos = env.step(actions)

                # get the global reward
                episode_reward += rewards[0][0]

                if config.save_gifs:
                    frames.append(env.render('rgb_array')[0])
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                #env.render('human')
            if config.save_gifs:
                gif_num = 0
                while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                    gif_num += 1
                imageio.mimsave(str(gif_path / ('%i_%i.gif' %
                                                (gif_num, ep_i))),
                                frames,
                                duration=ifi)
            # end of episodes (one-stat-run)
            stat_return_list.append(episode_reward / config.episode_length)
        # end of model
        stat_run_all_models.append(stat_return_list)
        env.close()

    with open(str(original_model_path) + "/stat_runs", "wb") as pickling_on:
        pkl.dump(stat_run_all_models, pickling_on)
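
A sketch of reading the pickled stat runs back and summarizing them; the path mirrors the file written above and should be pointed at the actual run directory:

import pickle as pkl
import numpy as np

with open('./models/<env_id>/<model_name>/run1/stat_runs', 'rb') as f:  # placeholder path
    stat_run_all_models = pkl.load(f)

means = [np.mean(returns) for returns in stat_run_all_models]  # one entry per checkpoint
print(means)
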
Example #10
    config = setup_experiment(args)
    logger = ExperimentLogger(config.save_dir, log_std_out=True, use_tensorboard=config.use_tensorboard)

    # make sampling runner  
    if not config.cuda:
        torch.set_num_threads(config.n_training_threads)
    env_func = ENV_MAP[config.env]
    p_env_func = partial(env_func, config.scenario, benchmark=False, 
                        show_visual_range=config.show_visual_range)
    env = make_parallel_env(p_env_func, config.env_config, config.n_rollout_threads, config.seed)
    if not config.no_eval:
        eval_env = env_func(config.scenario, benchmark=False, 
                        show_visual_range=config.show_visual_range, **config.env_config)

    # make learner agent 
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.max_buffer_size, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    
    # train loop 
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        logger.info("Episodes (%i-%i)/%i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
Example #11
    see_runs = [0]
    wait = 0.05
    ep_len = 50

    for cur_run in see_runs:
        for i in range(4):
            config = Arglist()
            config.load_args(base_path / models_to_compare[cur_model] /
                             ("run" + str(cur_run)))
            env = make_parallel_env(config)
            model_path = base_path / models_to_compare[cur_model] / (
                "run" + str(cur_run)) / "model.pt"
            print(model_path)

            # add comm to action space:
            maddpg = MADDPG.init_from_save(model_path)
            # show some examples:
            obs = env.reset()
            # env.env._render("human", True)
            maddpg.prep_rollouts(device='cpu')

            # eval_model(maddpg, env, ep_len=100, num_steps=500, rollout_threads=1, display=True)
            for step in range(ep_len):
                env.env._render("human", False)
                time.sleep(wait)
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [
                    Variable(torch.Tensor(np.vstack(obs[:, i])),
                             requires_grad=False)
                    for i in range(maddpg.nagents)
                ]
Example #12
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)
    
#     model_path = (Path('./models') / config.env_id / config.model_name /
#                   ('run%i' % config.run_num))
#     model_path = model_path / 'model.pt'
#     maddpg = MADDPG.init_runner_from_save(model_path)
    maddpg = MADDPG.init_from_env_with_delay(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  delay_step = 1)
    delay_step = 1
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] + delay_step*2 for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        
#         zero_agent_actions = [[[0, 0]].data.numpy() for _ in range(maddpg.nagents-1)]
        zero_agent_actions = [np.array([0.0, 0.0]) for _ in range(maddpg.nagents)]
        last_agent_actions = [zero_agent_actions for _ in range(delay_step)]
        for a_i, agent_obs in enumerate(obs[0]):
            for _ in range(len(last_agent_actions)):
                obs[0][a_i] = np.append(agent_obs, last_agent_actions[_][a_i])
                
        for et_i in range(config.episode_length):
#             print(obs)

#                     agent_obs = np.append(agent_obs, last_agent_actions[_][a_i])
#             print(np.concatenate(obs[0], np.array(last_agent_actions).T))
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # augment the obs
            # get actions as torch Variables
#             print(torch_obs)
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            
#             print(torch_agent_actions)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
#             print('1', agent_actions)
    #        actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # rearrange actions to be per environment
            if delay_step == 0:
                actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            else:
                agent_actions_tmp = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)][0]
#                 print('2', agent_actions_tmp)
                actions = last_agent_actions[0]
#                 print('3', actions)
#                 print('4', actions)
                last_agent_actions = last_agent_actions[1:]
                last_agent_actions.append(agent_actions_tmp)
#                 print('3', last_agent_actions)
#                 print('4', last_agent_actions)
#                 print('5', actions)
            actions = [actions]
            next_obs, rewards, dones, infos = env.step(actions)
#             print('6', actions)
            for a_i, agent_obs in enumerate(next_obs[0]):
                for _ in range(len(last_agent_actions)):
                    if a_i == 2:
                        next_obs[0][a_i] = np.append(agent_obs, 4*last_agent_actions[_][a_i])
                    else:
                        next_obs[0][a_i] = np.append(agent_obs, 3*last_agent_actions[_][a_i])
#             print('3', agent_actions)
            agent_actions[0] = agent_actions[0] * 3
            agent_actions[1] = agent_actions[1] * 3
            agent_actions[2] = agent_actions[2] * 4  # scale agent 2 by 4, matching the next_obs scaling above
#             print('2',agent_actions)
#             print('4', obs)
#             print('5', next_obs)
#             print('1',agent_actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
    

            
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents - 1): #do not update the runner
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
#                     maddpg.update_all_targets()
                    maddpg.update_adversaries()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            # logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i, {'reward': a_ep_rew}, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
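
This example keeps the action delay with manual list slicing (last_agent_actions = last_agent_actions[1:] followed by an append). The same FIFO expressed with collections.deque, shown only as an illustration of the pattern rather than the project's API:

from collections import deque

def make_delay_queue(zero_actions, delay_step):
    # FIFO holding the last delay_step joint actions, oldest first.
    return deque([zero_actions for _ in range(delay_step)], maxlen=delay_step)

def delayed_step(queue, new_actions):
    # Return the action to execute now and enqueue the freshly computed one.
    executed = queue[0] if queue else new_actions  # delay_step == 0 means no delay
    queue.append(new_actions)                      # maxlen drops the oldest entry
    return executed
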
Example #13
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    if config.load_adv:
        model_path = (Path('./models') / config.env_id / config.model_name /
                      ('run%i' % config.run_num))
        model_path = model_path / 'model.pt'
        maddpg = MADDPG.init_from_env_with_runner_delay_unaware(
            env,
            agent_alg=config.agent_alg,
            adversary_alg=config.adversary_alg,
            tau=config.tau,
            lr=config.lr,
            hidden_dim=config.hidden_dim,
            file_name=model_path)
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    delay_step = config.delay_step
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        maddpg.prep_rollouts(device='gpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        if config.env_id == 'simple_speaker_listener':
            zero_agent_actions = [
                np.array([[0, 0, 0]]),
                np.array([[0, 0, 0, 0, 0]])
            ]
        elif config.env_id == 'simple_spread':
            zero_agent_actions = [
                np.array([[0.0, 0.0, 0.0, 0.0, 0.0]])
                for _ in range(maddpg.nagents)
            ]
        elif config.env_id == 'simple_tag':
            zero_agent_actions = [
                np.array([0.0, 0.0]) for _ in range(maddpg.nagents)
            ]
        last_agent_actions = [zero_agent_actions for _ in range(delay_step)]

        for et_i in range(config.episode_length):
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            if config.load_adv:
                if delay_step == 0:
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                else:
                    agent_actions_tmp = [[
                        ac[i] for ac in agent_actions
                    ] for i in range(config.n_rollout_threads)][0][:]
                    actions = last_agent_actions[0]
                    actions.append(agent_actions_tmp[-1])
                    last_agent_actions = last_agent_actions[1:]
                    last_agent_actions.append(agent_actions_tmp[:2])
                actions = [actions]
                next_obs, rewards, dones, infos = env.step(
                    copy.deepcopy(actions))

            else:
                if delay_step == 0:
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                else:
                    actions = [[ac[i] for ac in last_agent_actions[0]]
                               for i in range(config.n_rollout_threads)]
                    last_agent_actions.pop(0)
                    last_agent_actions.append(agent_actions)

                next_obs, rewards, dones, infos = env.step(
                    copy.deepcopy(actions))
            print('1', obs, agent_actions, rewards, next_obs, dones)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)

            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    if config.load_adv:
                        for a_i in range(maddpg.nagents -
                                         1):  #do not update the runner
                            sample = replay_buffer.sample(config.batch_size,
                                                          to_gpu=USE_CUDA)
                            maddpg.update(sample, a_i, logger=logger)
    #                     maddpg.update_all_targets()
                        maddpg.update_adversaries()
                    else:
                        for a_i in range(
                                maddpg.nagents):  # update all agents
                            sample = replay_buffer.sample(config.batch_size,
                                                          to_gpu=USE_CUDA)
                            maddpg.update(sample, a_i, logger=logger)
                        maddpg.update_all_targets()
                maddpg.prep_rollouts(device='gpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i,
                               {'reward': a_ep_rew}, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
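
Most of these training loops anneal the exploration noise linearly from init_noise_scale down to final_noise_scale over n_exploration_eps episodes. The schedule in isolation:

def noise_scale(ep_i, init_scale, final_scale, n_exploration_eps):
    # Linear decay used above: init_scale at episode 0, final_scale afterwards.
    pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final_scale + (init_scale - final_scale) * pct_remaining

# e.g. noise_scale(0, 0.3, 0.0, 25000) == 0.3 and noise_scale(25000, 0.3, 0.0, 25000) == 0.0
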
Example #14
def run(config):
    device = torch.device('cuda' if USE_CUDA else 'cpu')
    print('Using device:', device)
    if device.type == 'cuda':
        print(torch.cuda.get_device_name(0))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    print(str(log_dir))
    logger = SummaryWriter(str(log_dir))
    #logger = None

    with open(run_dir / "hyperparameters.txt", "w") as f:
        f.write(str(config))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action, config.benchmark)
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim, 
                                  stochastic = config.stochastic, 
                                  commonCritic = config.commonCritic, gasil = config.gasil, dlr = config.dlr, lambda_disc = config.lambda_disc,
                                  batch_size_disc = config.batch_size_disc, dynamic=config.dynamic)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    expert_replay_buffer = PriorityReplayBuffer(config.expert_buffer_length, config.episode_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    agent_info = [[[] for i in range(config.n_rollout_threads)]]
    reward_info = []
    total_returns = []
    eval_trajectories = []
    expert_average_returns = []
    trajectories = []
    durations = []
    start_time = time.time()
    expert_trajectories = []
    evaluation_rewards = []
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        if ep_i%100 == 0:
            mins = (time.time() - start_time)/60
            durations.append(mins)
            print(mins, "minutes")
            start_time = time.time()

        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        current_episode = [[] for i in range(config.n_rollout_threads)]
        current_trajectory = [[] for i in range(config.n_rollout_threads)]
        current_entities = []
        total_dense = None
        if config.store_traj:
            cur_state_ent = env.getStateEntities()
            for i in range(config.n_rollout_threads):
                current_entities.append(cur_state_ent[i])
           
            cur_state = env.getState()
            for i in range(config.n_rollout_threads):
                current_trajectory[i].append(cur_state[i])
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            if config.store_traj:
                cur_state = env.getState()
                for i in range(config.n_rollout_threads):
                    current_trajectory[i].append(cur_state[i])

            for i in range(config.n_rollout_threads):
                current_episode[i].append([obs[i], actions[i]])
            
            if config.benchmark:
                #Fix this
                for i, info in enumerate(infos):
                    agent_info[-1][i].append(info['n'])

            if et_i == 0:
                total_dense = rewards
            else:
                total_dense = total_dense + rewards

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads and
                ((expert_replay_buffer.num_traj*config.episode_length >= config.batch_size_disc) == (maddpg.gasil))):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                if maddpg.gasil:
                    for update_i in range(config.num_disc_updates):
                        sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False)
                        
                        sample_expert = expert_replay_buffer.sample(config.batch_size_disc,
                                                    to_gpu=USE_CUDA)
                        maddpg.gasil_disc_update(sample_normal, sample_expert, 0, logger=logger, num_disc_permutations = config.num_disc_permutations)

                    for update_i in range(config.num_AC_updates):
                        sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False)
                        maddpg.gasil_AC_update(sample_normal, 0, episode_num = ep_i, logger=logger, num_AC_permutations = config.num_AC_permutations) 
                else:
                    for update_i in range(config.num_AC_updates):
                        sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False)
                        maddpg.update(sample_normal, 0, logger=logger, num_AC_permutations = config.num_AC_permutations)
                maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        total_returns.append(total_dense)
        if maddpg.gasil:
            expert_replay_buffer.push(current_episode, total_dense, config.n_rollout_threads, current_entities, current_trajectory, config.store_traj)
            expert_average_returns.append(expert_replay_buffer.get_average_return())
        
        if config.store_traj:
            for i in range(config.n_rollout_threads):
                trajectories.append([current_entities[i], current_trajectory[i]])

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/rew' % a_i,
                               {'mean_episode_rewards': a_ep_rew},
                               ep_i)
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        
        #save mean episode rewards
        #save benchmarking data
        agent_info.append([[] for i in range(config.n_rollout_threads)])
        reward_info.append(ep_rews)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
            #save the trajectories in the expert replay buffer 
            trajec = expert_replay_buffer.get_trajectories()
            if config.store_traj:
                expert_trajectories.append(trajec)
        
        if ep_i % config.eval_interval < config.n_rollout_threads:
            current_eval = []
            current_trajectories = []

            for ep_i_eval in range(0, config.n_eval_episodes, config.n_rollout_threads):
                obs = env.reset()
                total_eval = None
                maddpg.prep_rollouts(device='cpu')

                if config.store_traj:
                    current_trajectory = [[] for i in range(config.n_rollout_threads)]
                    current_entities = []
                    cur_state_ent = env.getStateEntities()
                    for i in range(config.n_rollout_threads):
                        current_entities.append(cur_state_ent[i])

                    cur_state = env.getState()
                    for i in range(config.n_rollout_threads):
                        current_trajectory[i].append(cur_state[i])

                for et_i in range(config.episode_length):
                    torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                        requires_grad=False)
                                for i in range(maddpg.nagents)]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
                    actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    if config.store_traj:
                        cur_state = env.getState()
                        for i in range(config.n_rollout_threads):
                            current_trajectory[i].append(cur_state[i])

                    
                    if et_i == 0:
                        total_eval = rewards
                    else:
                        total_eval = total_eval + rewards
                    obs = next_obs
                current_eval.append(total_eval)
                if config.store_traj:
                    for i in range(config.n_rollout_threads):
                        current_trajectories.append([current_entities[i], current_trajectory[i]])
            
            if config.store_traj:
                eval_trajectories.append(current_trajectories)
            evaluation_rewards.append(current_eval)
예제 #15
0
def run(config):
    model_dir = Path('./models') / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    env = gym.make("intersection-multiagent-v0")

    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    delay_step = config.delay_step
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='gpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
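        # Anneal exploration noise linearly from init_noise_scale down to
        # final_noise_scale over the first n_exploration_eps episodes.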
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        agent_obs = []
        for i in range(4):
            agent_obs.append(
                np.array([
                    obs[i % 4], obs[(i + 1) % 4], obs[(i + 2) % 4],
                    obs[(i + 3) % 4]
                ]).flatten())
        obs = np.array([agent_obs])
        zero_agent_actions = [1, 1, 1, 1]
        last_agent_actions = [zero_agent_actions for _ in range(delay_step)]
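        # last_agent_actions acts as a FIFO queue of length delay_step: an action chosen
        # at step t is executed delay_step steps later, and for the first delay_step
        # steps every agent executes the placeholder action index 1.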

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                torch.FloatTensor(np.vstack(obs[:, i]))
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            #             print(obs)
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # print(agent_actions)
            # rearrange actions to be per environment
            if delay_step == 0:
                actions = [np.argmax(agent_actions[i][0]) for i in range(4)]
            else:
                future_actions = [
                    np.argmax(agent_actions[i][0]) for i in range(4)
                ]
                actions = last_agent_actions[0]
                last_agent_actions = last_agent_actions[1:]
                last_agent_actions.append(future_actions)
            next_obs, rewards, dones, infos = env.step(actions)
            #             print(rewards)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            if dones[0][0]:
                break

            obs = next_obs
            t += config.n_rollout_threads
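            # Once the buffer holds a full batch, every steps_per_update env steps run one
            # update per agent on a freshly sampled batch, then soft-update all targets.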
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):

                    for a_i in range(
                            maddpg.nagents):  #do not update the runner
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                    maddpg.prep_rollouts(device='gpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            # logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i,
                               {'reward': a_ep_rew}, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
예제 #16
0
def run(config):
    # Make directory to store the results
    model_dir = Path('./models')/config.env_id/config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)

    # initialize tensorboard summary writer
    logger = SummaryWriter(str(log_dir))

    # use provided seed
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    # IDK how helpful this is
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)

    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  )
    if not rnn:    # TODO: this might break. code might not be modular (yet). Code works with RNN
        replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                     [obsp.shape[0] for obsp in env.observation_space],
                                     [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                      for acsp in env.action_space])
    else:
        # replay buffer obs space size is increased
        rnn_replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                     [obsp.shape[0]*history_steps for obsp in env.observation_space],
                                     [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                      for acsp in env.action_space])

        # This is just to store the global rewards and not for updating the policies
        g_storage_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                        [obsp.shape[0]*history_steps for obsp in env.observation_space],
                                        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                         for acsp in env.action_space])

    t = 0
    #####################################################################################################
    #                                       START EPISODES                                              #
    #####################################################################################################
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))

        # List of Observations for each of the agents
        # E.g., For simple_spread, shape is {1,3,18}
        obs = env.reset()

        # For RNN history buffer. I know this is not modular.
        obs_tminus_0 = copy(obs)
        obs_tminus_1 = copy(obs)
        obs_tminus_2 = copy(obs)
        obs_tminus_3 = copy(obs)
        obs_tminus_4 = copy(obs)
        obs_tminus_5 = copy(obs)
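        # A more compact alternative (just a sketch, not what this code uses) would keep
        # the rolling history in a collections.deque, e.g.:
        #     from collections import deque
        #     obs_hist = deque([copy(obs)] * history_steps, maxlen=history_steps)
        #     # each step: obs_hist.appendleft(copy(next_obs)); concatenate per agent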

        # # for 3 time-steps
        # obs_history = np.empty([1,3,54])
        # next_obs_history = np.empty([1,3,54])

        # For 6 time-steps (18*6 = 108)
        obs_history = np.empty([1, 3, 108])
        next_obs_history = np.empty([1, 3, 108])

        maddpg.prep_rollouts(device='cpu')

        # Exploration percentage remaining. IDK if this is a standard way of doing it however.
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        ##################################################################################################
        #                                       START TIME-STEPS                                         #
        ##################################################################################################

        for et_i in range(config.episode_length):

            # Populate current history
            for a in range(3):  # env.nagents
                obs_history[0][a][:] = np.concatenate((obs_tminus_0[0][a][:], obs_tminus_1[0][a][:], obs_tminus_2[0][a][:],
                                                      obs_tminus_3[0][a][:], obs_tminus_4[0][a][:], obs_tminus_5[0][a][:]))
                # Now, obs_history holds the last 6 timesteps of observations for each agent

            if not rnn:    # TODO: This might break. Code works with RNN. !RNN not tested.
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]

                # get actions (from learning algorithm) as torch Variables. For simple_spread this is discrete[5]
                torch_agent_actions = maddpg.step(torch_obs, explore=True)

            else:
                # rearrange histories to be per agent, and convert to torch Variable
                rnn_torch_obs = [Variable(torch.Tensor(np.vstack(obs_history[:, i])),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]
                # TODO: for RNN, actions should condition on history (DONE)
                torch_agent_actions = maddpg.step(rnn_torch_obs, explore=True)


            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]    # print(torch_agent_actions[0].data)
            # rearrange actions to be per environment. For single thread, it wont really matter.
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            ############### WHICH REWARD TO USE ##############
            # the rewards now contain global as well as difference rewards
            # Keep the global for logging, and difference for updates

            use_diff_reward = False    # TODO: choose whether updates use difference or global rewards

            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = [d_rewards]
            d_rewards = np.array(d_rewards)
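            # rewards[0][n] packs (global, difference) for agent n; index 1 selects the
            # difference reward, and the wrapping plus np.array gives shape (1, nagents, 1).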

            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = [g_rewards]
            g_rewards = np.array(g_rewards)

            # replace "reward" with the reward that you want to use
            if use_diff_reward:
                rewards = d_rewards
            else:
                rewards = g_rewards

            # Create history for next state
            '''
            history is [t, t-1, ..., t-5]
            history[0] indexes the single rollout thread
            '''
            for a in range(3):      # env.nagents
                next_obs_history[0][a][:] = np.concatenate((next_obs[0][a][:], obs_tminus_0[0][a][:], obs_tminus_1[0][a][:],
                                                            obs_tminus_2[0][a][:], obs_tminus_3[0][a][:], obs_tminus_4[0][a][:]))
                # Now, next_obs_history holds the 6-timestep history ending at the next state for each agent

            # for RNN, replay buffer needs to store for e.g., states=[obs_t-2, obs_t-1, obs_t]
            if not rnn:
                replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
                obs = next_obs
            else:
                # Buffer used for updates
                rnn_replay_buffer.push(obs_history, agent_actions, rewards, next_obs_history, dones)
                # push global rewards into g_replay_buffer
                g_storage_buffer.push(obs_history, agent_actions, g_rewards, next_obs_history, dones)

            # Update histories
            obs_tminus_5 = copy(obs_tminus_4)
            obs_tminus_4 = copy(obs_tminus_3)
            obs_tminus_3 = copy(obs_tminus_2)

            obs_tminus_2 = copy(obs_tminus_1)
            obs_tminus_1 = copy(obs_tminus_0)
            obs_tminus_0 = copy(next_obs)
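            # Slide the history window one step: each obs_tminus_k now holds what was in
            # obs_tminus_(k-1), and obs_tminus_0 holds the newest observation.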

            t += config.n_rollout_threads
            if (len(rnn_replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = rnn_replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # For plotting, use global reward achieved using difference rewards
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

    print()
예제 #17
0
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    env = make_env(config.env_id, discrete_action=False)
    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])
    
    agent_init_params = {'num_in_pol': env.observation_space[0].shape[0],
                        'num_out_pol': num_out_pol,
                        'num_vars': 3}
    maddpg = MADDPG(agent_init_params, 
                    nagents = 3,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)
    save_dict = torch.load(model_path)
    maddpg.agents.load_params(save_dict['agent_params'])
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')

        rnn_hidden = (torch.zeros(1, config.n_rollout_threads * maddpg.nagents * (maddpg.nagents - 1), config.hidden_dim),
                      torch.zeros(1, config.n_rollout_threads * maddpg.nagents * (maddpg.nagents - 1), config.hidden_dim))
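        # rnn_hidden appears to be an LSTM-style (h, c) pair with a single layer; the batch
        # dimension spans n_rollout_threads * nagents * (nagents - 1) entries, presumably
        # one per ordered agent pair.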
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions, new_rnn_hidden = maddpg.step(torch_obs, rnn_hidden, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.cpu().numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
            rnn_hidden = new_rnn_hidden
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()
예제 #18
0
        curr_run = 'run1'
    else:
        curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = model_dir / curr_run
log_dir = run_dir / 'logs'
os.makedirs(log_dir)
logger = SummaryWriter(str(log_dir))

torch.manual_seed(1024)
np.random.seed(1024)

env = make_parallel_env(env_id, n_rollout_threads, 1024, True)
maddpg = MADDPG.init_from_env(env,
                              agent_alg='MADDPG',
                              adversary_alg='MADDPG',
                              tau=0.01,
                              lr=0.01,
                              hidden_dim=64,
                              est_ac=True,
                              game_id='simple_speaker_listener')

replay_buffer = ReplayBuffer(
    buffer_length, maddpg.nagents,
    [obsp.shape[0] for obsp in env.observation_space], [
        acsp.shape[0] if isinstance(acsp, Box) else acsp.n
        for acsp in env.action_space
    ])

t = 0
#for ep_i in range(0, n_episodes, n_rollout_threads):
for ep_i in range(0, 10, 1):
    #print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + n_rollout_threads, n_episodes))
예제 #19
0
File: main.py Project: yathartha3/DPP
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    ##################### INITIALIZE FROM SAVED? ###########################
    if init_from_saved:
        if model_path is not None:
            maddpg = MADDPG.init_from_save(model_path)
            print("Initialized from saved model")
    # -------------------------------------------------------------------- #
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)
    # used for learning (updates)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    # This is just to store the global rewards and not for updating the policies
    g_storage_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions, maddpg)
            '''
            Reward Shaping using D++, D.
            The rewards now contain global as well as shaped rewards
            Keep the global for logging, and use the shaped rewards for updates
            '''
            # Choose which reward to use
            use_dpp = True

            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = [d_rewards]
            d_rewards = np.array(d_rewards)

            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = [g_rewards]
            g_rewards = np.array(g_rewards)

            if use_dpp:
                rewards = d_rewards
            else:
                rewards = g_rewards
            # ----------------------------------------------------------- #
            # Buffer used for updates
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            # push global rewards into g_replay_buffer for plotting
            g_storage_buffer.push(obs, agent_actions, g_rewards, next_obs,
                                  dones)

            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # Take out global reward from g_storage_buffer
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)

        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
예제 #20
0
with open(config_file, 'r') as f:
    args_dict = yaml.safe_load(f)
    args_dict['n_agents'] = args_dict['n_pursuers']
    # config = yaml.load(f, Loader=loader)
    args = SN(**args_dict)

if args.seed is not False:
    th.manual_seed(args.seed)

env = Env(**args_dict)

if args.seed is not False:
    env.seed(args.seed)

for m in pool_list:
    maddpg = MADDPG.init_from_save(result_folder + '/model/' + m + '.pt', with_cpu=True)
    maddpg.prep_rollouts(device='cpu')
    with th.no_grad():
        total_reward = 0.
        test_time = 5
        for it in range(test_time):
            l = []
            obs = env.reset()
            l.append(env.render(gui=True))
            obs = np.stack(obs, axis=0)
            obs = th.from_numpy(obs).float()
            print('----------')
            reward_it = 0.
            for t in range(args.test_max_steps):
                obs = obs.type(th.FloatTensor)
                actions = maddpg.step(obs, explore=False)
예제 #21
0
def run(config):
    model_dir = Path('./models') / config.env_name / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    os.system("cp shape.txt {}".format(run_dir))
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # number of CPU threads to use for training
    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)

    # processes for parallel env sampling

    env = make_parallel_env(config.num_agents, config.n_rollout_threads,
                            run_num, config.shape_file)
    #'''
    maddpg = MADDPG.init_from_env(env=env,
                                  agent_alg=config.agent_alg,
                                  cripple_alg=config.cripple_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  discrete_action=config.discrete_action)
    #'''
    #maddpg = MADDPG.init_from_save(model_dir/'run1'/'model.pt')

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    t = 0
    a_loss = []
    c_loss = []
    rewss = []

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))

        obs = env.reset()

        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')  # show for the first time

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        #if config.display:
        #    for env_show in env.envs:
        #        env_show.render('human', close=False)

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            #actions = [np.array([i.tolist().index(1.0) for i in action]) for action in actions_one_hot]

            for i in actions:
                #    print(i)
                for j in i:
                    j[1] *= np.pi
            #print(actions[0])

            next_obs, rewards, dones, infos = env.step(actions)

            #print(len(agent_actions),len(next_obs))
            #if config.display:
            #    for env_show in env.envs:
            #        env_show.render('human', close=False)

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                #print(t)
                if config.use_cuda:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_cuda,
                                                      norm_rews=True)
                        maddpg.update(sample,
                                      a_i,
                                      logger=logger,
                                      actor_loss_list=a_loss,
                                      critic_loss_list=c_loss)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        rewss.append(ep_rews)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
            # print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            maddpg.save(
                str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            maddpg.save(str(run_dir / 'model.pt'))
    maddpg.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
예제 #22
0
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    #logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)

    if config.env_id == 'simple_reference':
        for i in range(2):
            agent_init_params.append({'num_in_pol': num_in_pol,
                                      'num_out_pol': num_out_pol,
                                      'num_in_critic': num_in_critic})

            init_dict = {'gamma': gamma, 'tau': tau, 'lr': lr,
                         'hidden_dim': hidden_dim,
                         'alg_types': alg_types,
                         'agent_init_params': agent_init_params,
                         'discrete_action': discrete_action}

    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)

    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0

    episode_average_rewards=[]
    hundred_episode_average_rewards=[]

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):

        if (ep_i%100==0 and ep_i>0):
            hundred_episode_average_rewards.append(np.mean(episode_average_rewards))
            print('Rewards till',ep_i,'=',hundred_episode_average_rewards[-1])
            print('Agent Actions=',torch_agent_actions)
            episode_average_rewards=[]
        '''
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        '''
        obs = env.reset()

        rewards_for_this_episode=[]
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            rewards_for_this_episode.append(np.mean(rewards))

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i)#, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            
            if ep_i>10000:
                print('Goal Color=',torch_obs[0])
                print('Communication=',agent_actions[0])
            
                env.render()
                time.sleep(0.01)


        if ep_i>100000:
            import ipdb
            ipdb.set_trace()

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        
        episode_average_rewards.append(np.sum(rewards_for_this_episode))
        #for a_i, a_ep_rew in enumerate(ep_rews):
            #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    plt.plot(100 * np.array(range(1, config.n_episodes // 100)), hundred_episode_average_rewards)
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward for 100 episodes')
    plt.title('Speaker Discrete and Mover Continuous')
    plt.savefig('plot.png')
    plt.show()

    maddpg.save(run_dir / 'model.pt')
    env.close()
예제 #23
0
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  noisy_sharing=True,
                                  noisy_SNR=config.noisy_SNR,
                                  game_id=config.env_id,
                                  est_ac=config.est_action)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    print(
        '#########################################################################'
    )
    print('Adversary using: ', config.adversary_alg, 'Good agent using: ',
          config.agent_alg, '\n')
    print('Noisy SNR is: ', config.noisy_SNR)
    print(
        '#########################################################################'
    )
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        if ep_i % 5000 == 0:
            maddpg.lr *= 0.5
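            # Halve the stored learning rate every 5000 episodes (whether this actually
            # propagates to the optimizers depends on the MADDPG implementation).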
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()

                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Episodes %i-%i of %i, rewards are: \n" %
                  (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
                   config.n_episodes))
            for a_i, a_ep_rew in enumerate(ep_rews):
                print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        # *** perform validation every 1000 episodes. i.e. run N=10 times without exploration ***
        if ep_i % config.validate_every_n_eps == config.validate_every_n_eps - 1:
            # assumes only one env is running
            episodes_stats = []
            info_for_one_env_among_timesteps = []
            print('*' * 10, 'Validation BEGINS', '*' * 10)
            for valid_et_i in range(config.run_n_eps_in_validation):
                obs = env.reset()
                maddpg.prep_rollouts(device='cpu')
                explr_pct_remaining = max(0, config.n_exploration_eps -
                                          ep_i) / config.n_exploration_eps
                maddpg.scale_noise(
                    config.final_noise_scale +
                    (config.init_noise_scale - config.final_noise_scale) *
                    explr_pct_remaining)
                maddpg.reset_noise()

                curr_episode_stats = []
                for et_i in range(config.episode_length):
                    # rearrange observations to be per agent, and convert to torch Variable
                    torch_obs = [
                        Variable(torch.Tensor(np.vstack(obs[:, i])),
                                 requires_grad=False)
                        for i in range(maddpg.nagents)
                    ]
                    # get actions as torch Variables
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    # convert actions to numpy arrays
                    agent_actions = [
                        ac.data.numpy() for ac in torch_agent_actions
                    ]
                    # rearrange actions to be per environment
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)

                    info_for_one_env_among_timesteps.append(infos[0]['n'])

                    curr_episode_stats.append(infos[0]['n'])

                    obs = next_obs
                episodes_stats.append(curr_episode_stats)

            print('Summary statistics:')
            if config.env_id == 'simple_tag':
                # avg_collisions = sum(map(sum,info_for_one_env_among_timesteps))/config.run_n_eps_in_validation
                episodes_stats = np.array(episodes_stats)
                # print(episodes_stats.shape)
                # validation logging
                with open(f'{config.model_name}.log', 'a') as valid_logfile:
                    valid_logwriter = csv.writer(valid_logfile, delimiter=' ')
                    valid_logwriter.writerow(
                        np.sum(episodes_stats, axis=(1, 2)).tolist())
                avg_collisions = np.sum(
                    episodes_stats) / episodes_stats.shape[0]
                print(f'Avg of collisions: {avg_collisions}')
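                # avg_collisions: total collision count summed over all validation
                # episodes, timesteps and agents, divided by the number of episodes.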

            elif config.env_id == 'simple_speaker_listener':
                for i, stat in enumerate(info_for_one_env_among_timesteps):
                    print(f'ep {i}: {stat}')
            else:
                raise NotImplementedError
            print('*' * 10, 'Validation ENDS', '*' * 10)

        # *** END of VALIDATION ***

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
예제 #24
0
File: main.py Project: xuezzee/-
def run(config):
    scores_window = deque(maxlen=100)

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,  # number of players
        'board_width': 11,  # map width
        'board_height': 11,  # map height
        'n_cell_type': 5,  # number of cell types
        'materials': 4,  # number of material depots
        'cars': 2,  # number of cars
        'planes': 0,  # number of planes
        'barriers': 12,  # number of fixed obstacles
        'max_step': 500,  # maximum number of steps
        'game_name': name,  # game name
        'K': 5,  # depot material amounts are refreshed every K rounds
        'map_path': 'env/map.txt',  # path to the initial map
        'cell_range': 6,  # value range of each dimension in a cell (tuple; a single int is promoted to a tuple) ##?
        'ob_board_width': None,  # grid width observed by each agent (tuple); None means same as the actual grid ##?
        'ob_board_height': None,  # grid height observed by each agent (tuple); None means same as the actual grid ##?
        'ob_cell_range': None,  # per-cell value ranges observed by each agent (2-D tuple); None means same as the actual grid ##?
    }

    env = make_parallel_env_transport(config.env_id, conf,
                                      config.n_rollout_threads, config.seed,
                                      config.discrete_action)

    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        score = 0
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))

        obs = env.reset()  # TODO: TO CHECK
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # print('step', et_i)
            # env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            # print('step', et_i)
            # print(maddpg.nagents)
            torch_obs = [
                Variable(
                    torch.Tensor(np.vstack(obs[:, i])),  # stack the per-thread observations vertically
                    requires_grad=False) for i in range(maddpg.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            ############################################
            # add
            # actions = actions.astype(int)
            ############################################
            # add: actions for the first two players
            joint_action = []

            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    # idx = np.random.randint(11)
                    each[3] = 1
                    player.append(each)
                joint_action.append(player)
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])
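            # joint_action now holds two hard-coded one-hot actions (index 3) for the first
            # two players, followed by the two learned car actions cast to int lists.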

            next_obs, rewards, dones, infos = env.step(joint_action)

            #################################
            agents_action = actions[0]
            #################################

            replay_buffer.push(obs, agents_action, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

            score += rewards[0][0]

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        scores_window.append(score)
        reward_epi = np.mean(scores_window)
        reward_epi_var = np.var(scores_window)
        logger.add_scalar('results/completion_window_mean', reward_epi, ep_i)
        logger.add_scalar('results/completion_window_var', reward_epi_var, ep_i)
        print(
            '\r Episode {}\t Average Reward: {:.3f}\t Var Reward: {:.3f} \t '.
            format(ep_i, reward_epi, reward_epi_var))

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
예제 #25
0
def run(args, **args_dict):
    reward_flag, pos_flag = None, None
    save_data = {'reward': -1000., 'pos': 0.}
    # model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     curr_run = 'run1'
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         curr_run = 'run1'
    #     else:
    #         curr_run = 'run%i' % (max(exst_run_nums) + 1)
    # run_dir = model_dir / curr_run
    # log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)

    th.manual_seed(args.seed)
    np.random.seed(args.seed)
    if not args.use_cuda or not th.cuda.is_available():
        # th.set_num_threads(args.n_training_threads)
        FloatTensor = th.FloatTensor
    else:
        FloatTensor = th.cuda.FloatTensor
    env = make_parallel_env(**args_dict)
    maddpg = MADDPG.init_from_env(env, args)
    replay_buffer = ReplayBuffer(
        args.capacity, args.n_agents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
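    # t counts total environment steps taken across all rollout threads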
    t = 0
    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        ttt = time.time()
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        if args.use_cuda and th.cuda.is_available():
            maddpg.prep_rollouts(device='gpu')
        else:
            maddpg.prep_rollouts(device='cpu')
        # maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, args.n_exploration_eps - ep_i) / args.n_exploration_eps
        scale_noise_i = args.final_noise_scale + (
            args.init_noise_scale -
            args.final_noise_scale) * explr_pct_remaining
        maddpg.scale_noise(scale_noise_i)
        maddpg.reset_noise()

        print("Episodes %i-%i of %i, replay: %.2f, explore: %.2f" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes,
               float(len(replay_buffer)) / replay_buffer.max_steps,
               scale_noise_i))

        for et_i in range(args.max_steps):
            ttt = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                th.from_numpy(np.vstack(obs[:, i])).type(FloatTensor)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [
                ac.detach().cpu().numpy() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(args.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += args.n_rollout_threads
            #
            # ttt2 = time.time()
            # print('1', ttt2 - ttt)
            #
            if (len(replay_buffer) >= args.batch_size
                    and (t % args.steps_per_update) < args.n_rollout_threads):
                ttt = time.time()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                # for u_i in range(args.n_rollout_threads):
                for a_i in range(maddpg.nagents):
                    sample = replay_buffer.sample(args.batch_size,
                                                  to_gpu=args.use_cuda
                                                  and th.cuda.is_available(),
                                                  norm_rews=args.norm_rews)
                    _, _, _ = maddpg.update(sample, a_i)
                maddpg.update_all_targets()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_rollouts(device='gpu')
                else:
                    maddpg.prep_rollouts(device='cpu')
                # maddpg.prep_rollouts(device='cpu')
                #
                # ttt2 = time.time()
                # print('2', ttt2 - ttt)
                #
        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % args.test_interval < args.n_rollout_threads:
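            # periodic evaluation: greedy rollouts (explore=False) to track test reward and progress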
            ttt = time.time()
            obs = env.reset()
            if args.use_cuda and th.cuda.is_available():
                maddpg.prep_rollouts(device='gpu')
            else:
                maddpg.prep_rollouts(device='cpu')
            # maddpg.prep_rollouts(device='cpu')
            with th.no_grad():
                pos_total = 0.
                finish_ep = np.zeros(args.n_rollout_threads)
                r_total = np.zeros((args.n_rollout_threads, args.n_agents))
                record_r = np.zeros(args.n_agents)
                for eval_i in range(args.max_steps):
                    torch_obs = [
                        FloatTensor(np.vstack(obs[:, i]))
                        for i in range(maddpg.nagents)
                    ]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [
                        ac.detach().cpu().numpy() for ac in torch_agent_actions
                    ]
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(args.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    r_total += rewards
                    obs = next_obs
                    for d_i in range(dones.shape[0]):
                        if dones[d_i] or (eval_i == args.max_steps - 1
                                          and finish_ep[d_i] == 0.):
                            # if eval_i == args.max_steps - 1 and finish_ep[d_i] == 0.:
                            #     print(d_i)
                            pos_total += infos[d_i]['pos']
                            record_r += r_total[d_i]
                            r_total[d_i] = 0.
                            finish_ep[d_i] += 1
                record_r /= finish_ep.sum()
                pos_total /= finish_ep.sum()

                # ttt2 = time.time()
                # print('3', ttt2 - ttt)
                #

                new_path = model_path + '/' + str(ep_i) + '.pt'
                has_saved = False
                if record_r.sum() > save_data['reward']:
                    save_data['reward'] = record_r.sum()
                    if save_data['reward'] > 0 and pos_total > 10.:
                        # pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                        maddpg.save(new_path)
                        has_saved = True
                if pos_total > save_data['pos']:
                    save_data['pos'] = pos_total
                    if record_r.sum() > 0 and pos_total > 10. and not has_saved:
                        # pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                        maddpg.save(new_path)
                if pos_total > 17.0:
                    maddpg.save(new_path)

                if reward_flag is None:
                    reward_flag = vis.line(
                        X=np.arange(ep_i, ep_i + 1),
                        Y=np.array([np.append(record_r, record_r.sum())]),
                        opts=dict(ylabel='Test Reward',
                                  xlabel='Episode',
                                  title='Reward',
                                  legend=[
                                      'Agent-%d' % i
                                      for i in range(args.n_agents)
                                  ] + ['Total']))
                else:
                    vis.line(X=np.array(
                        [np.array(ep_i).repeat(args.n_agents + 1)]),
                             Y=np.array([np.append(record_r, record_r.sum())]),
                             win=reward_flag,
                             update='append')

                if pos_flag is None:
                    pos_flag = vis.line(X=np.arange(ep_i, ep_i + 1),
                                        Y=np.array([pos_total]),
                                        opts=dict(ylabel='Length',
                                                  xlabel='Episode',
                                                  title='How far ?',
                                                  legend=['position']))
                else:
                    vis.line(X=np.array([ep_i]),
                             Y=np.array([pos_total]),
                             win=pos_flag,
                             update='append')
        # if ep_i % config.save_interval < config.n_rollout_threads:
        #     os.makedirs(run_dir / 'incremental', exist_ok=True)
        #     maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
        #     maddpg.save(run_dir / 'model.pt')

    # maddpg.save(run_dir / 'model.pt')
    env.close()
Example #26
0
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    shape_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num) / config.shape_file)
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(model_path)
    #env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    env = HeavyObjectEnv(num_agents=config.num_agents, shape_file=shape_path)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render()
        for t_i in range(config.episode_length):
            calc_start = time.time()

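            # mid-episode perturbation of the object's centroid (presumably to test policy robustness)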
            if t_i == 15:
                env.change_centroid(0.3, 0.3)
                print("change centroid!")

            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions_one_hot = [ac.data.numpy().flatten() for ac in torch_actions]
            #print(actions_one_hot)
            actions = np.array([int(np.argmax(a)) for a in actions_one_hot])
            #print(actions,len(actions))
            #print(env._state,env._last_value)
            print(t_i)

            #for j in actions:
            #    j[1]*=np.pi
            #print(actions,"new")
            obs, rewards, dones, infos = env.step(actions)
            #print(dones)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render()
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()
Example #27
0
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = (model_path.parent / 'gifs' if not config.mixed_policies
                    else model_path.parent / 'gifs_mixed')
        gif_path.mkdir(exist_ok=True)
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if config.mixed_policies:
        maddpg = MADDPG.init_from_directory(
            Path('./models') / config.env_id / config.model_name)
    else:
        maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id,
                   benchmark=True,
                   discrete_action=maddpg.discrete_action)
    env.world.seed(config.seed)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval
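    # per-step benchmark info and agent positions, written out below when config.save_stats is set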
    all_infos = np.zeros(
        (config.n_episodes, config.episode_length, maddpg.nagents, 10))
    all_positions = np.zeros(
        (config.n_episodes, config.episode_length, maddpg.nagents, 2))
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                if not obs[i].ndim == 4 else Variable(torch.Tensor(obs[i]),
                                                      requires_grad=False)
                for i in range(maddpg.nagents)
            ]

            all_positions[ep_i, t_i] = env.get_positions()
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)

            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
                # frames.append(env.world.viewers[0].render(return_rgb_array = True)) uncomment if local views visible
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
            if len(np.array(infos['n']).shape) < 4:
                all_infos[ep_i,
                          t_i, :, :len(infos['n'][-1])] = np.array(infos['n'])

        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames,
                            duration=ifi)

    env.close()

    if config.save_stats:
        stats_path = (model_path.parent / 'stats' if not config.mixed_policies
                      else model_path.parent / 'stats_mixed')
        stats_path.mkdir(exist_ok=True)
        save(f'{stats_path}/all_infos.npy', all_infos)
        save(f'{stats_path}/all_positions.npy', all_positions)
Example #28
0
def run(config):
    original_model_path = (Path('./models') / config.env_id /
                           config.model_name / ('run%i' % config.run_num))
    # if config.incremental is not None:
    #     model_path = model_path / 'incremental' / ('model_ep%i.pt' %
    #                                                config.incremental)
    # else:
    #     model_path = model_path / 'model.pt'
    #
    # print(model_path)

    ###########################################################################
    #                      FORCE MODEL PATH                                   #
    ###########################################################################
    model_path_list = []
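    # episode numbers of the saved incremental checkpoints to evaluate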
    rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]

    # FOR EACH MODEL, DO STATISTICAL RUNS
    # for r in rrange:
    #     model_path = model_path / 'incremental' / ('model_ep%i.pt' % r)

    ######################  SAVING STAT RUNS FOR EACH MODEL ###################
    stat_run_all_models = []

    for r in rrange:
        model_path = original_model_path / 'incremental' / ('model_ep%i.pt' %
                                                            r)
        if config.save_gifs:
            gif_path = model_path.parent / 'gifs'
            gif_path.mkdir(exist_ok=True)

        maddpg = MADDPG.init_from_save(model_path)
        env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
        maddpg.prep_rollouts(device='cpu')
        ifi = 1 / config.fps  # inter-frame interval

        #####################################################################################################
        #                             CONFIGURATION FOR STATISTICAL RUNS (EPISODES)
        #####################################################################################################
        #####################################################################################################
        #                                       START EPISODES                                              #
        #####################################################################################################
        stat_return_list = []
        for ep_i in range(config.n_episodes):  # number of stat runs
            print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
            obs = env.reset()
            # For RNN history buffer
            obs_tminus_0 = copy(obs)
            obs_tminus_1 = copy(obs)
            obs_tminus_2 = copy(obs)
            obs_tminus_3 = copy(obs)
            obs_tminus_4 = copy(obs)
            obs_tminus_5 = copy(obs)

            # TODO: obs_history shape different from main.py, so parameterize it based on "obs"
            # It is different because main.py can run multiple threads, so has an extra dimension
            obs_history = np.empty([3, 108])
            next_obs_history = np.empty([3, 108])

            if config.save_gifs:
                frames = []
                frames.append(env.render('rgb_array')[0])
            #env.render('human')

            ##################################################################################################
            #                                       START TIME-STEPS                                         #
            ##################################################################################################
            episode_reward = 0
            for t_i in range(config.episode_length):

                # Populate current history for RNN
                for a in range(3):  # env.nagents
                    #obs_history[a][:] = np.concatenate((obs_tminus_0[a][:], obs_tminus_1[a][:], obs_tminus_2[a][:]))
                    obs_history[a][:] = np.concatenate(
                        (obs_tminus_0[a][:], obs_tminus_1[a][:],
                         obs_tminus_2[a][:], obs_tminus_3[a][:],
                         obs_tminus_4[a][:], obs_tminus_5[a][:]))
                    # Now, temp has history of 6 timesteps for each agent

                calc_start = time.time()

                # rearrange observations to be per agent, and convert to torch Variable
                rnn_torch_obs = [
                    Variable(torch.Tensor(obs_history[i]).view(1, -1),
                             requires_grad=False)
                    for i in range(maddpg.nagents)
                ]
                # get actions as torch Variables
                torch_actions = maddpg.step(rnn_torch_obs, explore=False)
                # convert actions to numpy arrays
                actions = [ac.data.numpy().flatten() for ac in torch_actions]
                next_obs, rewards, dones, infos = env.step(actions)

                # get the global reward
                episode_reward += rewards[0][0]

                # Update histories
                obs_tminus_5 = copy(obs_tminus_4)
                obs_tminus_4 = copy(obs_tminus_3)
                obs_tminus_3 = copy(obs_tminus_2)
                obs_tminus_2 = copy(obs_tminus_1)
                obs_tminus_1 = copy(obs_tminus_0)
                obs_tminus_0 = copy(next_obs)
                # --------------------------------------#

                if config.save_gifs:
                    frames.append(env.render('rgb_array')[0])
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                #env.render('human')
                # end of an episode

            if config.save_gifs:
                gif_num = 0
                while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                    gif_num += 1
                imageio.mimsave(str(gif_path / ('%i_%i.gif' %
                                                (gif_num, ep_i))),
                                frames,
                                duration=ifi)
            # end of episodes (one stat-run)
            stat_return_list.append(episode_reward / config.episode_length)
        # end of model
        stat_run_all_models.append(stat_return_list)
        env.close()

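    # pickle the per-checkpoint lists of average episode returns for offline analysis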
    with open(str(original_model_path) + "/stat_runs", "wb") as pickling_on:
        pkl.dump(stat_run_all_models, pickling_on)
Example #29
0
File: transport_main.py Project: xuezzee/-
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Example #30
0
def run_main(run_num):
    config = Arglist()
    run_manager = running_env_manager(MODE)
    run_manager.prep_running_env(config, run_num)

    if not config.USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config)
    eval_env = make_parallel_env(config)

    maddpg = MADDPG.init_from_env(env, config)
    if config.use_IL:
        IL_controller = IL_Controller(config)  # imitation learning controller
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp['comm'].n + acsp['act'].n if config.discrete_action else acsp['comm'].n +
                                                                                                acsp['act'].shape[0]
                                  for acsp in env.action_space])
    t = 0
    # reset test results arrays
    all_ep_rewards = []
    mean_ep_rewards = []
    start_time = time.time()
    step = 0
    win_counter = 0
    curr_ep = -1
    eval_win_rates = [0]
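    # win rates from the periodic evaluations; seeded with 0 so the progress bar has a value before the first eval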
    # eps_without_IL = 0
    # eps_without_IL_hist = []
    print("\nPrey Max Speed: {}, useIL is {}\n".format(config.prey_max_speed, config.use_IL))
    while step < config.n_time_steps:  # total steps to be performed during a single run
        # start a new episode (the previous one ended via termination/done)
        curr_ep += 1
        ep_rewards = np.zeros((1, len(env.agent_types)))  # init reward vec for single episode.

        # prepare episodic stuff
        obs = env.reset()
        # maddpg.prep_rollouts(device=config.device)
        maddpg.prep_rollouts(device=config.device)
        explr_pct_remaining = max(0, config.n_exploration_steps - step) / config.n_exploration_steps
        maddpg.scale_noise(
            config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        for ep_step in range(config.episode_length):  # single-episode loop; ends at episode_length or on termination/done
            # env.env._render("human", False)
            # time.sleep(0.05)
            if step == config.n_time_steps: break
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, ind])),
                                  requires_grad=False)
                         for ind in range(maddpg.nagents)]
            # get actions as torch Variables
            with torch.no_grad():
                torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            # agent_actions = [ac.detach().cpu().data.numpy() for ac in torch_agent_actions]
            agent_actions = [ac.cpu().data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[idx] for ac in agent_actions] for idx in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            if (len(replay_buffer) >= config.batch_size and
                    (step % config.steps_per_eval) < config.n_rollout_threads):  # perform evaluation
                eval_win_rates.append(eval_model(maddpg, eval_env, config.episode_length, config.num_steps_in_eval,
                                                 config.n_rollout_threads, display=False))

            if (len(replay_buffer) >= config.batch_size and
                    (step % config.steps_per_update) < config.n_rollout_threads):  # perform training
                train_model(maddpg, config, replay_buffer)

            step += config.n_rollout_threads  # advance the step-counter

            if (len(replay_buffer) >= config.batch_size and config.use_IL and
                    (step % config.IL_inject_every) < config.n_rollout_threads):  # perform IL injection
                step, eval_win_rates = \
                    IL_controller.IL_inject(maddpg, replay_buffer, eval_env, step, config, eval_win_rates)
                IL_controller.decay()

            ep_rewards += rewards
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)

            if dones.any():  # terminate the episode early if won
                win_counter += 1
                # eps_without_IL += 1
                break
            obs = next_obs
        # perform IL injection if failed
        # if config.use_IL and ep_step == config.episode_length-1 and not dones.any():
        #     step, eval_win_rates = \
        #         IL_controller.IL_inject(maddpg, replay_buffer, eval_env, step, config, eval_win_rates)
        #     eps_without_IL_hist.append(eps_without_IL)
        #     eps_without_IL = 0

        mean_ep_rewards.append(ep_rewards / config.episode_length)
        all_ep_rewards.append(ep_rewards)

        if step % 100 == 0 or (step == config.n_time_steps):  # print progress.
            run_manager.printProgressBar(step, start_time, config.n_time_steps, "run" + str(run_num) + ": Steps Done: ",
                                         " Last eval win rate: {0:.2%}".format(eval_win_rates[-1]), 20, "%")

        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        # if ep_i % config.save_interval < config.n_rollout_threads:
        #     os.makedirs(run_dir / 'incremental', exist_ok=True)
        #     maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
        #     maddpg.save(run_dir / 'model.pt')

    # eps_without_IL_hist.append(eps_without_IL)
    if MODE == "RUN":
        run_dir = run_manager.run_dir
        np.save(run_dir / 'episodes_rewards', {"tot_ep_rewards": all_ep_rewards.copy(),
                                               "mean_ep_rewards": mean_ep_rewards.copy()}, True)
        # np.save(run_dir / 'IL_hist', eps_without_IL_hist, True)
        np.save(run_dir / 'win_rates', eval_win_rates, True)
        maddpg.save(run_dir / 'model.pt')
    # env.close()
    # logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    # logger.close()
    return run_num