Example #1
def init_from_config(env, config, logger):
    """
    Initializes the algorithm from a config dict. Handles DDPG-like and SAC-like models,
    but not a mixture of the two (i.e. some agents using DDPG while others use SAC).
    :param env:
    :param config:
    :param logger:
    :return: algorithm
    """
    sup_algos = config.agent_alg in SUPPORTED_ALGOS

    # Initializes agents
    if sup_algos:
        algorithm = MADDPG.init_from_env(env,
                                         agent_alg=config.agent_alg,
                                         adversary_alg=config.adversary_alg,
                                         tau=config.tau,
                                         gamma=config.gamma,
                                         lr=config.lr,
                                         lr_fe_coef=config.lr_fe_coef,
                                         lr_critic_coef=config.lr_critic_coef,
                                         grad_clip_value=config.grad_clip_value,
                                         hidden_dim=config.hidden_dim,
                                         weight_decay=config.weight_decay,
                                         discrete_exploration_scheme=config.discrete_exploration_scheme,
                                         boltzmann_temperature=config.boltzmann_temperature,
                                         feature_extractor=config.feature_extractor,
                                         logger=logger)
    else:
        raise ValueError('Algorithm %s is not supported' % config.agent_alg)

    return algorithm
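
A minimal usage sketch for init_from_config (not part of the original): the SimpleNamespace below stands in for a real config object, its field values are illustrative only, and env, logger and SUPPORTED_ALGOS are assumed to come from the surrounding module.

from types import SimpleNamespace

# Hypothetical config; field names mirror the keyword arguments consumed above.
config = SimpleNamespace(
    agent_alg='MADDPG', adversary_alg='MADDPG',   # assumed to be in SUPPORTED_ALGOS
    tau=0.01, gamma=0.95, lr=0.01,
    lr_fe_coef=1.0, lr_critic_coef=1.0,
    grad_clip_value=0.5, hidden_dim=64, weight_decay=0.0,
    discrete_exploration_scheme='epsilon_greedy',
    boltzmann_temperature=1.0, feature_extractor=None)

algorithm = init_from_config(env, config, logger)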
Example #2
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  noisy_sharing=True,
                                  noisy_SNR=config.noisy_SNR,
                                  game_id=config.env_id,
                                  est_ac=config.est_action)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    print(
        '#########################################################################'
    )
    print('Adversary using: ', config.adversary_alg, 'Good agent using: ',
          config.agent_alg, '\n')
    print('Noisy SNR is: ', config.noisy_SNR)
    print(
        '#########################################################################'
    )
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        if ep_i % 5000 == 0:
            maddpg.lr *= 0.5
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()

                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Episodes %i-%i of %i, rewards are: \n" %
                  (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
                   config.n_episodes))
            for a_i, a_ep_rew in enumerate(ep_rews):
                print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        # *** perform validation every 1000 episodes. i.e. run N=10 times without exploration ***
        if ep_i % config.validate_every_n_eps == config.validate_every_n_eps - 1:
            # assumes only a single env is running
            episodes_stats = []
            info_for_one_env_among_timesteps = []
            print('*' * 10, 'Validation BEGINS', '*' * 10)
            for valid_et_i in range(config.run_n_eps_in_validation):
                obs = env.reset()
                maddpg.prep_rollouts(device='cpu')
                explr_pct_remaining = max(0, config.n_exploration_eps -
                                          ep_i) / config.n_exploration_eps
                maddpg.scale_noise(
                    config.final_noise_scale +
                    (config.init_noise_scale - config.final_noise_scale) *
                    explr_pct_remaining)
                maddpg.reset_noise()

                curr_episode_stats = []
                for et_i in range(config.episode_length):
                    # rearrange observations to be per agent, and convert to torch Variable
                    torch_obs = [
                        Variable(torch.Tensor(np.vstack(obs[:, i])),
                                 requires_grad=False)
                        for i in range(maddpg.nagents)
                    ]
                    # get actions as torch Variables
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    # convert actions to numpy arrays
                    agent_actions = [
                        ac.data.numpy() for ac in torch_agent_actions
                    ]
                    # rearrange actions to be per environment
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)

                    info_for_one_env_among_timesteps.append(infos[0]['n'])

                    curr_episode_stats.append(infos[0]['n'])

                    obs = next_obs
                episodes_stats.append(curr_episode_stats)

            print('Summary statistics:')
            if config.env_id == 'simple_tag':
                # avg_collisions = sum(map(sum,info_for_one_env_among_timesteps))/config.run_n_eps_in_validation
                episodes_stats = np.array(episodes_stats)
                # print(episodes_stats.shape)
                # validation logging
                with open(f'{config.model_name}.log', 'a') as valid_logfile:
                    valid_logwriter = csv.writer(valid_logfile, delimiter=' ')
                    valid_logwriter.writerow(
                        np.sum(episodes_stats, axis=(1, 2)).tolist())
                avg_collisions = np.sum(
                    episodes_stats) / episodes_stats.shape[0]
                print(f'Avg of collisions: {avg_collisions}')

            elif config.env_id == 'simple_speaker_listener':
                for i, stat in enumerate(info_for_one_env_among_timesteps):
                    print(f'ep {i}: {stat}')
            else:
                raise NotImplementedError
            print('*' * 10, 'Validation ENDS', '*' * 10)

        # *** END of VALIDATION ***

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
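
The run-directory bookkeeping at the top of run() (scan for existing runN folders, take max+1) recurs in nearly every example in this listing. A hedged refactor of that logic into a helper, not part of any of the original scripts, could look like this:

from pathlib import Path

def next_run_dir(model_dir):
    """Return model_dir/run<N>, where N is one past the highest existing run."""
    if not model_dir.exists():
        return model_dir / 'run1'
    run_nums = [int(folder.name.split('run')[1])
                for folder in model_dir.iterdir()
                if folder.name.startswith('run')]
    return model_dir / ('run%i' % (max(run_nums) + 1 if run_nums else 1))

# e.g. run_dir = next_run_dir(Path('./models') / config.env_id / config.model_name)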
Example #3
    config = setup_experiment(args)
    logger = ExperimentLogger(config.save_dir, log_std_out=True, use_tensorboard=config.use_tensorboard)

    # make sampling runner  
    if not config.cuda:
        torch.set_num_threads(config.n_training_threads)
    env_func = ENV_MAP[config.env]
    p_env_func = partial(env_func, config.scenario, benchmark=False, 
                        show_visual_range=config.show_visual_range)
    env = make_parallel_env(p_env_func, config.env_config, config.n_rollout_threads, config.seed)
    if not config.no_eval:
        eval_env = env_func(config.scenario, benchmark=False, 
                        show_visual_range=config.show_visual_range, **config.env_config)

    # make learner agent 
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.max_buffer_size, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    
    # train loop 
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        logger.info("Episodes (%i-%i)/%i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
Example #4
def run(config):
    model_dir = Path('./models') / config.env_name / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    os.system("cp shape.txt {}".format(run_dir))
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # number of threads used for training
    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)

    # processes used for parallel env sampling

    env = make_parallel_env(config.num_agents, config.n_rollout_threads,
                            run_num, config.shape_file)
    #'''
    maddpg = MADDPG.init_from_env(env=env,
                                  agent_alg=config.agent_alg,
                                  cripple_alg=config.cripple_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  discrete_action=config.discrete_action)
    #'''
    #maddpg = MADDPG.init_from_save(model_dir/'run1'/'model.pt')

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    t = 0
    a_loss = []
    c_loss = []
    rewss = []

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))

        obs = env.reset()

        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')  # show for the first time

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        #if config.display:
        #    for env_show in env.envs:
        #        env_show.render('human', close=False)

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            #actions = [np.array([i.tolist().index(1.0) for i in action]) for action in actions_one_hot]

            # scale the second component of every agent's action by pi
            for per_env_actions in actions:
                for ac in per_env_actions:
                    ac[1] *= np.pi
            #print(actions[0])

            next_obs, rewards, dones, infos = env.step(actions)

            #print(len(agent_actions),len(next_obs))
            #if config.display:
            #    for env_show in env.envs:
            #        env_show.render('human', close=False)

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                #print(t)
                if config.use_cuda:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_cuda,
                                                      norm_rews=True)
                        maddpg.update(sample,
                                      a_i,
                                      logger=logger,
                                      actor_loss_list=a_loss,
                                      critic_loss_list=c_loss)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        rewss.append(ep_rews)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
            # print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            maddpg.save(
                str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            maddpg.save(str(run_dir / 'model.pt'))
    maddpg.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
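
The per-agent to per-environment rearrangement used in every rollout loop above is simply a list transpose; a standalone sketch with made-up shapes (2 agents, 3 rollout threads, 4 action dimensions), including the in-place pi rescaling that Example #4 applies to the second action component:

import numpy as np

n_agents, n_threads, act_dim = 2, 3, 4
# maddpg.step returns one (n_threads, act_dim) array per agent
agent_actions = [np.random.rand(n_threads, act_dim) for _ in range(n_agents)]

# transpose: one list of per-agent actions for each environment thread
actions = [[ac[i] for ac in agent_actions] for i in range(n_threads)]
assert len(actions) == n_threads and len(actions[0]) == n_agents

# Example #4 rescales the second action component by pi, in place
for per_env in actions:
    for ac in per_env:
        ac[1] *= np.pi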
Example #5
        curr_run = 'run1'
    else:
        curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = model_dir / curr_run
log_dir = run_dir / 'logs'
os.makedirs(log_dir)
logger = SummaryWriter(str(log_dir))

torch.manual_seed(1024)
np.random.seed(1024)

env = make_parallel_env(env_id, n_rollout_threads, 1024, True)
maddpg = MADDPG.init_from_env(env,
                              agent_alg='MADDPG',
                              adversary_alg='MADDPG',
                              tau=0.01,
                              lr=0.01,
                              hidden_dim=64,
                              est_ac=True,
                              game_id='simple_speaker_listener')

replay_buffer = ReplayBuffer(
    buffer_length, maddpg.nagents,
    [obsp.shape[0] for obsp in env.observation_space], [
        acsp.shape[0] if isinstance(acsp, Box) else acsp.n
        for acsp in env.action_space
    ])

t = 0
#for ep_i in range(0, n_episodes, n_rollout_threads):
for ep_i in range(0, 10, 1):
    #print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + n_rollout_threads, n_episodes))
Example #6
File: main.py Project: yathartha3/DPP
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    ##################### INITIALIZE FROM SAVED? ###########################
    # init_from_saved and model_path are assumed to be defined elsewhere (e.g. module level)
    if init_from_saved and model_path is not None:
        maddpg = MADDPG.init_from_save(model_path)
        print("Initialized from saved model")
    # -------------------------------------------------------------------- #
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)
    # used for learning (updates)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    # This is just to store the global rewards and not for updating the policies
    g_storage_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions, maddpg)
            '''
            Reward Shaping using D++, D.
            The rewards now contain global as well as shaped rewards
            Keep the global for logging, and use the shaped rewards for updates
            '''
            # Choose which reward to use
            use_dpp = True

            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = [d_rewards]
            d_rewards = np.array(d_rewards)

            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = [g_rewards]
            g_rewards = np.array(g_rewards)

            if use_dpp:
                rewards = d_rewards
            else:
                rewards = g_rewards
            # ----------------------------------------------------------- #
            # Buffer used for updates
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            # push global rewards into g_replay_buffer for plotting
            g_storage_buffer.push(obs, agent_actions, g_rewards, next_obs,
                                  dones)

            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # Take out global reward from g_storage_buffer
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)

        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
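
Example #6 unpacks a (global, difference) reward pair per agent by hand inside the rollout loop. A hedged helper performing the same split, assuming a single rollout thread and the rewards[0][n][...] layout used above:

import numpy as np

def split_rewards(rewards, n_agents):
    """Split per-agent (global, difference) reward pairs into two
    (1, n_agents, 1) arrays, matching the manual unpacking in Example #6."""
    g_rewards = np.array([[[rewards[0][n][0]] for n in range(n_agents)]])
    d_rewards = np.array([[[rewards[0][n][1]] for n in range(n_agents)]])
    return g_rewards, d_rewards

# g_rewards, d_rewards = split_rewards(rewards, maddpg.nagents)
# rewards = d_rewards if use_dpp else g_rewards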
Example #7
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    #logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)

    if config.env_id == 'simple_reference':
        # NOTE: agent_init_params, num_in_pol, num_out_pol, num_in_critic, gamma,
        # tau, lr, hidden_dim, alg_types and discrete_action are not defined in
        # this snippet; this block looks like leftover manual-initialization code.
        for i in range(2):
            agent_init_params.append({'num_in_pol': num_in_pol,
                                      'num_out_pol': num_out_pol,
                                      'num_in_critic': num_in_critic})

            init_dict = {'gamma': gamma, 'tau': tau, 'lr': lr,
                         'hidden_dim': hidden_dim,
                         'alg_types': alg_types,
                         'agent_init_params': agent_init_params,
                         'discrete_action': discrete_action}

    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)

    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0

    episode_average_rewards=[]
    hundred_episode_average_rewards=[]

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):

        if (ep_i%100==0 and ep_i>0):
            hundred_episode_average_rewards.append(np.mean(episode_average_rewards))
            print('Rewards till',ep_i,'=',hundred_episode_average_rewards[-1])
            print('Agent Actions=',torch_agent_actions)
            episode_average_rewards=[]
        '''
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        '''
        obs = env.reset()

        rewards_for_this_episode=[]
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            rewards_for_this_episode.append(np.mean(rewards))

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i)#, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            
            if ep_i>10000:
                print('Goal Color=',torch_obs[0])
                print('Communication=',agent_actions[0])
            
                env.render()
                time.sleep(0.01)


        if ep_i>100000:
            import ipdb
            ipdb.set_trace()

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        
        episode_average_rewards.append(np.sum(rewards_for_this_episode))
        #for a_i, a_ep_rew in enumerate(ep_rews):
            #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    plt.plot(100*np.array(range(1,config.n_episodes//100)),hundred_episode_average_rewards)
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward for 100 episodes')
    plt.title('Speaker Discrete and Mover Continuous')
    plt.savefig('plot.png')
    plt.show()

    maddpg.save(run_dir / 'model.pt')
    env.close()
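
The exploration-noise schedule shared by these scripts is a linear anneal from init_noise_scale down to final_noise_scale over the first n_exploration_eps episodes. As a hedged helper, not present in the original code:

def annealed_noise_scale(ep_i, n_exploration_eps, init_scale, final_scale):
    """Linear annealing: init_scale at episode 0, final_scale once
    ep_i reaches n_exploration_eps, clamped thereafter."""
    pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final_scale + (init_scale - final_scale) * pct_remaining

# maddpg.scale_noise(annealed_noise_scale(ep_i, config.n_exploration_eps,
#                                         config.init_noise_scale,
#                                         config.final_noise_scale))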
Example #8
def run(config):
    # model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    run_num = 10
    # else:
    #    exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                     model_dir.iterdir() if
    #                     str(folder.name).startswith('run')]
    #    if len(exst_run_nums) == 0:
    #        run_num = 1
    #    else:
    #        run_num = max(exst_run_nums) + 1
    # curr_run = 'run%i' % run_num
    # run_dir = model_dir / curr_run
    # log_dir = run_dir / 'logs'
    log_dir = 'checkpoints0605_4/'
    run_dir = log_dir + 'logs/'
    # os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    # change env
    env = SCraftAdapter(map_name='3m', seed=123,step_mul=8, difficulty='7', game_version='latest', replay_dir="replay/")
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp for obsp in env.observation_space],
                                 [acsp for acsp in env.action_space])
    t = 0

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        stop = False
        obs = env.reset()
        # if stop:
        #     stop = False
        #     obs = env.reset()
        # else:
        #     obs=env._get_obs()

        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(
            config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        episode_length = 0
        while not stop:
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)


            actions=[]
            agent_actions=np.zeros([len(env.action_space),env.action_space[0]])
            # add avail_agent
            avail_actions=np.array(env.get_avail_actions())
            for agent_i in range(len(torch_agent_actions)):
                agent_action = env.get_avail_agent_actions(agent_i)
                agent_action=[0 if agent_action[i]==0 else torch_agent_actions[agent_i].data.numpy()[0][i] for i in range(len(agent_action))]
            # add argmax
                actions.append(np.argmax(agent_action))
            # new actions
                agent_actions[agent_i][actions[agent_i]]=1


            # torch_agent_actions=[(if agent_avail_actions  for action in ac.data.numpy()) for ac in torch_agent_actions]
            # convert actions to numpy arrays

            # rearrange actions to be per environment

            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            stop = dones[0][0]
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones,avail_actions)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            episode_length += 1
        ep_rews = replay_buffer.get_average_rewards(
            episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

    #     if ep_i % config.save_interval < config.n_rollout_threads:
    #         os.makedirs(run_dir / 'incremental', exist_ok=True)
    #         maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
    #         maddpg.save(run_dir / 'model.pt')
    #
    # maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(os.path.join(log_dir, 'summary.json'))
    logger.close()
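
Example #8 zeroes out each agent's policy output wherever the StarCraft availability mask is 0, takes an argmax, and re-encodes the choice as a one-hot vector. A self-contained sketch of that masking step (the avail argument stands in for env.get_avail_agent_actions); like the original, unavailable entries are set to 0 rather than -inf, which assumes non-negative policy outputs:

import numpy as np

def mask_and_one_hot(policy_out, avail):
    """Zero out unavailable actions, argmax over the rest, and return
    (chosen_index, one_hot_encoding). Both inputs are 1-D sequences."""
    masked = np.where(np.asarray(avail) == 0, 0.0, np.asarray(policy_out, dtype=float))
    idx = int(np.argmax(masked))
    one_hot = np.zeros(len(avail))
    one_hot[idx] = 1.0
    return idx, one_hot

# e.g. 6 discrete actions, of which three are currently available
idx, one_hot = mask_and_one_hot(np.random.rand(6), [1, 0, 1, 0, 0, 1])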
Example #9
def run(config):
    device = torch.device('cuda' if USE_CUDA else 'cpu')
    print('Using device:', device)
    if device.type == 'cuda':
        print(torch.cuda.get_device_name(0))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    print(str(log_dir))
    logger = SummaryWriter(str(log_dir))
    #logger = None

    with open(run_dir / "hyperparameters.txt", "w+") as f:
        f.write(str(config))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action, config.benchmark)
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  stochastic=config.stochastic,
                                  commonCritic=config.commonCritic,
                                  gasil=config.gasil,
                                  dlr=config.dlr,
                                  lambda_disc=config.lambda_disc,
                                  batch_size_disc=config.batch_size_disc,
                                  dynamic=config.dynamic)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    expert_replay_buffer = PriorityReplayBuffer(config.expert_buffer_length, config.episode_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    agent_info = [[[] for i in range(config.n_rollout_threads)]]
    reward_info = []
    total_returns = []
    eval_trajectories = []
    expert_average_returns = []
    trajectories = []
    durations = []
    start_time = time.time()
    expert_trajectories = []
    evaluation_rewards = []
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        if ep_i%100 == 0:
            mins = (time.time() - start_time)/60
            durations.append(mins)
            print(mins, "minutes")
            start_time = time.time()

        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        current_episode = [[] for i in range(config.n_rollout_threads)]
        current_trajectory = [[] for i in range(config.n_rollout_threads)]
        current_entities = []
        total_dense = None
        if config.store_traj:
            cur_state_ent = env.getStateEntities()
            for i in range(config.n_rollout_threads):
                current_entities.append(cur_state_ent[i])
           
            cur_state = env.getState()
            for i in range(config.n_rollout_threads):
                current_trajectory[i].append(cur_state[i])
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            if config.store_traj:
                cur_state = env.getState()
                for i in range(config.n_rollout_threads):
                    current_trajectory[i].append(cur_state[i])

            for i in range(config.n_rollout_threads):
                current_episode[i].append([obs[i], actions[i]])
            
            if config.benchmark:
                #Fix this
                for i, info in enumerate(infos):
                    agent_info[-1][i].append(info['n'])

            if et_i == 0:
                total_dense = rewards
            else:
                total_dense = total_dense + rewards

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads and
                ((expert_replay_buffer.num_traj*config.episode_length >= config.batch_size_disc) == (maddpg.gasil))):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                if maddpg.gasil:
                    for update_i in range(config.num_disc_updates):
                        sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False)
                        
                        sample_expert = expert_replay_buffer.sample(config.batch_size_disc,
                                                    to_gpu=USE_CUDA)
                        maddpg.gasil_disc_update(sample_normal, sample_expert, 0, logger=logger, num_disc_permutations = config.num_disc_permutations)

                    for update_i in range(config.num_AC_updates):
                        sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False)
                        maddpg.gasil_AC_update(sample_normal, 0, episode_num = ep_i, logger=logger, num_AC_permutations = config.num_AC_permutations) 
                else:
                    for update_i in range(config.num_AC_updates):
                        sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False)
                        maddpg.update(sample_normal, 0, logger=logger, num_AC_permutations = config.num_AC_permutations)
                maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        total_returns.append(total_dense)
        if maddpg.gasil:
            expert_replay_buffer.push(current_episode, total_dense, config.n_rollout_threads, current_entities, current_trajectory, config.store_traj)
            expert_average_returns.append(expert_replay_buffer.get_average_return())
        
        if config.store_traj:
            for i in range(config.n_rollout_threads):
                trajectories.append([current_entities[i], current_trajectory[i]])

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/rew' % a_i,
                               {'mean_episode_rewards': a_ep_rew},
                               ep_i)
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        
        #save mean episode rewards
        #save benchmarking data
        agent_info.append([[] for i in range(config.n_rollout_threads)])
        reward_info.append(ep_rews)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
            #save the trajectories in the expert replay buffer 
            trajec = expert_replay_buffer.get_trajectories()
            if config.store_traj:
                expert_trajectories.append(trajec)
        
        if ep_i % config.eval_interval < config.n_rollout_threads:
            current_eval = []
            current_trajectories = []

            for ep_i_eval in range(0, config.n_eval_episodes, config.n_rollout_threads):
                obs = env.reset()
                total_eval = None
                maddpg.prep_rollouts(device='cpu')

                if config.store_traj:
                    current_trajectory = [[] for i in range(config.n_rollout_threads)]
                    current_entities = []
                    cur_state_ent = env.getStateEntities()
                    for i in range(config.n_rollout_threads):
                        current_entities.append(cur_state_ent[i])

                    cur_state = env.getState()
                    for i in range(config.n_rollout_threads):
                        current_trajectory[i].append(cur_state[i])

                for et_i in range(config.episode_length):
                    torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                        requires_grad=False)
                                for i in range(maddpg.nagents)]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
                    actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    if config.store_traj:
                        cur_state = env.getState()
                        for i in range(config.n_rollout_threads):
                            current_trajectory[i].append(cur_state[i])

                    
                    if et_i == 0:
                        total_eval = rewards
                    else:
                        total_eval = total_eval + rewards
                    obs = next_obs
                current_eval.append(total_eval)
                if config.store_traj:
                    for i in range(config.n_rollout_threads):
                        current_trajectories.append([current_entities[i], current_trajectory[i]])
            
            if config.store_traj:
                eval_trajectories.append(current_trajectories)
            evaluation_rewards.append(current_eval)
Example #10
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    if config.load_adv == True:
        model_path = (Path('./models') / config.env_id / config.model_name /
                      ('run%i' % config.run_num))
        model_path = model_path / 'model.pt'
        maddpg = MADDPG.init_from_env_with_runner_delay_unaware(
            env,
            agent_alg=config.agent_alg,
            adversary_alg=config.adversary_alg,
            tau=config.tau,
            lr=config.lr,
            hidden_dim=config.hidden_dim,
            file_name=model_path)
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    delay_step = config.delay_step
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        maddpg.prep_rollouts(device='gpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        if config.env_id == 'simple_speaker_listener':
            zero_agent_actions = [
                np.array([[0, 0, 0]]),
                np.array([[0, 0, 0, 0, 0]])
            ]
        elif config.env_id == 'simple_spread':
            zero_agent_actions = [
                np.array([[0.0, 0.0, 0.0, 0.0, 0.0]])
                for _ in range(maddpg.nagents)
            ]
        elif config.env_id == 'simple_tag':
            zero_agent_actions = [
                np.array([0.0, 0.0]) for _ in range(maddpg.nagents)
            ]
        last_agent_actions = [zero_agent_actions for _ in range(delay_step)]

        for et_i in range(config.episode_length):
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            if config.load_adv:
                if delay_step == 0:
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                else:
                    agent_actions_tmp = [[
                        ac[i] for ac in agent_actions
                    ] for i in range(config.n_rollout_threads)][0][:]
                    actions = last_agent_actions[0]
                    actions.append(agent_actions_tmp[-1])
                    last_agent_actions = last_agent_actions[1:]
                    last_agent_actions.append(agent_actions_tmp[:2])
                actions = [actions]
                next_obs, rewards, dones, infos = env.step(
                    copy.deepcopy(actions))

            else:
                if delay_step == 0:
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                else:
                    actions = [[ac[i] for ac in last_agent_actions[0]]
                               for i in range(config.n_rollout_threads)]
                    last_agent_actions.pop(0)
                    last_agent_actions.append(agent_actions)

                next_obs, rewards, dones, infos = env.step(
                    copy.deepcopy(actions))
            # print(obs, agent_actions, rewards, next_obs, dones)  # per-step debug dump
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)

            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    if config.load_adv:
                        for a_i in range(maddpg.nagents -
                                         1):  #do not update the runner
                            sample = replay_buffer.sample(config.batch_size,
                                                          to_gpu=USE_CUDA)
                            maddpg.update(sample, a_i, logger=logger)
    #                     maddpg.update_all_targets()
                        maddpg.update_adversaries()
                    else:
                        for a_i in range(maddpg.nagents):  # update every agent
                            sample = replay_buffer.sample(config.batch_size,
                                                          to_gpu=USE_CUDA)
                            maddpg.update(sample, a_i, logger=logger)
                        maddpg.update_all_targets()
                maddpg.prep_rollouts(device='gpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i,
                               {'reward': a_ep_rew}, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
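
Example #10 above (and Example #11 below) emulate delayed actuation by keeping a delay_step-long buffer of past actions and executing the oldest one each step. A hedged standalone sketch of the same FIFO idea, not taken from the source:

from collections import deque

class ActionDelay:
    """Replay actions after a fixed delay, mirroring the last_agent_actions
    buffers in Examples #10 and #11 (hypothetical helper)."""

    def __init__(self, delay_step, zero_action):
        self.delay_step = delay_step
        self.queue = deque([zero_action] * delay_step)

    def step(self, new_action):
        if self.delay_step == 0:
            return new_action           # no delay: execute immediately
        self.queue.append(new_action)   # newest action joins the back of the queue
        return self.queue.popleft()     # oldest action is executed this step

# delay = ActionDelay(config.delay_step, zero_agent_actions)
# next_obs, rewards, dones, infos = env.step(delay.step(agent_actions))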
Example #11
def run(config):
    model_dir = Path('./models') / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    env = gym.make("intersection-multiagent-v0")

    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    delay_step = config.delay_step
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='gpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        agent_obs = []
        for i in range(4):
            agent_obs.append(
                np.array([
                    obs[i % 4], obs[(i + 1) % 4], obs[(i + 2) % 4],
                    obs[(i + 3) % 4]
                ]).flatten())
        obs = np.array([agent_obs])
        zero_agent_actions = [1, 1, 1, 1]
        last_agent_actions = [zero_agent_actions for _ in range(delay_step)]

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                torch.FloatTensor(np.vstack(obs[:, i]))
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            #             print(obs)
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # print(agent_actions)
            # rearrange actions to be per environment
            if delay_step == 0:
                actions = [np.argmax(agent_actions[i][0]) for i in range(4)]
            else:
                future_actions = [
                    np.argmax(agent_actions[i][0]) for i in range(4)
                ]
                actions = last_agent_actions[0]
                last_agent_actions = last_agent_actions[1:]
                last_agent_actions.append(future_actions)
            next_obs, rewards, dones, infos = env.step(actions)
            #             print(rewards)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            if dones[0][0]:
                break

            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):

                    for a_i in range(
                            maddpg.nagents):  # update every agent
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='gpu' if USE_CUDA else 'cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            # logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i,
                               {'reward': a_ep_rew}, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
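Note: the example above emulates a fixed actuation delay by keeping the last `delay_step` chosen actions in a list and always executing the oldest one. The same FIFO idea in a small self-contained sketch, with `DelayedActions` and its arguments as illustrative names and a no-op action of `1` as in the snippet:

from collections import deque

class DelayedActions:
    """Execute the action computed at step t only at step t + delay_step."""

    def __init__(self, delay_step, noop_action):
        # Pre-fill with no-op actions, as the example's last_agent_actions list is.
        self.queue = deque([noop_action] * delay_step)

    def step(self, new_action):
        if not self.queue:             # delay_step == 0: act immediately
            return new_action
        self.queue.append(new_action)  # enqueue the freshly computed action
        return self.queue.popleft()    # execute the action chosen delay_step ago

# With delay_step=2, actions 'a', 'b', 'c', 'd' are executed two steps late.
delay = DelayedActions(delay_step=2, noop_action=1)
print([delay.step(a) for a in ['a', 'b', 'c', 'd']])  # [1, 1, 'a', 'b']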
예제 #12
0
def run(config):
    # Make directory to store the results
    model_dir = Path('./models')/config.env_id/config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)

    # initialize tensorboard summary writer
    logger = SummaryWriter(str(log_dir))

    # use provided seed
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    # Limit the number of CPU threads when CUDA is not used
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)

    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  )
    if not rnn:    # TODO: the non-RNN path is untested and may break; only the RNN path is known to work
        replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                     [obsp.shape[0] for obsp in env.observation_space],
                                     [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                      for acsp in env.action_space])
    else:
        # replay buffer obs space size is increased
        rnn_replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                     [obsp.shape[0]*history_steps for obsp in env.observation_space],
                                     [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                      for acsp in env.action_space])

        # This is just to store the global rewards and not for updating the policies
        g_storage_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                        [obsp.shape[0]*history_steps for obsp in env.observation_space],
                                        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                         for acsp in env.action_space])

    t = 0
    #####################################################################################################
    #                                       START EPISODES                                              #
    #####################################################################################################
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))

        # List of Observations for each of the agents
        # E.g., For simple_spread, shape is {1,3,18}
        obs = env.reset()

        # Hand-rolled history buffer for the RNN: keep the last six observations explicitly.
        obs_tminus_0 = copy(obs)
        obs_tminus_1 = copy(obs)
        obs_tminus_2 = copy(obs)
        obs_tminus_3 = copy(obs)
        obs_tminus_4 = copy(obs)
        obs_tminus_5 = copy(obs)

        # # for 3 time-steps
        # obs_history = np.empty([1,3,54])
        # next_obs_history = np.empty([1,3,54])

        # For 6 time-steps per agent (18 * 6 = 108)
        obs_history = np.empty([1,3,108])
        next_obs_history = np.empty([1,3,108])

        maddpg.prep_rollouts(device='cpu')

        # Fraction of exploration episodes remaining; used to linearly anneal the action noise.
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        ##################################################################################################
        #                                       START TIME-STEPS                                         #
        ##################################################################################################

        for et_i in range(config.episode_length):

            # Populate current history
            for a in range(3):  # env.nagents
                obs_history[0][a][:] = np.concatenate((obs_tminus_0[0][a][:], obs_tminus_1[0][a][:], obs_tminus_2[0][a][:],
                                                      obs_tminus_3[0][a][:], obs_tminus_4[0][a][:], obs_tminus_5[0][a][:]))
                # obs_history now holds the last 6 timesteps of observations for each agent

            if not rnn:    # TODO: the non-RNN branch is untested and may break; the RNN branch works
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]

                # get actions (from learning algorithm) as torch Variables. For simple_spread this is discrete[5]
                torch_agent_actions = maddpg.step(torch_obs, explore=True)

            else:
                # rearrange histories to be per agent, and convert to torch Variable
                rnn_torch_obs = [Variable(torch.Tensor(np.vstack(obs_history[:, i])),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]
                # TODO: for RNN, actions should condition on history (DONE)
                torch_agent_actions = maddpg.step(rnn_torch_obs, explore=True)


            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]    # print(torch_agent_actions[0].data)
            # rearrange actions to be per environment; for a single thread it won't really matter.
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            ############### WHICH REWARD TO USE ##############
            # the rewards now contain global as well as difference rewards
            # Keep the global for logging, and difference for updates

            use_diff_reward = False    # TODO: toggle to choose difference vs. global rewards for the updates

            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = [d_rewards]
            d_rewards = np.array(d_rewards)

            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = [g_rewards]
            g_rewards = np.array(g_rewards)

            # replace "reward" with the reward that you want to use
            if use_diff_reward:
                rewards = d_rewards
            else:
                rewards = g_rewards

            # Create history for next state
            '''
            history is [t, t-1, ..., t-5] (six timesteps)
            the leading [0] index selects the single rollout thread
            '''
            for a in range(3):      # env.nagents
                next_obs_history[0][a][:] = np.concatenate((next_obs[0][a][:], obs_tminus_0[0][a][:], obs_tminus_1[0][a][:],
                                                            obs_tminus_2[0][a][:], obs_tminus_3[0][a][:], obs_tminus_4[0][a][:]))
                    # next_obs_history now holds the 6-timestep history for each agent's next state

            # for RNN, replay buffer needs to store for e.g., states=[obs_t-2, obs_t-1, obs_t]
            if not rnn:
                replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
                obs = next_obs
            else:
                # Buffer used for updates
                rnn_replay_buffer.push(obs_history, agent_actions, rewards, next_obs_history, dones)
                # push global rewards into g_replay_buffer
                g_storage_buffer.push(obs_history, agent_actions, g_rewards, next_obs_history, dones)

            # Update histories
            obs_tminus_5 = copy(obs_tminus_4)
            obs_tminus_4 = copy(obs_tminus_3)
            obs_tminus_3 = copy(obs_tminus_2)

            obs_tminus_2 = copy(obs_tminus_1)
            obs_tminus_1 = copy(obs_tminus_0)
            obs_tminus_0 = copy(next_obs)

            t += config.n_rollout_threads
            if (len(rnn_replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = rnn_replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # For plotting, use global reward achieved using difference rewards
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

    print()
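Note: the RNN variant above keeps six copies of the observation (`obs_tminus_0` through `obs_tminus_5`) and shifts them by hand every step. A `deque` with `maxlen` expresses the same newest-first sliding window more compactly; the sketch below is illustrative (the 3-agent, 18-dimensional simple_spread observation is only an assumed example), not a drop-in replacement for the code above:

import numpy as np
from collections import deque

class ObsHistory:
    """Keep the last `history_steps` observations per agent as one flat vector."""

    def __init__(self, first_obs, history_steps=6):
        # Pre-fill with the initial observation, as the example does after reset().
        self.buffers = [deque([first_obs[a]] * history_steps, maxlen=history_steps)
                        for a in range(len(first_obs))]

    def push(self, obs):
        for a, o in enumerate(obs):
            self.buffers[a].appendleft(o)   # newest observation first, oldest drops out

    def flat(self):
        return np.array([np.concatenate(list(buf)) for buf in self.buffers])

# 3 agents with 18-dimensional observations -> (3, 108) history vectors.
hist = ObsHistory(np.zeros((3, 18)), history_steps=6)
hist.push(np.ones((3, 18)))
print(hist.flat().shape)  # (3, 108)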
예제 #13
0
def run_main(run_num):
    config = Arglist()
    run_manager = running_env_manager(MODE)
    run_manager.prep_running_env(config, run_num)

    if not config.USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config)
    eval_env = make_parallel_env(config)

    maddpg = MADDPG.init_from_env(env, config)
    if config.use_IL:
        IL_controller = IL_Controller(config)  # imitation learning controller
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp['comm'].n + acsp['act'].n if config.discrete_action else acsp['comm'].n +
                                                                                                acsp['act'].shape[0]
                                  for acsp in env.action_space])
    t = 0
    # reset test results arrays
    all_ep_rewards = []
    mean_ep_rewards = []
    start_time = time.time()
    step = 0
    win_counter = 0
    curr_ep = -1
    eval_win_rates = [0]
    # eps_without_IL = 0
    # eps_without_IL_hist = []
    print("\nPrey Max Speed: {}, useIL is {}\n".format(config.prey_max_speed, config.use_IL))
    while step < config.n_time_steps:  # total steps to be performed during a single run
        # start a new episode (the previous one terminated or was done)
        curr_ep += 1
        ep_rewards = np.zeros((1, len(env.agent_types)))  # init reward vec for single episode.

        # prepare episodic stuff
        obs = env.reset()
        # maddpg.prep_rollouts(device=config.device)
        maddpg.prep_rollouts(device=config.device)
        explr_pct_remaining = max(0, config.n_exploration_steps - step) / config.n_exploration_steps
        maddpg.scale_noise(
            config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        for ep_step in range(config.episode_length):  # single-episode loop; ends on termination/done
            # env.env._render("human", False)
            # time.sleep(0.05)
            if step == config.n_time_steps: break
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, ind])),
                                  requires_grad=False)
                         for ind in range(maddpg.nagents)]
            # get actions as torch Variables
            with torch.no_grad():
                torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            # agent_actions = [ac.detach().cpu().data.numpy() for ac in torch_agent_actions]
            agent_actions = [ac.cpu().data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[idx] for ac in agent_actions] for idx in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            if (len(replay_buffer) >= config.batch_size and
                    (step % config.steps_per_eval) < config.n_rollout_threads):  # perform evaluation
                eval_win_rates.append(eval_model(maddpg, eval_env, config.episode_length, config.num_steps_in_eval,
                                                 config.n_rollout_threads, display=False))

            if (len(replay_buffer) >= config.batch_size and
                    (step % config.steps_per_update) < config.n_rollout_threads):  # perform training
                train_model(maddpg, config, replay_buffer)

            step += config.n_rollout_threads  # advance the step-counter

            if (len(replay_buffer) >= config.batch_size and config.use_IL and
                    (step % config.IL_inject_every) < config.n_rollout_threads):  # perform IL injection
                step, eval_win_rates = \
                    IL_controller.IL_inject(maddpg, replay_buffer, eval_env, step, config, eval_win_rates)
                IL_controller.decay()

            ep_rewards += rewards
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)

            if dones.any():  # terminate episode if won! #
                win_counter += 1
                # eps_without_IL += 1
                break
            obs = next_obs
        # perform IL injection if failed
        # if config.use_IL and ep_step == config.episode_length-1 and not dones.any():
        #     step, eval_win_rates = \
        #         IL_controller.IL_inject(maddpg, replay_buffer, eval_env, step, config, eval_win_rates)
        #     eps_without_IL_hist.append(eps_without_IL)
        #     eps_without_IL = 0

        mean_ep_rewards.append(ep_rewards / config.episode_length)
        all_ep_rewards.append(ep_rewards)

        if step % 100 == 0 or (step == config.n_time_steps):  # print progress.
            run_manager.printProgressBar(step, start_time, config.n_time_steps, "run" + str(run_num) + ": Steps Done: ",
                                         " Last eval win rate: {0:.2%}".format(eval_win_rates[-1]), 20, "%")

        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        # if ep_i % config.save_interval < config.n_rollout_threads:
        #     os.makedirs(run_dir / 'incremental', exist_ok=True)
        #     maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
        #     maddpg.save(run_dir / 'model.pt')

    # eps_without_IL_hist.append(eps_without_IL)
    if MODE == "RUN":
        run_dir = run_manager.run_dir
        np.save(run_dir / 'episodes_rewards', {"tot_ep_rewards": all_ep_rewards.copy(),
                                               "mean_ep_rewards": mean_ep_rewards.copy()}, True)
        # np.save(run_dir / 'IL_hist', eps_without_IL_hist, True)
        np.save(run_dir / 'win_rates', eval_win_rates, True)
        maddpg.save(run_dir / 'model.pt')
    # env.close()
    # logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    # logger.close()
    return run_num
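Note: the loop above periodically calls `eval_model(...)` (not shown in this snippet) to measure a win rate with exploration switched off. The general shape of such a greedy evaluation rollout is sketched below; `policy_step` stands in for `maddpg.step` and `env` for a Gym-like vectorized environment, both assumed placeholders rather than code from this repository:

import numpy as np
import torch

def evaluate(policy_step, env, n_episodes=10, max_steps=100):
    """Roll out greedily (no exploration noise, no gradients) and return a win rate."""
    wins = 0
    for _ in range(n_episodes):
        obs = env.reset()
        with torch.no_grad():                              # evaluation only
            for _ in range(max_steps):
                actions = policy_step(obs, explore=False)  # greedy actions
                obs, rewards, dones, infos = env.step(actions)
                if np.any(dones):                          # a terminated episode counts as a win,
                    wins += 1                              # mirroring win_counter above
                    break
    return wins / n_episodes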
예제 #14
0
File: main.py  Project: xuezzee/-
def run(config):
    scores_window = deque(maxlen=100)

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,  # number of players
        'board_width': 11,  # map width
        'board_height': 11,  # map height
        'n_cell_type': 5,  # number of cell types
        'materials': 4,  # number of material depots
        'cars': 2,  # number of cars
        'planes': 0,  # number of planes
        'barriers': 12,  # number of fixed obstacles
        'max_step': 500,  # maximum number of steps
        'game_name': name,  # game name
        'K': 5,  # refresh the depot material counts every K rounds
        'map_path': 'env/map.txt',  # path to the initial map
        'cell_range': 6,  # value range of each dimension within a cell (tuple; a single int is promoted to a tuple)  ##?
        'ob_board_width': None,  # grid width observed by each agent (tuple); None means same as the real grid  ##?
        'ob_board_height': None,  # grid height observed by each agent (tuple); None means same as the real grid  ##?
        'ob_cell_range': None,  # per-cell value range observed by each agent (2-D tuple); None means same as the real grid  ##?
    }

    env = make_parallel_env_transport(config.env_id, conf,
                                      config.n_rollout_threads, config.seed,
                                      config.discrete_action)

    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        score = 0
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))

        obs = env.reset()  # TODO: TO CHECK
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # print('step', et_i)
            # env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            # print('step', et_i)
            # print(maddpg.nagents)
            torch_obs = [
                Variable(
                    torch.Tensor(np.vstack(obs[:, i])),  # stack the per-thread observations vertically
                    requires_grad=False) for i in range(maddpg.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            ############################################
            # add
            # actions = actions.astype(int)
            ############################################
            # add: fixed one-hot actions for the first two players
            joint_action = []

            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    # idx = np.random.randint(11)
                    each[3] = 1
                    player.append(each)
                joint_action.append(player)
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])

            next_obs, rewards, dones, infos = env.step(joint_action)

            #################################
            agents_action = actions[0]
            #################################

            replay_buffer.push(obs, agents_action, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

            score += rewards[0][0]

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        scores_window.append(score)
        reward_epi = np.mean(scores_window)
        reward_epi_var = np.var(scores_window)
        logger.add_scalar('results/completion_window', reward_epi, ep_i)
        logger.add_scalar('results/completion_window_var', reward_epi_var, ep_i)
        print(
            '\r Episode {}\t Average Reward: {:.3f}\t Var Reward: {:.3f} \t '.
            format(ep_i, reward_epi, reward_epi_var))

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
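Note: `scores_window = deque(maxlen=100)` above gives a rolling window over the most recent 100 episode scores; the mean and variance of that window are what get logged and printed. A tiny self-contained sketch of the pattern (the tag names and the optional `logger` argument are illustrative):

from collections import deque
import numpy as np

scores_window = deque(maxlen=100)   # scores older than 100 episodes drop out automatically

def log_episode(score, episode, logger=None):
    scores_window.append(score)
    mean_r = np.mean(scores_window)
    var_r = np.var(scores_window)
    if logger is not None:          # e.g. a tensorboard SummaryWriter
        logger.add_scalar('results/reward_window_mean', mean_r, episode)
        logger.add_scalar('results/reward_window_var', var_r, episode)
    return mean_r, var_r

for ep, s in enumerate([1.0, 2.0, 3.0]):
    print(log_episode(s, ep))   # (1.0, 0.0), then (1.5, 0.25), then (2.0, 0.666...)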
예제 #15
0
def run(args, **args_dict):
    reward_flag, pos_flag = None, None
    save_data = {'reward': -1000., 'pos': 0.}
    # model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     curr_run = 'run1'
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         curr_run = 'run1'
    #     else:
    #         curr_run = 'run%i' % (max(exst_run_nums) + 1)
    # run_dir = model_dir / curr_run
    # log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)

    th.manual_seed(args.seed)
    np.random.seed(args.seed)
    if not args.use_cuda or not th.cuda.is_available():
        # th.set_num_threads(args.n_training_threads)
        FloatTensor = th.FloatTensor
    else:
        FloatTensor = th.cuda.FloatTensor
    env = make_parallel_env(**args_dict)
    maddpg = MADDPG.init_from_env(env, args)
    replay_buffer = ReplayBuffer(
        args.capacity, args.n_agents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        ttt = time.time()
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        if args.use_cuda and th.cuda.is_available():
            maddpg.prep_rollouts(device='gpu')
        else:
            maddpg.prep_rollouts(device='cpu')
        # maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, args.n_exploration_eps - ep_i) / args.n_exploration_eps
        scale_noise_i = args.final_noise_scale + (
            args.init_noise_scale -
            args.final_noise_scale) * explr_pct_remaining
        maddpg.scale_noise(scale_noise_i)
        maddpg.reset_noise()

        print("Episodes %i-%i of %i, replay: %.2f, explore: %.2f" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes,
               float(len(replay_buffer)) / replay_buffer.max_steps,
               scale_noise_i))

        for et_i in range(args.max_steps):
            ttt = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                th.from_numpy(np.vstack(obs[:, i])).type(FloatTensor)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [
                ac.detach().cpu().numpy() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(args.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += args.n_rollout_threads
            #
            # ttt2 = time.time()
            # print('1', ttt2 - ttt)
            #
            if (len(replay_buffer) >= args.batch_size
                    and (t % args.steps_per_update) < args.n_rollout_threads):
                ttt = time.time()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                # for u_i in range(args.n_rollout_threads):
                for a_i in range(maddpg.nagents):
                    sample = replay_buffer.sample(args.batch_size,
                                                  to_gpu=args.use_cuda
                                                  and th.cuda.is_available(),
                                                  norm_rews=args.norm_rews)
                    _, _, _ = maddpg.update(sample, a_i)
                maddpg.update_all_targets()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_rollouts(device='gpu')
                else:
                    maddpg.prep_rollouts(device='cpu')
                # maddpg.prep_rollouts(device='cpu')
                #
                # ttt2 = time.time()
                # print('2', ttt2 - ttt)
                #
        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % args.test_interval < args.n_rollout_threads:
            ttt = time.time()
            obs = env.reset()
            if args.use_cuda and th.cuda.is_available():
                maddpg.prep_rollouts(device='gpu')
            else:
                maddpg.prep_rollouts(device='cpu')
            # maddpg.prep_rollouts(device='cpu')
            with th.no_grad():
                pos_total = 0.
                finish_ep = np.zeros(args.n_rollout_threads)
                r_total = np.zeros((args.n_rollout_threads, args.n_agents))
                record_r = np.zeros(args.n_agents)
                for eval_i in range(args.max_steps):
                    torch_obs = [
                        FloatTensor(np.vstack(obs[:, i]))
                        for i in range(maddpg.nagents)
                    ]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [
                        ac.detach().cpu().numpy() for ac in torch_agent_actions
                    ]
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(args.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    r_total += rewards
                    obs = next_obs
                    for d_i in range(dones.shape[0]):
                        if dones[d_i] or (eval_i == args.max_steps - 1
                                          and finish_ep[d_i] == 0.):
                            # if eval_i == args.max_steps - 1 and finish_ep[d_i] == 0.:
                            #     print(d_i)
                            pos_total += infos[d_i]['pos']
                            record_r += r_total[d_i]
                            r_total[d_i] = 0.  # reset this thread's accumulated reward
                            finish_ep[d_i] += 1
                record_r /= finish_ep.sum()
                pos_total /= finish_ep.sum()

                # ttt2 = time.time()
                # print('3', ttt2 - ttt)
                #

                new_path = model_path + '/' + str(ep_i) + '.pt'
                has_saved = False
                if record_r.sum() > save_data['reward']:
                    save_data['reward'] = record_r.sum()
                    if save_data['reward'] > 0 and pos_total > 10.:
                        # pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                        maddpg.save(new_path)
                        has_saved = True
                if pos_total > save_data['pos']:
                    save_data['pos'] = pos_total
                    if record_r.sum() > 0 and pos_total > 10. and not has_saved:
                        # pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                        maddpg.save(new_path)
                if pos_total > 17.0:
                    maddpg.save(new_path)

                if reward_flag is None:
                    reward_flag = vis.line(
                        X=np.arange(ep_i, ep_i + 1),
                        Y=np.array([np.append(record_r, record_r.sum())]),
                        opts=dict(ylabel='Test Reward',
                                  xlabel='Episode',
                                  title='Reward',
                                  legend=[
                                      'Agent-%d' % i
                                      for i in range(args.n_agents)
                                  ] + ['Total']))
                else:
                    vis.line(X=np.array(
                        [np.array(ep_i).repeat(args.n_agents + 1)]),
                             Y=np.array([np.append(record_r, record_r.sum())]),
                             win=reward_flag,
                             update='append')

                if pos_flag is None:
                    pos_flag = vis.line(X=np.arange(ep_i, ep_i + 1),
                                        Y=np.array([pos_total]),
                                        opts=dict(ylabel='Length',
                                                  xlabel='Episode',
                                                  title='How far ?',
                                                  legend=['position']))
                else:
                    vis.line(X=np.array([ep_i]),
                             Y=np.array([pos_total]),
                             win=pos_flag,
                             update='append')
        # if ep_i % config.save_interval < config.n_rollout_threads:
        #     os.makedirs(run_dir / 'incremental', exist_ok=True)
        #     maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
        #     maddpg.save(run_dir / 'model.pt')

    # maddpg.save(run_dir / 'model.pt')
    env.close()
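Note: the save logic above tracks the best test reward and the best distance (`save_data`) and writes a checkpoint only when one of them improves and both clear a threshold. The same 'checkpoint on improvement' idea in a compact sketch, with `maybe_save`, `save_fn`, and the floor values as illustrative names and thresholds:

def maybe_save(save_fn, path, test_reward, test_pos, best,
               reward_floor=0.0, pos_floor=10.0):
    """Save at most once when either best metric improves and both pass their floors."""
    saved = False
    if test_reward > best['reward']:
        best['reward'] = test_reward
        if test_reward > reward_floor and test_pos > pos_floor:
            save_fn(path)
            saved = True
    if test_pos > best['pos']:
        best['pos'] = test_pos
        if test_reward > reward_floor and test_pos > pos_floor and not saved:
            save_fn(path)
            saved = True
    return saved

best = {'reward': -1000., 'pos': 0.}
maybe_save(lambda p: print('saved to', p), 'ckpt.pt',
           test_reward=5.0, test_pos=12.0, best=best)   # prints 'saved to ckpt.pt' once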
예제 #16
0
File: transport_main.py  Project: xuezzee/-
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
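Note: every example anneals the exploration noise linearly from `init_noise_scale` to `final_noise_scale` over the first `n_exploration_eps` episodes and then holds it constant. The formula in isolation, with the parameter values below chosen purely for illustration:

def noise_scale(ep_i, n_exploration_eps, init_noise_scale=0.3, final_noise_scale=0.0):
    """Linear decay over n_exploration_eps episodes, constant afterwards."""
    explr_pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final_noise_scale + (init_noise_scale - final_noise_scale) * explr_pct_remaining

for ep in (0, 12500, 25000, 30000):
    print(ep, noise_scale(ep, n_exploration_eps=25000))
# 0 0.3, 12500 0.15, 25000 0.0, 30000 0.0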
예제 #17
0
def run(config):
    # Make directory to store the results
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)

    # initialize tensorboard summary writer
    logger = SummaryWriter(str(log_dir))

    # use provided seed
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    # Limit the number of CPU threads when CUDA is not used
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    t = 0
    global_reward_list = []
    # START EPISODES
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))

        # List of Observations for each of the agents
        # E.g., For simple_spread, shape is {1,3,18}
        obs = env.reset()
        maddpg.prep_rollouts(device='cpu')

        # Fraction of exploration episodes remaining; used to linearly anneal the action noise.
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        # START TIME-STEPS
        episode_reward = 0
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions (from learning algorithm) as torch Variables. For simple_spread this is discrete[5]
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions
                             ]  # print(torch_agent_actions[0].data)
            # rearrange actions to be per environment; for a single thread it won't really matter.
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            #print(rewards[0][0])
            episode_reward += rewards[0][0]

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

        global_reward_list.append(episode_reward / (config.episode_length))
        #print(global_reward_list)
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
    with open("DIFF_rewards.txt", "wb") as fp_:  # Pickling
        pickle.dump(global_reward_list, fp_)
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
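Note: the line `actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]`, repeated in nearly every example, simply transposes the actions from per-agent-major to per-environment-major layout so that each parallel environment receives one action per agent. A toy illustration with assumed shapes:

import numpy as np

n_rollout_threads, act_dim = 2, 3
# One array per agent, shaped (n_rollout_threads, act_dim), filled with the agent index.
agent_actions = [np.full((n_rollout_threads, act_dim), a) for a in range(3)]

# Rearrange to one list per environment thread, each holding one action per agent.
actions = [[ac[i] for ac in agent_actions] for i in range(n_rollout_threads)]

print(len(actions), len(actions[0]))   # 2 3  -> 2 threads, 3 agents each
print(actions[0][1])                   # agent 1's action for thread 0 -> [1 1 1]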