def run(args):
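    """Train or evaluate DICG-DE PPO (decentralized execution with a DICG
    critic) on the Traffic Junction environment.

    `args` is an argparse-style namespace; the module-level imports used
    below (collections, joblib, torch, SimpleNamespace, and the garage and
    dicg helpers) are assumed to be defined elsewhere in the original script.
    """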

    if args.exp_name is None:
        exp_layout = collections.OrderedDict([
            ('dicg{}_de_ppo', args.n_gcn_layers),
            ('atype={}', args.attention_type), ('res={}', bool(args.residual)),
            ('entcoeff={}', args.ent), ('dim={}', args.dim),
            ('nagents={}', args.n_agents), ('difficulty={}', args.difficulty),
            ('curr={}', bool(args.curriculum)),
            ('steps={}', args.max_env_steps), ('nenvs={}', args.n_envs),
            ('bs={:0.0e}', args.bs), ('splits={}', args.opt_n_minibatches),
            ('miniepoch={}', args.opt_mini_epochs), ('seed={}', args.seed)
        ])

        exp_name = '_'.join(
            [key.format(val) for key, val in exp_layout.items()])

    else:
        exp_name = args.exp_name

    prefix = 'traffic'
    id_suffix = ('_' + str(args.run_id)) if args.run_id != 0 else ''
    # exp_name ends with '_seed=N'; [:-7] strips it (assumes a single-digit seed)
    unseeded_exp_dir = './data/' + args.loc + '/' + exp_name[:-7]
    exp_dir = './data/' + args.loc + '/' + exp_name + id_suffix

    # Enforce: center_adv must be False when using max-entropy RL
    args.center_adv = False if args.entropy_method == 'max' else args.center_adv

    if args.mode == 'train':
        # making sequential log dir if name already exists
        @wrap_experiment(name=exp_name,
                         prefix=prefix,
                         log_dir=exp_dir,
                         snapshot_mode='last',
                         snapshot_gap=1)
        def train_traffic_junction(ctxt=None, args_dict=vars(args)):
            args = SimpleNamespace(**args_dict)

            set_seed(args.seed)

            if args.curriculum:
                curr_start = int(0.125 * args.n_epochs)
                curr_end = int(0.625 * args.n_epochs)
            else:
                curr_start = 0
                curr_end = 0
                args.add_rate_min = args.add_rate_max

            env = TrafficJunctionWrapper(centralized=True,
                                         dim=args.dim,
                                         vision=1,
                                         add_rate_min=args.add_rate_min,
                                         add_rate_max=args.add_rate_max,
                                         curr_start=curr_start,
                                         curr_end=curr_end,
                                         difficulty=args.difficulty,
                                         n_agents=args.n_agents,
                                         max_steps=args.max_env_steps)
            env = GarageEnv(env)

            runner = LocalRunnerWrapper(ctxt,
                                        eval=args.eval_during_training,
                                        n_eval_episodes=args.n_eval_episodes,
                                        eval_greedy=args.eval_greedy,
                                        eval_epoch_freq=args.eval_epoch_freq,
                                        save_env=env.pickleable)

            hidden_nonlinearity = F.relu if args.hidden_nonlinearity == 'relu' \
                                    else torch.tanh

            policy = DecCategoricalMLPPolicy(
                env.spec,
                env.n_agents,
                hidden_nonlinearity=hidden_nonlinearity,
                hidden_sizes=args.policy_hidden_sizes,
                name='dec_categorical_mlp_policy')

            baseline = DICGCritic(
                env.spec,
                env.n_agents,
                encoder_hidden_sizes=args.encoder_hidden_sizes,
                embedding_dim=args.embedding_dim,
                attention_type=args.attention_type,
                n_gcn_layers=args.n_gcn_layers,
                residual=args.residual,
                gcn_bias=args.gcn_bias,
                name='dicg_critic')

            # Set max_path_length <= max_steps
            # If max_path_length > max_steps, algo will pad obs
            # obs.shape = torch.Size([n_paths, algo.max_path_length, feat_dim])
            algo = CentralizedMAPPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=args.max_env_steps, # Notice
                discount=args.discount,
                center_adv=bool(args.center_adv),
                positive_adv=bool(args.positive_adv),
                gae_lambda=args.gae_lambda,
                policy_ent_coeff=args.ent,
                entropy_method=args.entropy_method,
                stop_entropy_gradient=(args.entropy_method == 'max'),
                clip_grad_norm=args.clip_grad_norm,
                optimization_n_minibatches=args.opt_n_minibatches,
                optimization_mini_epochs=args.opt_mini_epochs,
            )

            runner.setup(algo,
                         env,
                         sampler_cls=CentralizedMAOnPolicyVectorizedSampler,
                         sampler_args={'n_envs': args.n_envs})
            runner.train(n_epochs=args.n_epochs, batch_size=args.bs)

        train_traffic_junction(args_dict=vars(args))

    elif args.mode in ['restore', 'eval']:
        data = joblib.load(exp_dir + '/params.pkl')
        env = data['env']
        algo = data['algo']

        if args.mode == 'restore':
            from dicg.experiment.runner_utils import restore_training
            restore_training(exp_dir,
                             exp_name,
                             args,
                             env_saved=env.pickleable,
                             env=env)

        elif args.mode == 'eval':
            env.eval(algo.policy,
                     n_episodes=args.n_eval_episodes,
                     greedy=args.eval_greedy,
                     load_from_file=True,
                     max_steps=args.max_env_steps)
Example #2
def run(args):
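    """Train or evaluate centralized PPO with a categorical LSTM policy on a
    StarCraft Multi-Agent Challenge (SMAC) map; eval mode can save replays.
    """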

    if args.exp_name is None:
        exp_layout = collections.OrderedDict([
            ('centralized_ppo{}', ''),
            ('incact={}', bool(args.state_include_actions)),
            ('entcoeff={}', args.ent), ('map={}', args.map),
            ('difficulty={}', args.difficulty), ('bs={:0.0e}', args.bs),
            ('nenvs={}', args.n_envs), ('splits={}', args.opt_n_minibatches),
            ('miniepoch={}', args.opt_mini_epochs), ('seed={}', args.seed)
        ])

        exp_name = '_'.join(
            [key.format(val) for key, val in exp_layout.items()])

        if args.comment != '':
            exp_name = exp_name + '_' + args.comment
    else:
        exp_name = args.exp_name

    prefix = 'smac'
    id_suffix = ('_' + str(args.run_id)) if args.run_id != 0 else ''
    exp_dir = './data/' + args.loc + '/' + exp_name + id_suffix

    # Enforce: center_adv must be False when using max-entropy RL
    args.center_adv = False if args.entropy_method == 'max' else args.center_adv
    set_seed(args.seed)

    if args.mode == 'train':
        # making sequential log dir if name already exists
        @wrap_experiment(name=exp_name,
                         prefix=prefix,
                         log_dir=exp_dir,
                         snapshot_mode='last',
                         snapshot_gap=1)
        def train_smac(ctxt=None, args_dict=vars(args)):
            args = SimpleNamespace(**args_dict)

            env = SMACWrapper(
                centralized=True,
                map_name=args.map,
                difficulty=args.difficulty,
                # seed=args.seed
            )
            env = GarageEnv(env)

            runner = LocalRunnerWrapper(ctxt,
                                        eval=args.eval_during_training,
                                        n_eval_episodes=args.n_eval_episodes,
                                        eval_greedy=args.eval_greedy,
                                        eval_epoch_freq=args.eval_epoch_freq,
                                        save_env=env.pickleable)

            hidden_nonlinearity = F.relu if args.hidden_nonlinearity == 'relu' \
                                    else torch.tanh
            # Enforce hidden sizes
            if env.n_agents in [6, 7, 8]:
                args.encoder_hidden_sizes = [700, 350]
                args.embedding_dim = 200
            elif env.n_agents <= 3:
                args.encoder_hidden_sizes = [300, 150]
                args.embedding_dim = 128

            policy = CentralizedCategoricalLSTMPolicy(
                env.spec,
                n_agents=env.n_agents,
                encoder_hidden_sizes=args.encoder_hidden_sizes,
                embedding_dim=args.embedding_dim,  # encoder output size
                lstm_hidden_size=args.lstm_hidden_size,
                state_include_actions=args.state_include_actions,
                name='centralized_categorical_lstm_policy')

            baseline = GaussianMLPBaseline(env_spec=env.spec,
                                           hidden_sizes=(64, 64, 64))

            # Set max_path_length <= max_steps
            # If max_path_length > max_steps, algo will pad obs
            # obs.shape = torch.Size([n_paths, algo.max_path_length, feat_dim])
            algo = CentralizedMAPPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=env.episode_limit, # Notice
                discount=args.discount,
                center_adv=bool(args.center_adv),
                positive_adv=bool(args.positive_adv),
                gae_lambda=args.gae_lambda,
                policy_ent_coeff=args.ent,
                entropy_method=args.entropy_method,
                stop_entropy_gradient=(args.entropy_method == 'max'),
                clip_grad_norm=args.clip_grad_norm,
                optimization_n_minibatches=args.opt_n_minibatches,
                optimization_mini_epochs=args.opt_mini_epochs,
            )

            runner.setup(algo,
                         env,
                         sampler_cls=CentralizedMAOnPolicyVectorizedSampler,
                         sampler_args={'n_envs': args.n_envs})
            runner.train(n_epochs=args.n_epochs, batch_size=args.bs)

        train_smac(args_dict=vars(args))

    elif args.mode in ['restore', 'eval']:
        env = SMACWrapper(
            centralized=True,
            map_name=args.map,
            difficulty=args.difficulty,
            replay_dir=exp_dir,
            replay_prefix='cent_lstm',
            # seed=args.seed
        )
        if args.mode == 'restore':
            from dicg.experiment.runner_utils import restore_training
            env = GarageEnv(env)
            restore_training(exp_dir, exp_name, args, env_saved=False, env=env)

        elif args.mode == 'eval':
            data = joblib.load(exp_dir + '/params.pkl')
            algo = data['algo']
            env.eval(algo.policy,
                     n_episodes=args.n_eval_episodes,
                     greedy=args.eval_greedy,
                     load_from_file=True,
                     save_replay=args.save_replay)
            env.close()
Example #3
def run(args):
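    """Train or evaluate DICG-CE PPO (centralized execution with an
    attention-based coordination graph) on the Predator-Prey environment;
    eval mode additionally records attention weights versus inter-agent
    distance.
    """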

    if args.exp_name is None:
        exp_layout = collections.OrderedDict([
            ('dicg{}_ce_ppo', args.n_gcn_layers),
            ('atype={}', args.attention_type), ('res={}', bool(args.residual)),
            ('entcoeff={}', args.ent), ('grid={}', args.grid_size),
            ('nagents={}', args.n_agents), ('npreys={}', args.n_preys),
            ('penalty={:.2f}', args.penalty),
            ('stepcost={:.2f}', args.step_cost),
            ('avis={}', bool(args.agent_visible)),
            ('steps={}', args.max_env_steps), ('nenvs={}', args.n_envs),
            ('bs={:0.0e}', args.bs), ('splits={}', args.opt_n_minibatches),
            ('miniepoch={}', args.opt_mini_epochs), ('seed={}', args.seed)
        ])

        exp_name = '_'.join(
            [key.format(val) for key, val in exp_layout.items()])

    else:
        exp_name = args.exp_name

    prefix = 'predatorprey'
    id_suffix = ('_' + str(args.run_id)) if args.run_id != 0 else ''
    # exp_name ends with '_seed=N'; [:-7] strips it (assumes a single-digit seed)
    unseeded_exp_dir = './data/' + args.loc + '/' + exp_name[:-7]
    exp_dir = './data/' + args.loc + '/' + exp_name + id_suffix

    # Enforce: center_adv must be False when using max-entropy RL
    args.center_adv = False if args.entropy_method == 'max' else args.center_adv

    if args.mode == 'train':
        # making sequential log dir if name already exists
        @wrap_experiment(name=exp_name,
                         prefix=prefix,
                         log_dir=exp_dir,
                         snapshot_mode='last',
                         snapshot_gap=1)
        def train_predatorprey(ctxt=None, args_dict=vars(args)):
            args = SimpleNamespace(**args_dict)

            set_seed(args.seed)

            env = PredatorPreyWrapper(
                centralized=True,
                grid_shape=(args.grid_size, args.grid_size),
                n_agents=args.n_agents,
                n_preys=args.n_preys,
                max_steps=args.max_env_steps,
                step_cost=args.step_cost,
                prey_capture_reward=args.capture_reward,
                penalty=args.penalty,
                other_agent_visible=bool(args.agent_visible))
            env = GarageEnv(env)

            runner = LocalRunnerWrapper(ctxt,
                                        eval=args.eval_during_training,
                                        n_eval_episodes=args.n_eval_episodes,
                                        eval_greedy=args.eval_greedy,
                                        eval_epoch_freq=args.eval_epoch_freq,
                                        save_env=env.pickleable)

            hidden_nonlinearity = F.relu if args.hidden_nonlinearity == 'relu' \
                                    else torch.tanh
            policy = DICGCECategoricalMLPPolicy(
                env.spec,
                n_agents=args.n_agents,
                encoder_hidden_sizes=args.encoder_hidden_sizes,
                embedding_dim=args.embedding_dim,
                attention_type=args.attention_type,
                n_gcn_layers=args.n_gcn_layers,
                residual=bool(args.residual),
                gcn_bias=bool(args.gcn_bias),
                categorical_mlp_hidden_sizes=args.categorical_mlp_hidden_sizes,
            )

            baseline = GaussianMLPBaseline(env_spec=env.spec,
                                           hidden_sizes=(64, 64, 64))

            # Set max_path_length <= max_steps
            # If max_path_length > max_steps, algo will pad obs
            # obs.shape = torch.Size([n_paths, algo.max_path_length, feat_dim])
            algo = CentralizedMAPPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=args.max_env_steps, # Notice
                discount=args.discount,
                center_adv=bool(args.center_adv),
                positive_adv=bool(args.positive_adv),
                gae_lambda=args.gae_lambda,
                policy_ent_coeff=args.ent,
                entropy_method=args.entropy_method,
                stop_entropy_gradient=(args.entropy_method == 'max'),
                clip_grad_norm=args.clip_grad_norm,
                optimization_n_minibatches=args.opt_n_minibatches,
                optimization_mini_epochs=args.opt_mini_epochs,
            )

            runner.setup(algo,
                         env,
                         sampler_cls=CentralizedMAOnPolicyVectorizedSampler,
                         sampler_args={'n_envs': args.n_envs})
            runner.train(n_epochs=args.n_epochs, batch_size=args.bs)

        train_predatorprey(args_dict=vars(args))

    elif args.mode in ['restore', 'eval']:
        data = joblib.load(exp_dir + '/params.pkl')
        env = data['env']
        algo = data['algo']

        if args.mode == 'restore':
            from dicg.experiment.runner_utils import restore_training
            restore_training(exp_dir,
                             exp_name,
                             args,
                             env_saved=env.pickleable,
                             env=env)

        elif args.mode == 'eval':
            # Eval stats:
            distance_vs_weight = {}
            traj_len = []
            for i_eps in range(args.n_eval_episodes):
                print('Eval episode: {}/{}'.format(i_eps + 1,
                                                   args.n_eval_episodes))
                obses = env.reset()
                algo.policy.reset([True])
                for i_step in range(args.max_env_steps):
                    actions, agent_infos = algo.policy.get_actions(
                        obses,
                        env.get_avail_actions(),
                        greedy=args.eval_greedy)
                    attention_weights_0 = agent_infos['attention_weights'][0]
                    for i_agent in range(env.n_agents):
                        d = np.sqrt((env.agent_pos[0][0] -
                                     env.agent_pos[i_agent][0])**2 +
                                    (env.agent_pos[0][1] -
                                     env.agent_pos[i_agent][1])**2)
                        distance_vs_weight.setdefault(d, []).append(
                            attention_weights_0[i_agent])

                    if bool(args.render):
                        env.my_render(attention_weights=attention_weights_0)
                        if bool(args.inspect_steps):
                            input('Step {}, press Enter to continue...'.format(
                                i_step))
                        else:
                            time.sleep(0.05)

                    obses, _, agent_dones, _ = env.step(actions)

                    if agent_dones:
                        if i_step < args.max_env_steps - 1:
                            traj_len.append(i_step + 1)
                            print(
                                'eps {} captured all preys in {} steps'.format(
                                    i_eps + 1, i_step + 1))
                        break
            env.close()
            print('Average trajectory length = {}'.format(np.mean(traj_len)))

            from .attention_stats import plot_attn_stats
            plot_attn_stats(distance_vs_weight, exp_dir)

    elif args.mode == 'analysis':
        from tests.predatorprey.attention_stats import attn_analysis
        attn_analysis(unseeded_exp_dir, args, seeds=[1])
Example #4
def run(args):
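    """Train or evaluate a centralized categorical MLP PPO baseline on the
    Predator-Prey environment.
    """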

    if args.exp_name is None:
        exp_layout = collections.OrderedDict([
            ('centralized_ppo{}', ''), ('entcoeff={}', args.ent),
            ('grid={}', args.grid_size), ('nagents={}', args.n_agents),
            ('npreys={}', args.n_preys), ('penalty={:.2f}', args.penalty),
            ('stepcost={:.2f}', args.step_cost),
            ('avis={}', bool(args.agent_visible)),
            ('steps={}', args.max_env_steps), ('nenvs={}', args.n_envs),
            ('bs={:0.0e}', args.bs), ('splits={}', args.opt_n_minibatches),
            ('miniepoch={}', args.opt_mini_epochs), ('seed={}', args.seed)
        ])

        exp_name = '_'.join(
            [key.format(val) for key, val in exp_layout.items()])

    else:
        exp_name = args.exp_name

    prefix = 'predatorprey'
    id_suffix = ('_' + str(args.run_id)) if args.run_id != 0 else ''
    exp_dir = './data/' + args.loc + '/' + exp_name + id_suffix

    # Enforce: center_adv must be False when using max-entropy RL
    args.center_adv = False if args.entropy_method == 'max' else args.center_adv

    if args.mode == 'train':
        # making sequential log dir if name already exists
        @wrap_experiment(name=exp_name,
                         prefix=prefix,
                         log_dir=exp_dir,
                         snapshot_mode='last',
                         snapshot_gap=1)
        def train_predatorprey(ctxt=None, args_dict=vars(args)):
            args = SimpleNamespace(**args_dict)

            set_seed(args.seed)

            env = PredatorPreyWrapper(centralized=True,
                                      grid_shape=(args.grid_size,
                                                  args.grid_size),
                                      n_agents=args.n_agents,
                                      n_preys=args.n_preys,
                                      step_cost=args.step_cost,
                                      max_steps=args.max_env_steps,
                                      prey_capture_reward=args.capture_reward,
                                      penalty=args.penalty,
                                      other_agent_visible=args.agent_visible)
            env = GarageEnv(env)

            runner = LocalRunnerWrapper(ctxt,
                                        eval=args.eval_during_training,
                                        n_eval_episodes=args.n_eval_episodes,
                                        eval_greedy=args.eval_greedy,
                                        eval_epoch_freq=args.eval_epoch_freq,
                                        save_env=env.pickleable)
            # logdir = runner._snapshotter._snapshot_dir

            hidden_nonlinearity = F.relu if args.hidden_nonlinearity == 'relu' \
                                    else torch.tanh
            policy = CentralizedCategoricalMLPPolicy(
                env.spec,
                n_agents=args.n_agents,
                hidden_nonlinearity=hidden_nonlinearity,
                hidden_sizes=args.hidden_sizes,
                name='centralized')

            baseline = GaussianMLPBaseline(env_spec=env.spec,
                                           hidden_sizes=(64, 64, 64))

            # Set max_path_length <= max_steps
            # If max_path_length > max_steps, algo will pad obs
            # obs.shape = torch.Size([n_paths, algo.max_path_length, feat_dim])
            algo = CentralizedMAPPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=args.max_env_steps, # Notice
                discount=args.discount,
                center_adv=bool(args.center_adv),
                positive_adv=bool(args.positive_adv),
                gae_lambda=args.gae_lambda,
                policy_ent_coeff=args.ent,
                entropy_method=args.entropy_method,
                stop_entropy_gradient=(args.entropy_method == 'max'),
                optimization_n_minibatches=args.opt_n_minibatches,
                optimization_mini_epochs=args.opt_mini_epochs,
            )

            runner.setup(algo,
                         env,
                         sampler_cls=CentralizedMAOnPolicyVectorizedSampler,
                         sampler_args={'n_envs': args.n_envs})
            runner.train(n_epochs=args.n_epochs, batch_size=args.bs)

        train_predatorprey(args_dict=vars(args))

    elif args.mode in ['restore', 'eval']:
        data = joblib.load(exp_dir + '/params.pkl')
        algo = data['algo']
        env = data['env']

        if args.mode == 'restore':
            from dicg.experiment.runner_utils import restore_training
            restore_training(exp_dir,
                             exp_name,
                             args,
                             env_saved=env.pickleable,
                             env=env)

        elif args.mode == 'eval':
            env.eval(algo.policy,
                     n_episodes=args.n_eval_episodes,
                     greedy=args.eval_greedy,
                     load_from_file=True,
                     render=args.render)
            env.close()
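
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original scripts): each run() above expects an
# argparse-style namespace. Below is one minimal way to drive the last example
# with types.SimpleNamespace. The attribute names mirror the fields that run()
# reads, but every value is an assumption chosen purely for illustration; the
# real scripts build `args` from their own argparse parsers and defaults.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from types import SimpleNamespace

    args = SimpleNamespace(
        # experiment bookkeeping
        mode='train', exp_name=None, loc='local', run_id=0, seed=1,
        # environment (illustrative values)
        grid_size=10, n_agents=8, n_preys=8, penalty=-0.50, step_cost=-0.10,
        capture_reward=10, agent_visible=1, max_env_steps=200,
        # policy / optimization (illustrative values)
        hidden_nonlinearity='relu', hidden_sizes=(128, 64),
        discount=0.99, gae_lambda=0.97, center_adv=1, positive_adv=0,
        ent=0.01, entropy_method='regularized',
        opt_n_minibatches=4, opt_mini_epochs=10,
        n_envs=8, bs=60000, n_epochs=500,
        # evaluation
        eval_during_training=1, n_eval_episodes=100, eval_greedy=1,
        eval_epoch_freq=5, render=0,
    )
    run(args)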