def run(args):
    if args.exp_name is None:
        exp_layout = collections.OrderedDict([
            ('dicg{}_de_ppo', args.n_gcn_layers),
            ('atype={}', args.attention_type),
            ('res={}', bool(args.residual)),
            ('entcoeff={}', args.ent),
            ('dim={}', args.dim),
            ('nagents={}', args.n_agents),
            ('difficulty={}', args.difficulty),
            ('curr={}', bool(args.curriculum)),
            ('steps={}', args.max_env_steps),
            ('nenvs={}', args.n_envs),
            ('bs={:0.0e}', args.bs),
            ('splits={}', args.opt_n_minibatches),
            ('miniepoch={}', args.opt_mini_epochs),
            ('seed={}', args.seed)
        ])
        exp_name = '_'.join(
            [key.format(val) for key, val in exp_layout.items()])
    else:
        exp_name = args.exp_name

    prefix = 'traffic'
    id_suffix = ('_' + str(args.run_id)) if args.run_id != 0 else ''
    unseeded_exp_dir = './data/' + args.loc + '/' + exp_name[:-7]
    exp_dir = './data/' + args.loc + '/' + exp_name + id_suffix

    # Enforce
    args.center_adv = False if args.entropy_method == 'max' else args.center_adv

    if args.mode == 'train':
        # making sequential log dir if name already exists
        @wrap_experiment(name=exp_name,
                         prefix=prefix,
                         log_dir=exp_dir,
                         snapshot_mode='last',
                         snapshot_gap=1)
        def train_traffic(ctxt=None, args_dict=vars(args)):
            args = SimpleNamespace(**args_dict)
            set_seed(args.seed)

            if args.curriculum:
                curr_start = int(0.125 * args.n_epochs)
                curr_end = int(0.625 * args.n_epochs)
            else:
                curr_start = 0
                curr_end = 0
                args.add_rate_min = args.add_rate_max

            env = TrafficJunctionWrapper(
                centralized=True,
                dim=args.dim,
                vision=1,
                add_rate_min=args.add_rate_min,
                add_rate_max=args.add_rate_max,
                curr_start=curr_start,
                curr_end=curr_end,
                difficulty=args.difficulty,
                n_agents=args.n_agents,
                max_steps=args.max_env_steps)
            env = GarageEnv(env)

            runner = LocalRunnerWrapper(
                ctxt,
                eval=args.eval_during_training,
                n_eval_episodes=args.n_eval_episodes,
                eval_greedy=args.eval_greedy,
                eval_epoch_freq=args.eval_epoch_freq,
                save_env=env.pickleable)

            hidden_nonlinearity = F.relu if args.hidden_nonlinearity == 'relu' \
                else torch.tanh
            policy = DecCategoricalMLPPolicy(
                env.spec,
                env.n_agents,
                hidden_nonlinearity=hidden_nonlinearity,
                hidden_sizes=args.policy_hidden_sizes,
                name='dec_categorical_mlp_policy')

            baseline = DICGCritic(
                env.spec,
                env.n_agents,
                encoder_hidden_sizes=args.encoder_hidden_sizes,
                embedding_dim=args.embedding_dim,
                attention_type=args.attention_type,
                n_gcn_layers=args.n_gcn_layers,
                residual=args.residual,
                gcn_bias=args.gcn_bias,
                name='dicg_critic')

            # Set max_path_length <= max_steps
            # If max_path_length > max_steps, algo will pad obs
            # obs.shape = torch.Size([n_paths, algo.max_path_length, feat_dim])
            algo = CentralizedMAPPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=args.max_env_steps,  # Notice
                discount=args.discount,
                center_adv=bool(args.center_adv),
                positive_adv=bool(args.positive_adv),
                gae_lambda=args.gae_lambda,
                policy_ent_coeff=args.ent,
                entropy_method=args.entropy_method,
                stop_entropy_gradient=(args.entropy_method == 'max'),
                clip_grad_norm=args.clip_grad_norm,
                optimization_n_minibatches=args.opt_n_minibatches,
                optimization_mini_epochs=args.opt_mini_epochs,
            )

            runner.setup(algo, env,
                         sampler_cls=CentralizedMAOnPolicyVectorizedSampler,
                         sampler_args={'n_envs': args.n_envs})
            runner.train(n_epochs=args.n_epochs, batch_size=args.bs)

        train_traffic(args_dict=vars(args))

    elif args.mode in ['restore', 'eval']:
        data = joblib.load(exp_dir + '/params.pkl')
        env = data['env']
        algo = data['algo']

        if args.mode == 'restore':
            from dicg.experiment.runner_utils import restore_training
            restore_training(exp_dir, exp_name, args,
                             env_saved=env.pickleable, env=env)

        elif args.mode == 'eval':
            env.eval(algo.policy,
                     n_episodes=args.n_eval_episodes,
                     greedy=args.eval_greedy,
                     load_from_file=True,
                     max_steps=args.max_env_steps)
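# --- Illustrative entry point (not part of the original script) ---
# A minimal sketch of how run() above might be wired to a command line,
# assuming an argparse-based CLI. The flag names mirror the attributes that
# run() reads from `args`; the defaults shown are assumptions rather than the
# repository's actual values, and most flags are omitted for brevity.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='DICG-DE PPO on Traffic Junction (illustrative CLI)')
    parser.add_argument('--mode', type=str, default='train',
                        choices=['train', 'restore', 'eval'])
    parser.add_argument('--exp_name', type=str, default=None)
    parser.add_argument('--loc', type=str, default='local')
    parser.add_argument('--run_id', type=int, default=0)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_agents', type=int, default=10)      # assumed default
    parser.add_argument('--n_gcn_layers', type=int, default=2)   # assumed default
    parser.add_argument('--attention_type', type=str, default='general')
    # ... remaining flags (dim, difficulty, curriculum, bs, n_envs, ent, etc.)
    run(parser.parse_args())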
def run(args):
    if args.exp_name is None:
        exp_layout = collections.OrderedDict([
            ('centralized_ppo{}', ''),
            ('incact={}', bool(args.state_include_actions)),
            ('entcoeff={}', args.ent),
            ('map={}', args.map),
            ('difficulty={}', args.difficulty),
            ('bs={:0.0e}', args.bs),
            ('nenvs={}', args.n_envs),
            ('splits={}', args.opt_n_minibatches),
            ('miniepoch={}', args.opt_mini_epochs),
            ('seed={}', args.seed)
        ])
        exp_name = '_'.join(
            [key.format(val) for key, val in exp_layout.items()])
        if args.comment != '':
            exp_name = exp_name + '_' + args.comment
    else:
        exp_name = args.exp_name

    prefix = 'smac'
    id_suffix = ('_' + str(args.run_id)) if args.run_id != 0 else ''
    exp_dir = './data/' + args.loc + '/' + exp_name + id_suffix

    # Enforce
    args.center_adv = False if args.entropy_method == 'max' else args.center_adv

    set_seed(args.seed)

    if args.mode == 'train':
        # making sequential log dir if name already exists
        @wrap_experiment(name=exp_name,
                         prefix=prefix,
                         log_dir=exp_dir,
                         snapshot_mode='last',
                         snapshot_gap=1)
        def train_smac(ctxt=None, args_dict=vars(args)):
            args = SimpleNamespace(**args_dict)

            env = SMACWrapper(
                centralized=True,
                map_name=args.map,
                difficulty=args.difficulty,
                # seed=args.seed
            )
            env = GarageEnv(env)

            runner = LocalRunnerWrapper(
                ctxt,
                eval=args.eval_during_training,
                n_eval_episodes=args.n_eval_episodes,
                eval_greedy=args.eval_greedy,
                eval_epoch_freq=args.eval_epoch_freq,
                save_env=env.pickleable)

            hidden_nonlinearity = F.relu if args.hidden_nonlinearity == 'relu' \
                else torch.tanh

            # Enforce hidden sizes
            if env.n_agents in [6, 7, 8]:
                args.encoder_hidden_sizes = [700, 350]
                args.embedding_dim = 200
            elif env.n_agents <= 3:
                args.encoder_hidden_sizes = [300, 150]
                args.embedding_dim = 128

            policy = CentralizedCategoricalLSTMPolicy(
                env.spec,
                n_agents=env.n_agents,
                encoder_hidden_sizes=args.encoder_hidden_sizes,
                embedding_dim=args.embedding_dim,  # encoder output size
                lstm_hidden_size=args.lstm_hidden_size,
                state_include_actions=args.state_include_actions,
                name='centralized_categorical_lstm_policy')

            baseline = GaussianMLPBaseline(env_spec=env.spec,
                                           hidden_sizes=(64, 64, 64))

            # Set max_path_length <= max_steps
            # If max_path_length > max_steps, algo will pad obs
            # obs.shape = torch.Size([n_paths, algo.max_path_length, feat_dim])
            algo = CentralizedMAPPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=env.episode_limit,  # Notice
                discount=args.discount,
                center_adv=bool(args.center_adv),
                positive_adv=bool(args.positive_adv),
                gae_lambda=args.gae_lambda,
                policy_ent_coeff=args.ent,
                entropy_method=args.entropy_method,
                stop_entropy_gradient=(args.entropy_method == 'max'),
                clip_grad_norm=args.clip_grad_norm,
                optimization_n_minibatches=args.opt_n_minibatches,
                optimization_mini_epochs=args.opt_mini_epochs,
            )

            runner.setup(algo, env,
                         sampler_cls=CentralizedMAOnPolicyVectorizedSampler,
                         sampler_args={'n_envs': args.n_envs})
            runner.train(n_epochs=args.n_epochs, batch_size=args.bs)

        train_smac(args_dict=vars(args))

    elif args.mode in ['restore', 'eval']:
        env = SMACWrapper(
            centralized=True,
            map_name=args.map,
            difficulty=args.difficulty,
            replay_dir=exp_dir,
            replay_prefix='cent_lstm',
            # seed=args.seed
        )

        if args.mode == 'restore':
            from dicg.experiment.runner_utils import restore_training
            env = GarageEnv(env)
            restore_training(exp_dir, exp_name, args,
                             env_saved=False, env=env)

        elif args.mode == 'eval':
            data = joblib.load(exp_dir + '/params.pkl')
            algo = data['algo']
            env.eval(algo.policy,
                     n_episodes=args.n_eval_episodes,
                     greedy=args.eval_greedy,
                     load_from_file=True,
                     save_replay=args.save_replay)
            env.close()
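# --- Illustrative example (not part of the original script) ---
# Shows how the exp_layout OrderedDict above turns hyperparameters into an
# experiment name: each key is a format template, each value fills it, and
# the pieces are joined with underscores; run() then prefixes
# './data/<loc>/' and an optional run-id suffix to get the log directory.
# The sample values below are hypothetical, chosen only to show the result.
import collections

_sample_layout = collections.OrderedDict([
    ('centralized_ppo{}', ''),
    ('incact={}', False),
    ('entcoeff={}', 0.01),
    ('map={}', '8m'),
    ('difficulty={}', '7'),
    ('bs={:0.0e}', 60000),
    ('nenvs={}', 2),
    ('splits={}', 4),
    ('miniepoch={}', 10),
    ('seed={}', 1),
])
print('_'.join(k.format(v) for k, v in _sample_layout.items()))
# -> centralized_ppo_incact=False_entcoeff=0.01_map=8m_difficulty=7_
#    bs=6e+04_nenvs=2_splits=4_miniepoch=10_seed=1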
def run(args):
    if args.exp_name is None:
        exp_layout = collections.OrderedDict([
            ('dicg{}_ce_ppo', args.n_gcn_layers),
            ('atype={}', args.attention_type),
            ('res={}', bool(args.residual)),
            ('entcoeff={}', args.ent),
            ('grid={}', args.grid_size),
            ('nagents={}', args.n_agents),
            ('npreys={}', args.n_preys),
            ('penalty={:.2f}', args.penalty),
            ('stepcost={:.2f}', args.step_cost),
            ('avis={}', bool(args.agent_visible)),
            ('steps={}', args.max_env_steps),
            ('nenvs={}', args.n_envs),
            ('bs={:0.0e}', args.bs),
            ('splits={}', args.opt_n_minibatches),
            ('miniepoch={}', args.opt_mini_epochs),
            ('seed={}', args.seed)
        ])
        exp_name = '_'.join(
            [key.format(val) for key, val in exp_layout.items()])
    else:
        exp_name = args.exp_name

    prefix = 'predatorprey'
    id_suffix = ('_' + str(args.run_id)) if args.run_id != 0 else ''
    unseeded_exp_dir = './data/' + args.loc + '/' + exp_name[:-7]
    exp_dir = './data/' + args.loc + '/' + exp_name + id_suffix

    # Enforce
    args.center_adv = False if args.entropy_method == 'max' else args.center_adv

    if args.mode == 'train':
        # making sequential log dir if name already exists
        @wrap_experiment(name=exp_name,
                         prefix=prefix,
                         log_dir=exp_dir,
                         snapshot_mode='last',
                         snapshot_gap=1)
        def train_predatorprey(ctxt=None, args_dict=vars(args)):
            args = SimpleNamespace(**args_dict)
            set_seed(args.seed)

            env = PredatorPreyWrapper(
                centralized=True,
                grid_shape=(args.grid_size, args.grid_size),
                n_agents=args.n_agents,
                n_preys=args.n_preys,
                max_steps=args.max_env_steps,
                step_cost=args.step_cost,
                prey_capture_reward=args.capture_reward,
                penalty=args.penalty,
                other_agent_visible=bool(args.agent_visible))
            env = GarageEnv(env)

            runner = LocalRunnerWrapper(
                ctxt,
                eval=args.eval_during_training,
                n_eval_episodes=args.n_eval_episodes,
                eval_greedy=args.eval_greedy,
                eval_epoch_freq=args.eval_epoch_freq,
                save_env=env.pickleable)

            hidden_nonlinearity = F.relu if args.hidden_nonlinearity == 'relu' \
                else torch.tanh
            policy = DICGCECategoricalMLPPolicy(
                env.spec,
                n_agents=args.n_agents,
                encoder_hidden_sizes=args.encoder_hidden_sizes,
                embedding_dim=args.embedding_dim,
                attention_type=args.attention_type,
                n_gcn_layers=args.n_gcn_layers,
                residual=bool(args.residual),
                gcn_bias=bool(args.gcn_bias),
                categorical_mlp_hidden_sizes=args.categorical_mlp_hidden_sizes,
            )

            baseline = GaussianMLPBaseline(env_spec=env.spec,
                                           hidden_sizes=(64, 64, 64))

            # Set max_path_length <= max_steps
            # If max_path_length > max_steps, algo will pad obs
            # obs.shape = torch.Size([n_paths, algo.max_path_length, feat_dim])
            algo = CentralizedMAPPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=args.max_env_steps,  # Notice
                discount=args.discount,
                center_adv=bool(args.center_adv),
                positive_adv=bool(args.positive_adv),
                gae_lambda=args.gae_lambda,
                policy_ent_coeff=args.ent,
                entropy_method=args.entropy_method,
                stop_entropy_gradient=(args.entropy_method == 'max'),
                clip_grad_norm=args.clip_grad_norm,
                optimization_n_minibatches=args.opt_n_minibatches,
                optimization_mini_epochs=args.opt_mini_epochs,
            )

            runner.setup(algo, env,
                         sampler_cls=CentralizedMAOnPolicyVectorizedSampler,
                         sampler_args={'n_envs': args.n_envs})
            runner.train(n_epochs=args.n_epochs, batch_size=args.bs)

        train_predatorprey(args_dict=vars(args))

    elif args.mode in ['restore', 'eval']:
        data = joblib.load(exp_dir + '/params.pkl')
        env = data['env']
        algo = data['algo']

        if args.mode == 'restore':
            from dicg.experiment.runner_utils import restore_training
            restore_training(exp_dir, exp_name, args,
                             env_saved=env.pickleable, env=env)

        elif args.mode == 'eval':
            # Eval stats:
            distance_vs_weight = {}
            traj_len = []
            for i_eps in range(args.n_eval_episodes):
                print('Eval episode: {}/{}'.format(i_eps + 1,
                                                   args.n_eval_episodes))
                obses = env.reset()
                algo.policy.reset([True])
                for i_step in range(args.max_env_steps):
                    actions, agent_infos = algo.policy.get_actions(
                        obses, env.get_avail_actions(),
                        greedy=args.eval_greedy)
                    attention_weights_0 = agent_infos['attention_weights'][0]
                    for i_agent in range(env.n_agents):
                        d = np.sqrt(
                            (env.agent_pos[0][0] - env.agent_pos[i_agent][0])**2
                            + (env.agent_pos[0][1] - env.agent_pos[i_agent][1])**2)
                        if d not in distance_vs_weight.keys():
                            distance_vs_weight[d] = [attention_weights_0[i_agent]]
                        else:
                            distance_vs_weight[d].append(
                                attention_weights_0[i_agent])

                    if bool(args.render):
                        env.my_render(attention_weights=attention_weights_0)
                        if bool(args.inspect_steps):
                            input('Step {}, press Enter to continue...'.format(
                                i_step))
                        else:
                            time.sleep(0.05)

                    obses, _, agent_dones, _ = env.step(actions)

                    if agent_dones:
                        if i_step < args.max_env_steps - 1:
                            traj_len.append(i_step + 1)
                            print('eps {} captured all preys in {} steps'.format(
                                i_eps + 1, i_step + 1))
                        break

            env.close()
            print('Average trajectory length = {}'.format(np.mean(traj_len)))

            from .attention_stats import plot_attn_stats
            plot_attn_stats(distance_vs_weight, exp_dir)

    elif args.mode == 'analysis':
        from tests.predatorprey.attention_stats import attn_analysis
        attn_analysis(unseeded_exp_dir, args, seeds=[1])
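# --- Illustrative helper (not part of the original script) ---
# Restates the bookkeeping done inside the eval loop above as a standalone
# function: agent 0's attention weight over every agent is bucketed by the
# Euclidean distance between the two agents. `agent_pos` and
# `attention_weights_0` follow the shapes used above (a list of (row, col)
# grid positions and the first row of the policy's attention matrix); the
# function name itself is made up for this sketch.
import numpy as np

def bucket_attention_by_distance(agent_pos, attention_weights_0, buckets=None):
    """Append agent 0's attention weight on each agent to a distance-keyed dict."""
    if buckets is None:
        buckets = {}
    for i_agent, weight in enumerate(attention_weights_0):
        d = np.sqrt((agent_pos[0][0] - agent_pos[i_agent][0]) ** 2 +
                    (agent_pos[0][1] - agent_pos[i_agent][1]) ** 2)
        buckets.setdefault(d, []).append(weight)
    return buckets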
def run(args):
    if args.exp_name is None:
        exp_layout = collections.OrderedDict([
            ('centralized_ppo{}', ''),
            ('entcoeff={}', args.ent),
            ('grid={}', args.grid_size),
            ('nagents={}', args.n_agents),
            ('npreys={}', args.n_preys),
            ('penalty={:.2f}', args.penalty),
            ('stepcost={:.2f}', args.step_cost),
            ('avis={}', bool(args.agent_visible)),
            ('steps={}', args.max_env_steps),
            ('nenvs={}', args.n_envs),
            ('bs={:0.0e}', args.bs),
            ('splits={}', args.opt_n_minibatches),
            ('miniepoch={}', args.opt_mini_epochs),
            ('seed={}', args.seed)
        ])
        exp_name = '_'.join(
            [key.format(val) for key, val in exp_layout.items()])
    else:
        exp_name = args.exp_name

    prefix = 'predatorprey'
    id_suffix = ('_' + str(args.run_id)) if args.run_id != 0 else ''
    exp_dir = './data/' + args.loc + '/' + exp_name + id_suffix

    # Enforce
    args.center_adv = False if args.entropy_method == 'max' else args.center_adv

    if args.mode == 'train':
        # making sequential log dir if name already exists
        @wrap_experiment(name=exp_name,
                         prefix=prefix,
                         log_dir=exp_dir,
                         snapshot_mode='last',
                         snapshot_gap=1)
        def train_predatorprey(ctxt=None, args_dict=vars(args)):
            args = SimpleNamespace(**args_dict)
            set_seed(args.seed)

            env = PredatorPreyWrapper(
                centralized=True,
                grid_shape=(args.grid_size, args.grid_size),
                n_agents=args.n_agents,
                n_preys=args.n_preys,
                step_cost=args.step_cost,
                max_steps=args.max_env_steps,
                prey_capture_reward=args.capture_reward,
                penalty=args.penalty,
                other_agent_visible=args.agent_visible)
            env = GarageEnv(env)

            runner = LocalRunnerWrapper(
                ctxt,
                eval=args.eval_during_training,
                n_eval_episodes=args.n_eval_episodes,
                eval_greedy=args.eval_greedy,
                eval_epoch_freq=args.eval_epoch_freq,
                save_env=env.pickleable)
            # logdir = runner._snapshotter._snapshot_dir

            hidden_nonlinearity = F.relu if args.hidden_nonlinearity == 'relu' \
                else torch.tanh
            policy = CentralizedCategoricalMLPPolicy(
                env.spec,
                n_agents=args.n_agents,
                hidden_nonlinearity=hidden_nonlinearity,
                hidden_sizes=args.hidden_sizes,
                name='centralized')

            baseline = GaussianMLPBaseline(env_spec=env.spec,
                                           hidden_sizes=(64, 64, 64))

            # Set max_path_length <= max_steps
            # If max_path_length > max_steps, algo will pad obs
            # obs.shape = torch.Size([n_paths, algo.max_path_length, feat_dim])
            algo = CentralizedMAPPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=args.max_env_steps,  # Notice
                discount=args.discount,
                center_adv=bool(args.center_adv),
                positive_adv=bool(args.positive_adv),
                gae_lambda=args.gae_lambda,
                policy_ent_coeff=args.ent,
                entropy_method=args.entropy_method,
                stop_entropy_gradient=(args.entropy_method == 'max'),
                optimization_n_minibatches=args.opt_n_minibatches,
                optimization_mini_epochs=args.opt_mini_epochs,
            )

            runner.setup(algo, env,
                         sampler_cls=CentralizedMAOnPolicyVectorizedSampler,
                         sampler_args={'n_envs': args.n_envs})
            runner.train(n_epochs=args.n_epochs, batch_size=args.bs)

        train_predatorprey(args_dict=vars(args))

    elif args.mode in ['restore', 'eval']:
        data = joblib.load(exp_dir + '/params.pkl')
        algo = data['algo']
        env = data['env']

        if args.mode == 'restore':
            from dicg.experiment.runner_utils import restore_training
            restore_training(exp_dir, exp_name, args,
                             env_saved=env.pickleable, env=env)

        elif args.mode == 'eval':
            env.eval(algo.policy,
                     n_episodes=args.n_eval_episodes,
                     greedy=args.eval_greedy,
                     load_from_file=True,
                     render=args.render)
            env.close()
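# --- Illustrative usage (not part of the original script) ---
# A minimal sketch of evaluating a saved run outside of run(), assuming a
# snapshot exists at ./data/<loc>/<exp_name>/params.pkl as produced by the
# training branch above. The directory name and episode count below are
# hypothetical; env.eval() is called exactly as in the eval branch.
import joblib

_snapshot = joblib.load('./data/local/centralized_ppo_seed=1/params.pkl')
_algo, _env = _snapshot['algo'], _snapshot['env']
_env.eval(_algo.policy,
          n_episodes=10,
          greedy=True,
          load_from_file=True,
          render=False)
_env.close()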