import gzip
import os
import pickle

import numpy as np
import torch

from tqdm import tqdm

# Repo-internal names (the constants module `c`, factory helpers such as
# make_env/make_model/make_buffer/make_optimizer, agents, and algorithms) are
# assumed to be imported elsewhere in this package and in scope here.


def create_trajectories(args):
    assert args.num_episodes > 0
    assert os.path.isfile(args.model_path)
    assert os.path.isfile(args.config_path)
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)

    set_seed(args.seed)
    with open(args.config_path, "rb") as f:
        config = pickle.load(f)

    env_setting = config[c.ENV_SETTING]
    # Enable the absorbing-state wrapper so termination is encoded in the last
    # observation dimension.
    env_setting[c.ENV_WRAPPERS][0][c.KWARGS][c.CREATE_ABSORBING_STATE] = True
    env_setting[c.ENV_WRAPPERS][0][c.KWARGS][c.MAX_EPISODE_LENGTH] = 1000
    env = make_env(env_setting, seed=args.seed)

    model = make_model(config[c.MODEL_SETTING])
    model.load_state_dict(torch.load(args.model_path)[c.STATE_DICT])

    agent = ACAgent(model=model,
                    learning_algorithm=None,
                    preprocess=config[c.EVALUATION_PREPROCESSING])

    config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = args.num_steps
    config[c.BUFFER_SETTING][c.STORE_NEXT_OBSERVATION] = True
    buffer_preprocessing = config[c.BUFFER_PREPROCESSING]
    expert_buffer = make_buffer(config[c.BUFFER_SETTING], args.seed)

    config[c.NUM_STEPS] = args.num_steps
    config[c.NUM_EPISODES] = args.num_episodes

    def transition_preprocess(obs, h_state, action, reward, done, info,
                              next_obs, next_h_state):
        # In the absorbing state (flag set in the last observation dimension),
        # force a zero action.
        if obs[:, -1] == 1:
            action[:] = 0

        return {
            "obs": obs,
            "h_state": h_state,
            "act": action,
            "rew": [reward],
            # Termination is carried by the absorbing-state flag, so stored
            # transitions are never marked done.
            "done": False,
            "info": info,
            "next_obs": next_obs,
            "next_h_state": next_h_state,
        }

    buffer_warmup(agent=agent,
                  env=env,
                  buffer=expert_buffer,
                  buffer_preprocess=buffer_preprocessing,
                  transition_preprocess=transition_preprocess,
                  experiment_settings=config)
    expert_buffer.save(save_path=args.save_path, end_with_done=False)
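# Usage sketch (hypothetical CLI; the flag names mirror the `args` attributes
# read above, but this exact entry point is an assumption, not part of the repo):
def _create_trajectories_cli():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_path", type=str, required=True)
    parser.add_argument("--save_path", type=str, required=True)
    parser.add_argument("--num_episodes", type=int, default=10)
    parser.add_argument("--num_steps", type=int, default=10000)
    create_trajectories(parser.parse_args())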
def train_sac(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    model = make_model(experiment_config[c.MODEL_SETTING])
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([model.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = SAC(model=model,
                             policy_opt=policy_opt,
                             qs_opt=qs_opt,
                             alpha_opt=alpha_opt,
                             learn_alpha=experiment_config[c.LEARN_ALPHA],
                             buffer=buffer,
                             algo_params=experiment_config,
                             aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SAC,
                                                    cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
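# Illustrative (partial) shape of experiment_config as consumed by train_sac.
# The keys come from the constants module `c`; the values below are
# placeholders, not repo defaults:
#
#   experiment_config = {
#       c.SEED: 0,
#       c.SAVE_PATH: "./results/sac",
#       c.BUFFER_PREPROCESSING: Identity(),
#       c.ENV_SETTING: ...,        # consumed by make_env
#       c.MODEL_SETTING: ...,      # consumed by make_model
#       c.BUFFER_SETTING: ...,     # consumed by make_buffer
#       c.OPTIMIZER_SETTING: {c.POLICY: ..., c.QS: ..., c.ALPHA: ...},
#       c.AUXILIARY_TASKS: ...,
#       c.LEARN_ALPHA: True,
#       c.LOAD_MODEL: False,       # or a checkpoint path
#       c.EVALUATION_FREQUENCY: 5000,
#       c.EVALUATION_PREPROCESSING: Identity(),
#   }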
def evaluate(args):
    set_seed(args.seed)
    assert args.num_episodes > 0

    config, env, buffer_preprocessing, agent = load_model(
        args.seed, args.config_path, args.model_path, args.device, args.intention)

    if c.AUXILIARY_REWARDS in config:
        auxiliary_reward = config[c.AUXILIARY_REWARDS].reward
    else:
        # Fall back to the environment reward, wrapped as a length-1 vector.
        auxiliary_reward = lambda reward, **kwargs: np.array([reward])

    rets = evaluate_policy(agent=agent,
                           env=env,
                           buffer_preprocess=buffer_preprocessing,
                           num_episodes=args.num_episodes,
                           clip_action=config[c.CLIP_ACTION],
                           min_action=config[c.MIN_ACTION],
                           max_action=config[c.MAX_ACTION],
                           render=args.render,
                           auxiliary_reward=auxiliary_reward,
                           verbose=True)

    print("=" * 100)
    print("Interacted with {} episodes".format(args.num_episodes))
    print("Average Return: {} - Std: {}".format(np.mean(rets, axis=1),
                                                np.std(rets, axis=1)))
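# Note (assumption): evaluate_policy is expected to return per-dimension
# returns stacked over episodes, i.e. rets has shape
# (num_reward_dims, num_episodes), so the axis=1 statistics above aggregate
# over episodes for each reward dimension separately.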
def train_bc(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM])
    model = make_model(experiment_config[c.MODEL_SETTING])
    expert_buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                                seed,
                                experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    optimizer = make_optimizer(model.parameters(),
                               experiment_config[c.OPTIMIZER_SETTING][c.POLICY])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     expert_buffer,
                                     experiment_config)

    learning_algorithm = BC(model=model,
                            optimizer=optimizer,
                            expert_buffer=expert_buffer,
                            algo_params=experiment_config,
                            aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.BC,
                                                    cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
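# BC trains purely offline from the expert buffer, so FakeEnv above is assumed
# to be a no-interaction stub that only advertises the observation dimension.
# A minimal sketch of such a stub (hypothetical; the repo's FakeEnv may differ):
class _FakeEnvSketch:
    def __init__(self, obs_dim):
        self.obs_dim = obs_dim

    def reset(self):
        return np.zeros(self.obs_dim)

    def step(self, action):
        # Zero observation, zero reward, immediate termination, empty info.
        return np.zeros(self.obs_dim), 0.0, True, {}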
def train_sac_diayn(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    model = make_model(experiment_config[c.MODEL_SETTING])
    discriminator = make_model(experiment_config[c.DISCRIMINATOR_SETTING])
    prior = experiment_config[c.PRIOR]
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([model.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])
    discriminator_opt = make_optimizer(
        discriminator.parameters(),
        experiment_config[c.OPTIMIZER_SETTING][c.DISCRIMINATOR])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = SACDIAYN(model=model,
                                  policy_opt=policy_opt,
                                  qs_opt=qs_opt,
                                  alpha_opt=alpha_opt,
                                  learn_alpha=experiment_config[c.LEARN_ALPHA],
                                  buffer=buffer,
                                  algo_params=experiment_config,
                                  aux_tasks=aux_tasks)
    diayn = DIAYN(discriminator=discriminator,
                  prior=prior,
                  discriminator_opt=discriminator_opt,
                  learning_algorithm=learning_algorithm,
                  algo_params=experiment_config)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = DIAYNAgent(prior=prior,
                       model=model,
                       learning_algorithm=diayn,
                       preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = DIAYNAgent(prior=prior,
                                      model=model,
                                      learning_algorithm=None,
                                      preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    class GetTask:
        def __init__(self, agent):
            self.agent = agent

        def __call__(self, obs):
            # Concatenate task to the end of observation
            return np.concatenate((obs, self.agent.curr_high_level_act), axis=-1)

        def reset(self):
            pass

    # Store the currently-active skill alongside each observation in the buffer.
    buffer_preprocessing = Compose([buffer_preprocessing, GetTask(agent)])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SAC,
                                                    cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
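# Worked example of the skill-concatenation preprocessing above, assuming a
# one-hot prior over 4 skills (a standalone mirror of GetTask.__call__):
def _concat_skill_demo():
    obs = np.array([0.1, -0.3])
    curr_high_level_act = np.array([0., 1., 0., 0.])  # sampled one-hot skill
    augmented = np.concatenate((obs, curr_high_level_act), axis=-1)
    assert augmented.shape == (6,)  # obs_dim + num_skills
    return augmented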
def collect_data(args):
    set_seed(args.seed)
    assert args.num_episodes > 0
    assert args.num_samples > 0
    assert 0 <= args.mixture_ratio <= 1

    dir_exists = os.path.isdir(args.save_path)
    assert dir_exists or not os.path.exists(args.save_path)
    if not dir_exists:
        os.makedirs(args.save_path, exist_ok=True)

    config, env, buffer_preprocess, agent = load_model(
        args.seed, args.config_path, args.model_path, args.device, args.intention)

    init_observations = []
    observations = []
    actions = []
    rewards = []
    dones = []

    episodes_pbar = tqdm(total=args.num_episodes)
    samples_pbar = tqdm(total=args.num_samples)

    sample_i = 0
    complete_episodes = 0
    eval_returns = []
    for episode_i in range(args.num_episodes):
        eval_returns.append(0)
        obs = env.reset()
        init_observations.append(obs)
        buffer_preprocess.reset()
        obs = buffer_preprocess(obs)
        h_state = agent.reset()
        done = False

        while not done:
            if hasattr(env, c.RENDER) and args.render:
                env.render()

            if args.deterministic:
                action, h_state, act_info = agent.deterministic_action(
                    obs=obs, hidden_state=h_state)
            else:
                action, h_state, act_info = agent.compute_action(
                    obs=obs, hidden_state=h_state)

            # With probability mixture_ratio, replace the policy action with a
            # uniform random action.
            if np.random.uniform() < args.mixture_ratio:
                action = np.random.uniform(config[c.MIN_ACTION],
                                           config[c.MAX_ACTION],
                                           config[c.ACTION_DIM])

            # The unclipped action is what gets stored.
            actions.append(action)
            if config[c.CLIP_ACTION]:
                action = np.clip(action,
                                 a_min=config[c.MIN_ACTION],
                                 a_max=config[c.MAX_ACTION])

            obs, reward, done, _ = env.step(action)
            observations.append(obs)
            rewards.append(reward)
            dones.append(done)

            obs = buffer_preprocess(obs)
            eval_returns[-1] += reward
            sample_i += 1
            samples_pbar.update(1)
            if sample_i >= args.num_samples:
                break
        else:
            # The episode finished naturally (no break): count it and move on.
            complete_episodes += 1
            episodes_pbar.update(1)
            continue
        # Reached only via the inner break: the sample budget is exhausted.
        break

    ret_mean = np.mean(eval_returns)
    ret_std = np.std(eval_returns)
    ret_max = np.max(eval_returns)
    ret_min = np.min(eval_returns)

    print("=" * 100)
    print("Interacted with {} complete episodes ({} timesteps)".format(
        complete_episodes, sample_i))
    print("Average Return: {} - Std: {}".format(ret_mean, ret_std))
    print("Max Return: {} - Min Return: {}".format(ret_max, ret_min))

    for (filename, data) in zip(("init_obss", "obss", "acts", "rews", "dones"),
                                (init_observations, observations, actions,
                                 rewards, dones)):
        with gzip.open(f"{args.save_path}/{filename}.pkl", "wb") as f:
            pickle.dump(data, f)

    with gzip.open(f"{args.save_path}/metadata.pkl", "wb") as f:
        pickle.dump(
            {
                "returns": eval_returns,
                "min": ret_min,
                "max": ret_max,
                "avg": ret_mean,
                "std": ret_std,
                **args.__dict__,
            }, f)
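# Loading sketch for the artifacts written above (hypothetical helper; the
# file names mirror those used in collect_data):
def _load_collected_data(save_path):
    data = {}
    for filename in ("init_obss", "obss", "acts", "rews", "dones", "metadata"):
        with gzip.open(f"{save_path}/{filename}.pkl", "rb") as f:
            data[filename] = pickle.load(f)
    return data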
def train_grac(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    # experiment_config[c.MODEL_SETTING][c.KWARGS][c.CEM] = CEMQ(
    #     cov_noise_init=experiment_config[c.COV_NOISE_INIT],
    #     cov_noise_end=experiment_config[c.COV_NOISE_END],
    #     cov_noise_tau=experiment_config[c.COV_NOISE_TAU],
    #     action_dim=experiment_config[c.ACTION_DIM],
    #     batch_size=1,
    #     num_iters=experiment_config[c.NUM_ITERS],
    #     pop_size=experiment_config[c.POP_SIZE],
    #     elite_size=experiment_config[c.ELITE_SIZE],
    #     device=experiment_config[c.DEVICE],
    #     min_action=experiment_config[c.MIN_ACTION],
    #     max_action=experiment_config[c.MAX_ACTION])
    model = make_model(experiment_config[c.MODEL_SETTING])
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    # policy_opt = make_optimizer(model.policy_parameters, experiment_config[c.OPTIMIZER_SETTING])
    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = GRAC(model=model,
                              policy_opt=policy_opt,
                              qs_opt=qs_opt,
                              buffer=buffer,
                              algo_params=experiment_config,
                              aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.GRAC,
                                                    cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
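# Note: unlike train_sac above, no alpha optimizer is built here, since GRAC
# has no learned entropy temperature. The commented-out CEMQ block is assumed
# to reflect GRAC's CEM-based action search, wired through c.MODEL_SETTING
# when enabled.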
def train_sacx_sac_drq(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    intentions = make_model(experiment_config[c.INTENTIONS_SETTING])
    policy_opt = make_optimizer(intentions.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.INTENTIONS])
    qs_opt = make_optimizer(intentions.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([intentions.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     intentions,
                                     buffer,
                                     experiment_config)

    update_intentions = UpdateSACDrQIntentions(
        model=intentions,
        policy_opt=policy_opt,
        qs_opt=qs_opt,
        alpha_opt=alpha_opt,
        learn_alpha=experiment_config[c.LEARN_ALPHA],
        buffer=buffer,
        algo_params=experiment_config,
        aux_tasks=aux_tasks)

    scheduler = make_model(experiment_config[c.SCHEDULER_SETTING][c.TRAIN])
    update_scheduler = UpdateQScheduler(model=scheduler,
                                        algo_params=experiment_config)

    learning_algorithm = SACX(update_scheduler=update_scheduler,
                              update_intentions=update_intentions,
                              algo_params=experiment_config)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = SACXAgent(
        scheduler=scheduler,
        intentions=intentions,
        learning_algorithm=learning_algorithm,
        scheduler_period=experiment_config[c.SCHEDULER_SETTING][c.TRAIN][c.SCHEDULER_PERIOD],
        preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = SACXAgent(
            scheduler=make_model(experiment_config[c.SCHEDULER_SETTING][c.EVALUATION]),
            intentions=intentions,
            learning_algorithm=None,
            scheduler_period=experiment_config[c.SCHEDULER_SETTING][c.EVALUATION][c.SCHEDULER_PERIOD],
            preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SACX,
                                                    cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          auxiliary_reward=experiment_config[c.AUXILIARY_REWARDS].reward,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
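# Scheduling sketch (illustrative; the method names below are hypothetical):
# in SAC-X the scheduler re-picks an intention (sub-task) every
# `scheduler_period` environment steps, and the selected intention's policy
# head acts until the next switch:
#
#   for t in range(max_steps):
#       if t % scheduler_period == 0:
#           intention_idx = scheduler.select_intention()   # hypothetical call
#       action = intentions.act(obs, intention_idx)        # hypothetical call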