def make_auxiliary_tasks(tasks, model, buffer, cfg):
    """Construct the auxiliary tasks specified in the config (currently only Koopman)."""
    aux_tasks = dict()
    if tasks is not None:
        for task_name, task_setting in tasks.items():
            assert task_name not in aux_tasks
            if task_name == c.KOOPMAN:
                # The decoder mirrors the encoder's layer dimensions.
                task_setting[c.MODEL_SETTING][c.KWARGS][c.LAYERS_DIM] = model.encoder.layers_dim
                decoder = make_model(task_setting[c.MODEL_SETTING]).to(task_setting[c.DEVICE])
                dynamics = KoopmanDynamics(z_dim=task_setting[c.Z_DIM],
                                           u_dim=task_setting[c.U_DIM],
                                           device=task_setting[c.DEVICE])
                aux_opt = make_optimizer(list(decoder.parameters()) + list(dynamics.parameters()),
                                         task_setting[c.OPTIMIZER_SETTING])
                aux_tasks[c.KOOPMAN] = Koopman(rec_dim=task_setting[c.REC_DIM],
                                               batch_size=task_setting[c.BATCH_SIZE],
                                               decoder=decoder,
                                               encoder=model.encoder,
                                               dynamics=dynamics,
                                               opt=aux_opt,
                                               buffer=buffer,
                                               algo_params=cfg,
                                               reduction=task_setting[c.REDUCTION],
                                               loss_coef=task_setting[c.LOSS_COEF],
                                               device=task_setting[c.DEVICE])
            else:
                raise NotImplementedError
    return AuxiliaryTasks(aux_tasks)

def train_sac(experiment_config):
    """Set up and train a SAC agent according to experiment_config."""
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    model = make_model(experiment_config[c.MODEL_SETTING])
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([model.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = SAC(model=model,
                             policy_opt=policy_opt,
                             qs_opt=qs_opt,
                             alpha_opt=alpha_opt,
                             learn_alpha=experiment_config[c.LEARN_ALPHA],
                             buffer=buffer,
                             algo_params=experiment_config,
                             aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SAC,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)

def create_trajectories(args):
    """Roll out a trained agent and save the collected transitions as an expert buffer."""
    assert args.num_episodes > 0
    assert os.path.isfile(args.model_path)
    assert os.path.isfile(args.config_path)
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)

    set_seed(args.seed)
    with open(args.config_path, "rb") as f:
        config = pickle.load(f)

    env_setting = config[c.ENV_SETTING]
    env_setting[c.ENV_WRAPPERS][0][c.KWARGS][c.CREATE_ABSORBING_STATE] = True
    env_setting[c.ENV_WRAPPERS][0][c.KWARGS][c.MAX_EPISODE_LENGTH] = 1000
    env = make_env(env_setting, seed=args.seed)

    model = make_model(config[c.MODEL_SETTING])
    model.load_state_dict(torch.load(args.model_path)[c.STATE_DICT])

    agent = ACAgent(model=model,
                    learning_algorithm=None,
                    preprocess=config[c.EVALUATION_PREPROCESSING])

    config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = args.num_steps
    config[c.BUFFER_SETTING][c.STORE_NEXT_OBSERVATION] = True
    buffer_preprocessing = config[c.BUFFER_PREPROCESSING]
    expert_buffer = make_buffer(config[c.BUFFER_SETTING], args.seed)

    config[c.NUM_STEPS] = args.num_steps
    config[c.NUM_EPISODES] = args.num_episodes

    def transition_preprocess(obs, h_state, action, reward, done, info, next_obs, next_h_state):
        # In the absorbing state (last observation entry is 1), store a zero action.
        if obs[:, -1] == 1:
            action[:] = 0

        return {
            "obs": obs,
            "h_state": h_state,
            "act": action,
            "rew": [reward],
            "done": False,
            "info": info,
            "next_obs": next_obs,
            "next_h_state": next_h_state,
        }

    buffer_warmup(agent=agent,
                  env=env,
                  buffer=expert_buffer,
                  buffer_preprocess=buffer_preprocessing,
                  transition_preprocess=transition_preprocess,
                  experiment_settings=config)

    expert_buffer.save(save_path=args.save_path, end_with_done=False)

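# A minimal command-line entry point sketch for create_trajectories, inferred from the
# attributes the function reads off `args` (seed, num_episodes, num_steps, model_path,
# config_path, save_path). The flag names and defaults are illustrative assumptions,
# not the repository's actual CLI.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Collect expert trajectories from a trained model.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num_episodes", type=int, required=True)
    parser.add_argument("--num_steps", type=int, required=True)
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_path", type=str, required=True)
    parser.add_argument("--save_path", type=str, required=True)
    create_trajectories(parser.parse_args())
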
def load_model(seed, config_path, model_path, device, intention=0):
    """Load a trained agent and its environment from a saved config and checkpoint."""
    assert os.path.isfile(model_path)
    assert os.path.isfile(config_path)

    with open(config_path, "rb") as f:
        config = pickle.load(f)

    env_setting = config[c.ENV_SETTING]
    env = make_env(env_setting, seed=seed)

    if device is None:
        device = config[c.DEVICE]
    else:
        device = torch.device(device)

    buffer_preprocessing = config[c.BUFFER_PREPROCESSING]

    if config[c.ALGO] in (c.SACX,):
        config[c.INTENTIONS_SETTING][c.KWARGS][c.DEVICE] = device
        intentions = make_model(config[c.INTENTIONS_SETTING])
        intentions_model = torch.load(model_path, map_location=device.type)[c.INTENTIONS]
        if c.ALGORITHM in intentions_model.keys():
            intentions.load_state_dict(intentions_model[c.ALGORITHM][c.STATE_DICT])
        else:
            intentions.load_state_dict(intentions_model[c.STATE_DICT])

        # Fix the scheduler to a single intention for evaluation.
        scheduler = FixedScheduler(intention_i=intention,
                                   num_tasks=config[c.SCHEDULER_SETTING][c.TRAIN][c.KWARGS][c.NUM_TASKS])
        agent = SACXAgent(scheduler=scheduler,
                          intentions=intentions,
                          learning_algorithm=None,
                          scheduler_period=c.MAX_INT,
                          preprocess=config[c.EVALUATION_PREPROCESSING])
    else:
        model = make_model(config[c.MODEL_SETTING])
        saved_model = torch.load(model_path)
        if config[c.ALGO] == c.DAC:
            saved_model = saved_model[c.ALGORITHM]
        model.load_state_dict(saved_model[c.STATE_DICT])
        if hasattr(model, c.OBS_RMS):
            model.obs_rms = saved_model[c.OBS_RMS]
        agent = ACAgent(model=model,
                        learning_algorithm=None,
                        preprocess=config[c.EVALUATION_PREPROCESSING])

    return config, env, buffer_preprocessing, agent

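# A minimal sketch of how load_model might be used. The file paths below are
# hypothetical placeholders; the returned tuple follows the function above
# (saved experiment config, environment, buffer preprocessing, evaluation agent).
def _example_load_trained_agent(device="cpu"):
    config, env, buffer_preprocessing, agent = load_model(
        seed=0,
        config_path="results/experiment_setting.pkl",  # assumed location of the pickled config
        model_path="results/best_model.pt",            # assumed location of the checkpoint
        device=device,
        intention=0)
    return config, env, buffer_preprocessing, agent
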
def train_bc(experiment_config):
    """Set up and train a behavioural cloning (BC) agent on an expert buffer."""
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM])
    model = make_model(experiment_config[c.MODEL_SETTING])
    expert_buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                                seed,
                                experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    optimizer = make_optimizer(model.parameters(),
                               experiment_config[c.OPTIMIZER_SETTING][c.POLICY])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     expert_buffer,
                                     experiment_config)

    learning_algorithm = BC(model=model,
                            optimizer=optimizer,
                            expert_buffer=expert_buffer,
                            algo_params=experiment_config,
                            aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.BC,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)

def train_sac_diayn(experiment_config):
    """Set up and train a SAC agent with DIAYN skill discovery."""
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    model = make_model(experiment_config[c.MODEL_SETTING])
    discriminator = make_model(experiment_config[c.DISCRIMINATOR_SETTING])
    prior = experiment_config[c.PRIOR]
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([model.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])
    discriminator_opt = make_optimizer(discriminator.parameters(),
                                       experiment_config[c.OPTIMIZER_SETTING][c.DISCRIMINATOR])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = SACDIAYN(model=model,
                                  policy_opt=policy_opt,
                                  qs_opt=qs_opt,
                                  alpha_opt=alpha_opt,
                                  learn_alpha=experiment_config[c.LEARN_ALPHA],
                                  buffer=buffer,
                                  algo_params=experiment_config,
                                  aux_tasks=aux_tasks)

    diayn = DIAYN(discriminator=discriminator,
                  prior=prior,
                  discriminator_opt=discriminator_opt,
                  learning_algorithm=learning_algorithm,
                  algo_params=experiment_config)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = DIAYNAgent(prior=prior,
                       model=model,
                       learning_algorithm=diayn,
                       preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = DIAYNAgent(prior=prior,
                                      model=model,
                                      learning_algorithm=None,
                                      preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    class GetTask:
        def __init__(self, agent):
            self.agent = agent

        def __call__(self, obs):
            # Concatenate task to the end of observation
            return np.concatenate((obs, self.agent.curr_high_level_act), axis=-1)

        def reset(self):
            pass

    buffer_preprocessing = Compose([buffer_preprocessing, GetTask(agent)])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SAC,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)

def train_grac(experiment_config):
    """Set up and train a GRAC agent according to experiment_config."""
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    # experiment_config[c.MODEL_SETTING][c.KWARGS][c.CEM] = CEMQ(cov_noise_init=experiment_config[c.COV_NOISE_INIT],
    #                                                            cov_noise_end=experiment_config[c.COV_NOISE_END],
    #                                                            cov_noise_tau=experiment_config[c.COV_NOISE_TAU],
    #                                                            action_dim=experiment_config[c.ACTION_DIM],
    #                                                            batch_size=1,
    #                                                            num_iters=experiment_config[c.NUM_ITERS],
    #                                                            pop_size=experiment_config[c.POP_SIZE],
    #                                                            elite_size=experiment_config[c.ELITE_SIZE],
    #                                                            device=experiment_config[c.DEVICE],
    #                                                            min_action=experiment_config[c.MIN_ACTION],
    #                                                            max_action=experiment_config[c.MAX_ACTION])
    model = make_model(experiment_config[c.MODEL_SETTING])
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    # policy_opt = make_optimizer(model.policy_parameters, experiment_config[c.OPTIMIZER_SETTING])
    policy_opt = make_optimizer(model.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     buffer,
                                     experiment_config)

    learning_algorithm = GRAC(model=model,
                              policy_opt=policy_opt,
                              qs_opt=qs_opt,
                              buffer=buffer,
                              algo_params=experiment_config,
                              aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.GRAC,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)

def train_sacx_sac_drq(experiment_config):
    """Set up and train a SAC-X agent whose intentions are updated with SAC-DrQ."""
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING],
                         seed,
                         experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    intentions = make_model(experiment_config[c.INTENTIONS_SETTING])
    policy_opt = make_optimizer(intentions.policy_parameters,
                                experiment_config[c.OPTIMIZER_SETTING][c.INTENTIONS])
    qs_opt = make_optimizer(intentions.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([intentions.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     intentions,
                                     buffer,
                                     experiment_config)

    update_intentions = UpdateSACDrQIntentions(model=intentions,
                                               policy_opt=policy_opt,
                                               qs_opt=qs_opt,
                                               alpha_opt=alpha_opt,
                                               learn_alpha=experiment_config[c.LEARN_ALPHA],
                                               buffer=buffer,
                                               algo_params=experiment_config,
                                               aux_tasks=aux_tasks)

    scheduler = make_model(experiment_config[c.SCHEDULER_SETTING][c.TRAIN])
    update_scheduler = UpdateQScheduler(model=scheduler,
                                        algo_params=experiment_config)

    learning_algorithm = SACX(update_scheduler=update_scheduler,
                              update_intentions=update_intentions,
                              algo_params=experiment_config)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = SACXAgent(scheduler=scheduler,
                      intentions=intentions,
                      learning_algorithm=learning_algorithm,
                      scheduler_period=experiment_config[c.SCHEDULER_SETTING][c.TRAIN][c.SCHEDULER_PERIOD],
                      preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = SACXAgent(scheduler=make_model(experiment_config[c.SCHEDULER_SETTING][c.EVALUATION]),
                                     intentions=intentions,
                                     learning_algorithm=None,
                                     scheduler_period=experiment_config[c.SCHEDULER_SETTING][c.EVALUATION][c.SCHEDULER_PERIOD],
                                     preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SACX,
                                                    cfg=experiment_config)

    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          auxiliary_reward=experiment_config[c.AUXILIARY_REWARDS].reward,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)