Code example #1
def create_trajectories(args):
    assert args.num_episodes > 0
    assert os.path.isfile(args.model_path)
    assert os.path.isfile(args.config_path)
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)

    set_seed(args.seed)
    with open(args.config_path, "rb") as f:
        config = pickle.load(f)

    env_setting = config[c.ENV_SETTING]
    # Turn on the absorbing-state wrapper and cap each rollout at 1000 steps for collection.
    env_setting[c.ENV_WRAPPERS][0][c.KWARGS][c.CREATE_ABSORBING_STATE] = True
    env_setting[c.ENV_WRAPPERS][0][c.KWARGS][c.MAX_EPISODE_LENGTH] = 1000
    env = make_env(env_setting, seed=args.seed)
    model = make_model(config[c.MODEL_SETTING])
    model.load_state_dict(torch.load(args.model_path)[c.STATE_DICT])
    
    agent = ACAgent(model=model,
                    learning_algorithm=None,
                    preprocess=config[c.EVALUATION_PREPROCESSING])

    config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = args.num_steps
    config[c.BUFFER_SETTING][c.STORE_NEXT_OBSERVATION] = True
    buffer_preprocessing = config[c.BUFFER_PREPROCESSING]

    expert_buffer = make_buffer(config[c.BUFFER_SETTING], args.seed)

    config[c.NUM_STEPS] = args.num_steps
    config[c.NUM_EPISODES] = args.num_episodes

    def transition_preprocess(obs,
                              h_state,
                              action,
                              reward,
                              done,
                              info,
                              next_obs,
                              next_h_state):
        # Once in the absorbing state (flagged by the last observation dimension added by
        # the wrapper configured above), store a zero action.
        if obs[:, -1] == 1:
            action[:] = 0

        return {
            "obs": obs,
            "h_state": h_state,
            "act": action,
            "rew": [reward],
            "done": False,
            "info": info,
            "next_obs": next_obs,
            "next_h_state": next_h_state,
        }

    buffer_warmup(agent=agent,
                  env=env,
                  buffer=expert_buffer,
                  buffer_preprocess=buffer_preprocessing,
                  transition_preprocess=transition_preprocess,
                  experiment_settings=config)

    expert_buffer.save(save_path=args.save_path, end_with_done=False)
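
A minimal sketch of a command-line entry point that matches the attributes create_trajectories reads off args is shown below. The flag names, defaults, and types are assumptions for illustration, not the repository's actual script.

# Hypothetical CLI wrapper for create_trajectories; flag names and defaults are assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num_episodes", type=int, required=True)
    parser.add_argument("--num_steps", type=int, required=True)
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_path", type=str, required=True)
    parser.add_argument("--save_path", type=str, required=True)
    create_trajectories(parser.parse_args())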
Code example #2
File: train_sac.py Project: chanb/rl_sandbox_public
def train_sac(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING,
                                                 Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    model = make_model(experiment_config[c.MODEL_SETTING])
    buffer = make_buffer(
        experiment_config[c.BUFFER_SETTING], seed,
        experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    policy_opt = make_optimizer(
        model.policy_parameters,
        experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([model.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model, buffer, experiment_config)

    learning_algorithm = SAC(model=model,
                             policy_opt=policy_opt,
                             qs_opt=qs_opt,
                             alpha_opt=alpha_opt,
                             learn_alpha=experiment_config[c.LEARN_ALPHA],
                             buffer=buffer,
                             algo_params=experiment_config,
                             aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])
    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(
            model=model,
            learning_algorithm=None,
            preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SAC,
                                                    cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
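
train_sac expects a fully populated experiment_config dictionary keyed by the repository's constants module. One plausible way to drive it, assuming such a config was pickled beforehand (code example #1 reloads a config from a pickle in the same way), is sketched below; the file name is a placeholder.

# Hypothetical driver for train_sac; assumes a complete experiment config was pickled elsewhere.
import pickle

with open("sac_experiment_config.pkl", "rb") as f:
    experiment_config = pickle.load(f)
train_sac(experiment_config)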
Code example #3
File: evaluate.py Project: chanb/rl_sandbox_public
def evaluate(args):
    set_seed(args.seed)
    assert args.num_episodes > 0

    config, env, buffer_preprocessing, agent = load_model(
        args.seed, args.config_path, args.model_path, args.device,
        args.intention)
    if c.AUXILIARY_REWARDS in config:
        auxiliary_reward = config[c.AUXILIARY_REWARDS].reward
    else:
        auxiliary_reward = lambda reward, **kwargs: np.array([reward])

    rets = evaluate_policy(
        agent=agent,
        env=env,
        buffer_preprocess=buffer_preprocessing,
        num_episodes=args.num_episodes,
        clip_action=config[c.CLIP_ACTION],
        min_action=config[c.MIN_ACTION],
        max_action=config[c.MAX_ACTION],
        render=args.render,
        auxiliary_reward=auxiliary_reward,
        verbose=True,
    )

    print("=" * 100)
    print("Interacted with {} episodes".format(args.num_episodes))
    print("Average Return: {} - Std: {}".format(np.mean(rets, axis=1),
                                                np.std(rets, axis=1)))
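
As with code example #1, a command-line wrapper for evaluate can be reconstructed from the attributes it reads off args. The sketch below is hypothetical; flag names, defaults, and types are assumptions.

# Hypothetical CLI wrapper for evaluate; flag names and defaults are assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num_episodes", type=int, required=True)
    parser.add_argument("--config_path", type=str, required=True)
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--device", type=str, default="cpu")
    parser.add_argument("--intention", type=int, default=0)
    parser.add_argument("--render", action="store_true")
    evaluate(parser.parse_args())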
Code example #4
def train_bc(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM])
    model = make_model(experiment_config[c.MODEL_SETTING])
    expert_buffer = make_buffer(experiment_config[c.BUFFER_SETTING], seed, experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))
    optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model,
                                     expert_buffer,
                                     experiment_config)

    learning_algorithm = BC(model=model,
                            optimizer=optimizer,
                            expert_buffer=expert_buffer,
                            algo_params=experiment_config,
                            aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])
    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(model=model,
                                   learning_algorithm=None,
                                   preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.BC, cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
Code example #5
def train_sac_diayn(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING,
                                                 Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    model = make_model(experiment_config[c.MODEL_SETTING])
    discriminator = make_model(experiment_config[c.DISCRIMINATOR_SETTING])
    prior = experiment_config[c.PRIOR]
    buffer = make_buffer(
        experiment_config[c.BUFFER_SETTING], seed,
        experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    policy_opt = make_optimizer(
        model.policy_parameters,
        experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([model.log_alpha],
                               experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])
    discriminator_opt = make_optimizer(
        discriminator.parameters(),
        experiment_config[c.OPTIMIZER_SETTING][c.DISCRIMINATOR])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model, buffer, experiment_config)

    learning_algorithm = SACDIAYN(model=model,
                                  policy_opt=policy_opt,
                                  qs_opt=qs_opt,
                                  alpha_opt=alpha_opt,
                                  learn_alpha=experiment_config[c.LEARN_ALPHA],
                                  buffer=buffer,
                                  algo_params=experiment_config,
                                  aux_tasks=aux_tasks)

    diayn = DIAYN(discriminator=discriminator,
                  prior=prior,
                  discriminator_opt=discriminator_opt,
                  learning_algorithm=learning_algorithm,
                  algo_params=experiment_config)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = DIAYNAgent(
        prior=prior,
        model=model,
        learning_algorithm=diayn,
        preprocess=experiment_config[c.EVALUATION_PREPROCESSING])
    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = DIAYNAgent(
            prior=prior,
            model=model,
            learning_algorithm=None,
            preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    class GetTask:
        def __init__(self, agent):
            self.agent = agent

        def __call__(self, obs):
            # Concatenate task to the end of observation
            return np.concatenate((obs, self.agent.curr_high_level_act),
                                  axis=-1)

        def reset(self):
            pass

    buffer_preprocessing = Compose([buffer_preprocessing, GetTask(agent)])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.SAC,
                                                    cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
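
The GetTask preprocessor above appends the agent's current high-level action (the sampled DIAYN skill) to each observation before it enters the buffer. The standalone snippet below, with made-up shapes, only illustrates what that concatenation produces; it is not repository code.

# Standalone illustration with made-up shapes; not repository code.
import numpy as np

obs = np.zeros((1, 4))                    # hypothetical 4-dimensional observation
skill_one_hot = np.array([[0., 1., 0.]])  # hypothetical 3-skill one-hot from the prior
augmented = np.concatenate((obs, skill_one_hot), axis=-1)
assert augmented.shape == (1, 7)          # the buffer stores the skill-conditioned observation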
Code example #6
def collect_data(args):
    set_seed(args.seed)
    assert args.num_episodes > 0
    assert args.num_samples > 0
    assert 0 <= args.mixture_ratio <= 1

    dir_exists = os.path.isdir(args.save_path)
    assert dir_exists or not os.path.exists(args.save_path)

    if not dir_exists:
        os.makedirs(args.save_path, exist_ok=True)

    config, env, buffer_preprocess, agent = load_model(args.seed,
                                                       args.config_path,
                                                       args.model_path,
                                                       args.device,
                                                       args.intention)

    init_observations = []
    observations = []
    actions = []
    rewards = []
    dones = []

    episodes_pbar = tqdm(total=args.num_episodes)
    samples_pbar = tqdm(total=args.num_samples)

    sample_i = 0
    eval_returns = []
    for episode_i in range(args.num_episodes):
        eval_returns.append(0)
        obs = env.reset()

        init_observations.append(obs)

        buffer_preprocess.reset()
        obs = buffer_preprocess(obs)
        h_state = agent.reset()
        done = False

        while not done:
            if hasattr(env, c.RENDER) and args.render:
                env.render()

            if args.deterministic:
                action, h_state, act_info = agent.deterministic_action(
                    obs=obs, hidden_state=h_state)
            else:
                action, h_state, act_info = agent.compute_action(
                    obs=obs, hidden_state=h_state)

            if np.random.uniform() < args.mixture_ratio:
                action = np.random.uniform(config[c.MIN_ACTION],
                                           config[c.MAX_ACTION],
                                           config[c.ACTION_DIM])

            actions.append(action)

            if config[c.CLIP_ACTION]:
                action = np.clip(action,
                                 a_min=config[c.MIN_ACTION],
                                 a_max=config[c.MAX_ACTION])

            obs, reward, done, _ = env.step(action)

            observations.append(obs)
            rewards.append(reward)
            dones.append(done)
            obs = buffer_preprocess(obs)

            eval_returns[-1] += reward
            sample_i += 1
            samples_pbar.update(1)
            if sample_i >= args.num_samples:
                break
        else:
            # while-else: the episode ended on its own, without exhausting the sample budget.
            episodes_pbar.update(1)
            continue
        # The sample budget was reached mid-episode: stop collecting entirely.
        break

    ret_mean = np.mean(eval_returns)
    ret_std = np.std(eval_returns)
    ret_max = np.max(eval_returns)
    ret_min = np.min(eval_returns)

    print("=" * 100)
    print("Interacted with {} complete episodes ({} timesteps)".format(
        episode_i, sample_i))
    print("Average Return: {} - Std: {}".format(ret_mean, ret_std))
    print("Max Return: {} - Min Return: {}".format(ret_max, ret_min))

    for (filename, data) in zip(
        ("init_obss", "obss", "acts", "rews", "dones"),
        (init_observations, observations, actions, rewards, dones)):
        with gzip.open(f"{args.save_path}/{filename}.pkl", "wb") as f:
            pickle.dump(data, f)

    with gzip.open(f"{args.save_path}/metadata.pkl", "wb") as f:
        pickle.dump(
            {
                "returns": eval_returns,
                "min": ret_min,
                "max": ret_max,
                "avg": ret_mean,
                "std": ret_std,
                **args.__dict__,
            }, f)
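
collect_data writes each list to its own gzip-compressed pickle plus a metadata file. Loading the data back only needs the matching gzip/pickle calls, as in the sketch below; the directory name is a placeholder.

# Sketch of loading the collected data back; the directory name is a placeholder.
import gzip
import pickle

save_path = "path/to/collected_data"
with gzip.open(f"{save_path}/obss.pkl", "rb") as f:
    observations = pickle.load(f)
with gzip.open(f"{save_path}/metadata.pkl", "rb") as f:
    metadata = pickle.load(f)
print(metadata["avg"], metadata["std"])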
Code example #7
File: train_grac.py Project: chanb/rl_sandbox_public
def train_grac(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING,
                                                 Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    # experiment_config[c.MODEL_SETTING][c.KWARGS][c.CEM] = CEMQ(cov_noise_init=experiment_config[c.COV_NOISE_INIT],
    #                                                            cov_noise_end=experiment_config[c.COV_NOISE_END],
    #                                                            cov_noise_tau=experiment_config[c.COV_NOISE_TAU],
    #                                                            action_dim=experiment_config[c.ACTION_DIM],
    #                                                            batch_size=1,
    #                                                            num_iters=experiment_config[c.NUM_ITERS],
    #                                                            pop_size=experiment_config[c.POP_SIZE],
    #                                                            elite_size=experiment_config[c.ELITE_SIZE],
    #                                                            device=experiment_config[c.DEVICE],
    #                                                            min_action=experiment_config[c.MIN_ACTION],
    #                                                            max_action=experiment_config[c.MAX_ACTION])
    model = make_model(experiment_config[c.MODEL_SETTING])
    buffer = make_buffer(
        experiment_config[c.BUFFER_SETTING], seed,
        experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    # policy_opt = make_optimizer(model.policy_parameters, experiment_config[c.OPTIMIZER_SETTING])
    policy_opt = make_optimizer(
        model.policy_parameters,
        experiment_config[c.OPTIMIZER_SETTING][c.POLICY])
    qs_opt = make_optimizer(model.qs_parameters,
                            experiment_config[c.OPTIMIZER_SETTING][c.QS])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     model, buffer, experiment_config)

    learning_algorithm = GRAC(model=model,
                              policy_opt=policy_opt,
                              qs_opt=qs_opt,
                              buffer=buffer,
                              algo_params=experiment_config,
                              aux_tasks=aux_tasks)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = ACAgent(model=model,
                    learning_algorithm=learning_algorithm,
                    preprocess=experiment_config[c.EVALUATION_PREPROCESSING])
    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = ACAgent(
            model=model,
            learning_algorithm=None,
            preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path,
                                                    algo=c.GRAC,
                                                    cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
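
experiment_config[c.LOAD_MODEL] is the shared resume hook in these train_* functions: when it holds a path rather than False, the learning algorithm's state dict is restored with torch.load before training starts. The fragment below is a hedged illustration that assumes a config dict like the one in the sketch after code example #2; the checkpoint path is a placeholder.

# Hedged fragment: resuming GRAC training from a saved state dict. Assumes
# `experiment_config` and the constants module `c` are already in scope;
# the checkpoint path is a placeholder.
experiment_config[c.LOAD_MODEL] = "results/grac/checkpoint.pt"
train_grac(experiment_config)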
Code example #8
def train_sacx_sac_drq(experiment_config):
    seed = experiment_config[c.SEED]
    save_path = experiment_config.get(c.SAVE_PATH, None)
    buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity())

    set_seed(seed)
    train_env = make_env(experiment_config[c.ENV_SETTING], seed)
    buffer = make_buffer(experiment_config[c.BUFFER_SETTING], seed, experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False))

    intentions = make_model(experiment_config[c.INTENTIONS_SETTING])
    policy_opt = make_optimizer(intentions.policy_parameters, experiment_config[c.OPTIMIZER_SETTING][c.INTENTIONS])
    qs_opt = make_optimizer(intentions.qs_parameters, experiment_config[c.OPTIMIZER_SETTING][c.QS])
    alpha_opt = make_optimizer([intentions.log_alpha], experiment_config[c.OPTIMIZER_SETTING][c.ALPHA])

    aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS],
                                     intentions,
                                     buffer,
                                     experiment_config)

    update_intentions = UpdateSACDrQIntentions(model=intentions,
                                               policy_opt=policy_opt,
                                               qs_opt=qs_opt,
                                               alpha_opt=alpha_opt,
                                               learn_alpha=experiment_config[c.LEARN_ALPHA],
                                               buffer=buffer,
                                               algo_params=experiment_config,
                                               aux_tasks=aux_tasks)

    scheduler = make_model(experiment_config[c.SCHEDULER_SETTING][c.TRAIN])
    update_scheduler = UpdateQScheduler(model=scheduler,
                                        algo_params=experiment_config)

    learning_algorithm = SACX(update_scheduler=update_scheduler,
                              update_intentions=update_intentions,
                              algo_params=experiment_config)

    load_model = experiment_config.get(c.LOAD_MODEL, False)
    if load_model:
        learning_algorithm.load_state_dict(torch.load(load_model))

    agent = SACXAgent(scheduler=scheduler,
                      intentions=intentions,
                      learning_algorithm=learning_algorithm,
                      scheduler_period=experiment_config[c.SCHEDULER_SETTING][c.TRAIN][c.SCHEDULER_PERIOD],
                      preprocess=experiment_config[c.EVALUATION_PREPROCESSING])
    evaluation_env = None
    evaluation_agent = None
    if experiment_config.get(c.EVALUATION_FREQUENCY, 0):
        evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1)
        evaluation_agent = SACXAgent(scheduler=make_model(experiment_config[c.SCHEDULER_SETTING][c.EVALUATION]),
                                     intentions=intentions,
                                     learning_algorithm=None,
                                     scheduler_period=experiment_config[c.SCHEDULER_SETTING][c.EVALUATION][c.SCHEDULER_PERIOD],
                                     preprocess=experiment_config[c.EVALUATION_PREPROCESSING])

    summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.SACX, cfg=experiment_config)
    train(agent=agent,
          evaluation_agent=evaluation_agent,
          train_env=train_env,
          evaluation_env=evaluation_env,
          buffer_preprocess=buffer_preprocessing,
          auxiliary_reward=experiment_config[c.AUXILIARY_REWARDS].reward,
          experiment_settings=experiment_config,
          summary_writer=summary_writer,
          save_path=save_path)
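
train is handed experiment_config[c.AUXILIARY_REWARDS].reward here, and code example #3 falls back to lambda reward, **kwargs: np.array([reward]) when no auxiliary rewards are configured, so the object presumably exposes a reward callable with that signature returning one value per intention. The class below is a hedged sketch of such an object, not taken from the repository; the number of intentions and the shaping bonus are made up.

# Hedged sketch of an auxiliary-reward provider; the class and its shaping term are
# hypothetical, and only the call signature is inferred from code example #3.
import numpy as np

class TwoIntentionRewards:
    def reward(self, reward, **kwargs):
        # One entry per intention: the environment reward and a made-up shaped bonus.
        return np.array([reward, float(reward > 0)])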