Ejemplo n.º 1
0
def experiment(specs):
    """Set up expert-trajectory generation from a previously saved run.

    Reads ``variant.json`` and ``params.pkl`` from
    ``specs['specific_exp_dir']``, overrides several algorithm parameters and
    then hits a deliberate guard assertion. The code after the guard builds
    an ``ExpertReplayBuffer`` and runs ``ExpertTrajGeneratorAlgorithm``.

    Keys read from ``specs``: ``specific_exp_dir``, ``seed``,
    ``num_episodes`` and ``subsampling``.

    Returns:
        1 after ``algorithm.train()`` returns.

    Raises:
        AssertionError: always (unless assertions are disabled), from the
            guard placed before the policy is made deterministic.
    """
    exp_dir = specs['specific_exp_dir']
    with open(path.join(exp_dir, 'variant.json'), 'r') as variant_file:
        variant = json.load(variant_file)

    # Alias of the nested dict: every write below also lands in ``variant``.
    algo_params = variant['algo_params']
    algo_params['do_not_train'] = True
    variant['seed'] = specs['seed']

    saved_params = joblib.load(path.join(exp_dir, 'params.pkl'))
    policy = saved_params['exploration_policy']

    # Intentional stop sign left by the author: forces whoever runs this to
    # decide whether the expert policy should be wrapped deterministically.
    assert False, 'Do you really wanna make it deterministic?'
    policy = MakeDeterministic(policy)

    env_specs = variant['env_specs']
    env, _ = get_env(env_specs)
    training_env, _ = get_env(env_specs)

    # Buffer capacity: total steps over all episodes, divided by the
    # subsampling factor and rounded down.
    steps_to_store = (specs['num_episodes'] *
                      algo_params['max_path_length'] /
                      specs['subsampling'])
    algo_params['replay_buffer_size'] = int(np.floor(steps_to_store))

    # Hack until I figure out how things are gonna be in general then I'll
    # clean it up: older variants may lack these flags, so default them off.
    for flag in ('policy_uses_pixels',
                 'policy_uses_task_params',
                 'concat_task_params_to_policy_obs'):
        if flag not in algo_params:
            algo_params[flag] = False

    replay_buffer = ExpertReplayBuffer(
        algo_params['replay_buffer_size'],
        env,
        subsampling=specs['subsampling'],
        policy_uses_pixels=algo_params['policy_uses_pixels'],
        policy_uses_task_params=algo_params['policy_uses_task_params'],
        concat_task_params_to_policy_obs=algo_params[
            'concat_task_params_to_policy_obs'],
    )
    algo_params['freq_saving'] = 1

    algorithm = ExpertTrajGeneratorAlgorithm(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        replay_buffer=replay_buffer,
        max_num_episodes=specs['num_episodes'],
        **algo_params)

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Ejemplo n.º 2
0
def experiment(variant):
    """Roll out a saved policy and scatter-plot two observation components.

    Resolves and prints the expert demo path from
    ``expert_demos_listing.yaml``, builds the evaluation env, loads the
    policy stored under ``'exploration_policy'`` in
    ``variant['policy_checkpoint']``, samples paths with ``PathSampler`` and
    saves a scatter plot of ``o[0]`` vs ``o[1]`` for every sampled
    observation to ``./figs/<task_name>.pdf``.

    Keys read from ``variant``: ``expert_name``, ``expert_idx``,
    ``env_specs``, ``policy_checkpoint``, ``eval_deterministic``,
    ``num_eval_steps``, ``max_path_length``, ``no_terminal``, ``render``
    and ``render_kwargs``.

    Returns:
        1 after the figure has been written.
    """
    # Function-scope import so this block does not depend on the file header.
    import os

    # safe_load instead of yaml.load: PyYAML >= 6 raises TypeError when
    # yaml.load() is called without a Loader. Assumes the listing holds only
    # plain YAML mappings/lists/scalars.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    print(demos_path)
    # The demo buffer itself is not needed for plotting, so it is no longer
    # loaded here (the original loaded it and never used it).

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    policy = joblib.load(variant['policy_checkpoint'])['exploration_policy']
    if variant['eval_deterministic']:
        policy = MakeDeterministic(policy)
    policy.to(ptu.device)

    eval_sampler = PathSampler(env,
                               policy,
                               variant['num_eval_steps'],
                               variant['max_path_length'],
                               no_terminal=variant['no_terminal'],
                               render=variant['render'],
                               render_kwargs=variant['render_kwargs'])
    test_paths = eval_sampler.obtain_samples()

    # Flatten the observations of every sampled path into one list.
    # (Loop variable is not called ``path`` to avoid shadowing os.path.)
    obs = []
    for test_path in test_paths:
        obs.extend(test_path['observations'])
    x = [o[0] for o in obs]
    y = [o[1] for o in obs]

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(x, y)
    ax.set_xlim(-1.25, 20)
    ax.set_ylim(-1.25, 10)
    ax.set_yticks([0, 5, 10])
    ax.set_xticks([0, 5, 10, 15, 20])

    # Make sure the output directory exists so the rollout is not wasted on
    # a FileNotFoundError at save time.
    os.makedirs('./figs', exist_ok=True)
    fig.savefig('./figs/' + variant['env_specs']['task_name'] + '.pdf',
                bbox_inches='tight')
    # Release the figure; pyplot keeps figures alive until they are closed.
    plt.close(fig)

    return 1
Ejemplo n.º 3
0
def experiment(variant):
    """Evaluate a saved policy and print its average return.

    Builds and seeds the evaluation env, optionally wraps it in ``ScaledEnv``
    using statistics stored with the expert demos, loads the policy stored
    under ``'exploration_policy'`` in ``variant['policy_checkpoint']``,
    samples paths with ``PathSampler`` and prints the value returned by
    ``eval_util.get_average_returns``.

    Keys read from ``variant``: ``env_specs``, ``scale_env_with_demo_stats``,
    ``policy_checkpoint``, ``eval_deterministic``, ``num_eval_steps``,
    ``max_path_length``, ``no_terminal``, ``render``, ``render_kwargs``, and,
    only when scaling is enabled, ``expert_name`` and ``expert_idx``.

    Returns:
        1 after the average return has been printed.
    """
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        # safe_load instead of yaml.load: PyYAML >= 6 raises TypeError when
        # yaml.load() is called without a Loader. Assumes the listing holds
        # only plain YAML mappings/lists/scalars.
        with open('expert_demos_listing.yaml', 'r') as f:
            listings = yaml.safe_load(f)
        expert_demos_path = listings[variant['expert_name']]['file_paths'][
            variant['expert_idx']]
        buffer_save_dict = joblib.load(expert_demos_path)
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    policy = joblib.load(variant['policy_checkpoint'])['exploration_policy']
    if variant['eval_deterministic']:
        policy = MakeDeterministic(policy)
    policy.to(ptu.device)

    eval_sampler = PathSampler(env,
                               policy,
                               variant['num_eval_steps'],
                               variant['max_path_length'],
                               no_terminal=variant['no_terminal'],
                               render=variant['render'],
                               render_kwargs=variant['render_kwargs'])
    test_paths = eval_sampler.obtain_samples()
    average_returns = eval_util.get_average_returns(test_paths)
    print(average_returns)

    return 1
Ejemplo n.º 4
0
def experiment(variant):
    """Train ``AdvBC`` against an expert buffer loaded from a listed run.

    Looks up the expert run in the YAML listing at
    ``EXPERT_LISTING_YAML_PATH``, loads its ``extra_data.pkl``, builds and
    seeds the eval/training envs (optionally wrapped in ``ScaledEnv``),
    derives the flat observation and action sizes, builds the policy and
    discriminator, and runs ``AdvBC.train()``.

    Keys read from ``variant``: ``expert_name``, ``expert_seed_run_idx``,
    ``env_specs``, ``seed``, ``scale_env_with_given_demo_stats``,
    ``algo_params``, ``policy_net_size``, ``policy_num_hidden_layers``,
    ``disc_num_blocks``, ``disc_hid_dim``, ``disc_hid_act``, ``disc_use_bn``,
    ``disc_clamp_magnitude`` and ``wrap_absorbing_state``.

    Returns:
        1 after training returns.

    Raises:
        NotImplementedError: for a ``Dict`` observation space when the policy
            uses pixels, or uses task params without concatenating them.
    """
    # safe_load instead of yaml.load: PyYAML >= 6 raises TypeError when
    # yaml.load() is called without a Loader. Assumes the listing holds only
    # plain YAML mappings/lists/scalars.
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.safe_load(f)
    expert_listing = listings[variant['expert_name']]
    expert_dir = expert_listing['exp_dir']
    specific_run = expert_listing['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning GAIL
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        # Both envs are wrapped with the same demo statistics.
        scaling = dict(
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        env = ScaledEnv(env, **scaling)
        training_env = ScaledEnv(training_env, **scaling)

    # compute obs_dim and action_dim
    obs_space = env.observation_space
    if isinstance(obs_space, Dict):
        algo_params = variant['algo_params']
        if algo_params['policy_uses_pixels']:
            raise NotImplementedError()
        obs_dim = int(np.prod(obs_space.spaces['obs'].shape))
        if algo_params['policy_uses_task_params']:
            if not algo_params['concat_task_params_to_policy_obs']:
                raise NotImplementedError()
            obs_dim += int(
                np.prod(obs_space.spaces['obs_task_params'].shape))
    else:
        obs_dim = int(np.prod(obs_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)

    # Pause so the printed dimensions can be read before training output.
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # set up the discriminator models
    disc_model = StandardAIRLDisc(
        obs_dim + action_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the AdvBC algorithm
    algorithm = AdvBC(env,
                      policy,
                      disc_model,
                      expert_buffer,
                      training_env=training_env,
                      wrap_absorbing=variant['wrap_absorbing_state'],
                      **variant['algo_params'])
    print(algorithm.use_target_disc)
    print(algorithm.soft_target_disc_tau)
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.disc_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Ejemplo n.º 5
0
def experiment(variant):
    """Train ``EBIL`` using a pre-trained energy model loaded from disk.

    Loads the expert state buffer named in ``expert_demos_listing.yaml``,
    builds and seeds the eval/training envs, builds the SAC networks, loads
    the saved energy model for the ``'deen'`` mode and runs ``EBIL.train()``.
    When ``variant['test']`` is truthy it instead prints the mean energy on
    two batches and exits the process with status 1.

    Keys read from ``variant``: ``expert_name``, ``expert_idx``,
    ``state_indices``, ``env_specs``, ``policy_net_size``,
    ``policy_num_hidden_layers``, ``ebil_params``, ``expert_traj_num``,
    ``ebm_sigma``, ``ebm_epoch``, ``test``, ``rescale``, ``sac_params``,
    ``rew_func`` and ``cons``.

    Returns:
        1 after training returns.

    Raises:
        NotImplementedError: if ``variant['ebil_params']['mode']`` is not
            ``'deen'``.
    """
    # safe_load instead of yaml.load: PyYAML >= 6 raises TypeError when
    # yaml.load() is called without a Loader. Assumes the listing holds only
    # plain YAML mappings/lists/scalars.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    print(demos_path)
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] != 'deen':
        raise NotImplementedError

    # The energy model is not constructed here; it is loaded from the log
    # directory of the run named by env, trajectory count and sigma.
    ebm_exp_name = 'ebm-deen-{}-{}-train--sigma-{}'.format(
        variant['env_specs']['env_name'],
        variant['expert_traj_num'],
        variant['ebm_sigma'])
    ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

    load_epoch = variant['ebm_epoch']
    # 'best' selects best.pkl; any other value selects that iteration's file.
    if load_epoch == 'best':
        load_name = 'best.pkl'
    else:
        load_name = 'itr_{}.pkl'.format(load_epoch)
    load_ebm_path = os.path.join(ebm_dir, load_name)

    load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
    ebm_model = load_ebm_pkl['ebm']

    # Test
    if variant['test']:
        batch_data = target_state_buffer / variant['rescale']
        obs = torch.Tensor(batch_data[:1000]).to(ptu.device)
        # Compare the energy on expert states scaled by 200 with the energy
        # on the expert states themselves.
        print("Not expert data", ebm_model(obs * 200).mean().item())
        print("Expert data", ebm_model(obs).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = EBIL(env=env,
                     training_env=training_env,
                     exploration_policy=policy,
                     rew_func=variant['rew_func'],
                     cons=variant['cons'],
                     rescale=variant['rescale'],
                     ebm=ebm_model,
                     policy_trainer=trainer,
                     target_state_buffer=target_state_buffer,
                     state_indices=state_indices,
                     **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Ejemplo n.º 6
0
def experiment(variant):
    """Train ``BC`` with an ``MLPDisc`` critic on an expert replay buffer.

    Loads the expert buffer named in ``expert_demos_listing.yaml``, builds
    and seeds the eval/training envs, optionally wraps them in ``ScaledEnv``
    or ``MinmaxEnv`` using statistics stored with the demos, builds the
    networks and runs ``BC.train()``.

    Keys read from ``variant``: ``expert_name``, ``expert_idx``,
    ``env_specs``, ``scale_env_with_demo_stats``, ``policy_net_size``,
    ``policy_num_hidden_layers``, ``critic_num_blocks``, ``critic_hid_dim``,
    ``critic_hid_act``, ``critic_use_bn``, ``adp_bc_params`` and the optional
    ``minmax_env_with_demo_stats`` (treated as falsy when absent).

    Returns:
        1 after training returns.
    """
    # safe_load instead of yaml.load: PyYAML >= 6 raises TypeError when
    # yaml.load() is called without a Loader. Assumes the listing holds only
    # plain YAML mappings/lists/scalars.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    # The flag is optional. Resolve it once so the env-wrapping branch below
    # cannot raise KeyError when it is missing (the original indexed it
    # unconditionally there while guarding it here).
    use_minmax = bool('minmax_env_with_demo_stats' in variant
                      and variant['minmax_env_with_demo_stats'])
    if use_minmax:
        assert 'norm_train' in buffer_save_dict.keys()
        expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    # Mean/std scaling takes precedence over min-max scaling when both flags
    # are set.
    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif use_minmax:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    # NOTE(review): qf1, qf2 and vf are constructed but not passed to BC
    # below; kept because construction may consume RNG state — confirm.
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the critic model
    # NOTE(review): the first positional argument is policy_net_size, not an
    # observation/action size — confirm this is the intended input width.
    critic_model = MLPDisc(variant['policy_net_size'],
                           num_layer_blocks=variant['critic_num_blocks'],
                           hid_dim=variant['critic_hid_dim'],
                           hid_act=variant['critic_hid_act'],
                           use_bn=variant['critic_use_bn'])

    algorithm = BC(env=env,
                   training_env=training_env,
                   exploration_policy=policy,
                   critic=critic_model,
                   expert_replay_buffer=expert_replay_buffer,
                   **variant['adp_bc_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)

    algorithm.train()

    return 1
Ejemplo n.º 7
0
def experiment(variant):
    """Train ``BC`` on an expert replay buffer.

    Loads the expert buffer named in ``expert_demos_listing.yaml``, builds
    and seeds the eval/training envs, optionally wraps both in ``ScaledEnv``
    using statistics stored with the demos, builds the policy and runs
    ``BC.train()``.

    Keys read from ``variant``: ``expert_name``, ``expert_idx``,
    ``env_specs``, ``scale_env_with_demo_stats``, ``policy_net_size``,
    ``policy_num_hidden_layers`` and ``bc_params``.

    Returns:
        1 after training returns.
    """
    # safe_load instead of yaml.load: PyYAML >= 6 raises TypeError when
    # yaml.load() is called without a Loader. Assumes the listing holds only
    # plain YAML mappings/lists/scalars.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    # Only flat (rank-1, non-Dict) observation and action spaces are handled.
    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = BC(env=env,
                   training_env=training_env,
                   exploration_policy=policy,
                   expert_replay_buffer=expert_replay_buffer,
                   **variant['bc_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Ejemplo n.º 8
0
def experiment(variant):
    """Train an energy model with ``EBMLearn`` on an expert replay buffer.

    Loads the expert buffer named in ``expert_demos_listing.yaml``, builds
    and seeds the eval/training envs, optionally wraps them in ``ScaledEnv``
    or ``MinmaxEnv``, builds the policy and the energy model selected by
    ``variant['ebm_params']['mode']`` (``'deen'`` -> ``MLPEBM``, ``'ae'`` ->
    ``MLPAE``) and runs ``EBMLearn.train()``.

    Keys read from ``variant``: ``expert_name``, ``expert_idx``,
    ``env_specs``, ``scale_env_with_demo_stats``, ``policy_net_size``,
    ``policy_num_hidden_layers``, ``ebm_params``, ``ebm_num_blocks``,
    ``ebm_hid_dim``, ``ebm_hid_act``, ``ebm_use_bn``, the optional
    ``minmax_env_with_demo_stats`` (treated as falsy when absent), and, for
    ``'deen'`` only, ``ebm_clamp_magnitude`` and ``sigma``.

    Returns:
        1 after training returns.

    Raises:
        NotImplementedError: if the mode is neither ``'deen'`` nor ``'ae'``.
    """
    # safe_load instead of yaml.load: PyYAML >= 6 raises TypeError when
    # yaml.load() is called without a Loader. Assumes the listing holds only
    # plain YAML mappings/lists/scalars.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    # The flag is optional. Resolve it once so the env-wrapping branch below
    # cannot raise KeyError when it is missing (the original indexed it
    # unconditionally there while guarding it here).
    use_minmax = bool('minmax_env_with_demo_stats' in variant
                      and variant['minmax_env_with_demo_stats'])
    if use_minmax:
        print('Use minmax envs')
        assert 'norm_train' in buffer_save_dict.keys()
        expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    # Mean/std scaling takes precedence over min-max scaling when both flags
    # are set.
    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif use_minmax:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # State-only models take 2 * obs_dim inputs; otherwise obs + action.
    if variant['ebm_params']['state_only']:
        input_dim = 2 * obs_dim
    else:
        input_dim = obs_dim + action_dim

    # build the energy model
    mode = variant['ebm_params']['mode']
    if mode == 'deen':
        ebm_model = MLPEBM(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        sigma = variant['sigma']
    elif mode == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )
        # The 'ae' mode passes no sigma.
        sigma = None
    else:
        # Previously an unknown mode fell through and failed later with an
        # unbound ``algorithm``; fail here with a clear message instead.
        raise NotImplementedError(
            'Unsupported ebm_params mode: {!r}'.format(mode))

    algorithm = EBMLearn(
        env=env,
        training_env=training_env,
        ebm=ebm_model,
        input_dim=input_dim,
        exploration_policy=policy,
        sigma=sigma,
        expert_replay_buffer=expert_replay_buffer,
        **variant['ebm_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    """Train a state-marginal-matching algorithm on x-y expert data.

    Loads ``'xy_data'`` from ``variant['xy_data_path']``, builds and seeds
    the eval/training envs, derives the flat observation and action sizes,
    builds the SAC networks and a discriminator with input size 2, and runs
    either ``ThreewayStateMarginalMatchingAlg`` or
    ``StateMarginalMatchingAlg`` depending on ``variant['threeway']``.

    Keys read from ``variant``: ``xy_data_path``, ``env_specs``, ``seed``,
    ``scale_env_with_given_demo_stats``, ``algo_params``,
    ``policy_net_size``, ``policy_num_hidden_layers``, ``threeway``,
    ``disc_num_blocks``, ``disc_hid_dim``, ``disc_hid_act``, ``disc_use_bn``,
    ``disc_clamp_magnitude`` and ``policy_params``.

    Returns:
        1 after training returns.

    Raises:
        NotImplementedError: if ``scale_env_with_given_demo_stats`` is set,
            or for a ``Dict`` observation space when the policy uses pixels,
            or uses task params without concatenating them.
    """
    expert_buffer = joblib.load(variant['xy_data_path'])['xy_data']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        # This branch was disabled with a bare ``assert False`` followed by
        # ScaledEnv code reading an ``extra_data`` that is never defined in
        # this function. Fail explicitly, also when asserts are stripped
        # with -O.
        raise NotImplementedError(
            'scale_env_with_given_demo_stats is not supported here: '
            'no demo statistics are loaded by this script')

    # compute obs_dim and action_dim
    obs_space = env.observation_space
    if isinstance(obs_space, Dict):
        algo_params = variant['algo_params']
        if algo_params['policy_uses_pixels']:
            raise NotImplementedError()
        obs_dim = int(np.prod(obs_space.spaces['obs'].shape))
        if algo_params['policy_uses_task_params']:
            if not algo_params['concat_task_params_to_policy_obs']:
                raise NotImplementedError()
            obs_dim += int(
                np.prod(obs_space.spaces['obs_task_params'].shape))
    else:
        obs_dim = int(np.prod(obs_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)

    # Pause so the printed dimensions can be read before training output.
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # set up the discriminator models
    if variant['threeway']:
        disc_model_class = ThreeWayResNetAIRLDisc
    else:
        disc_model_class = ResNetAIRLDisc
    disc_model = disc_model_class(
        2,  # obs is just x-y pos
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the RL algorithm used to train the policy
    policy_optimizer = EntConstSAC(policy=policy,
                                   qf1=qf1,
                                   qf2=qf2,
                                   target_qf1=target_qf1,
                                   target_qf2=target_qf2,
                                   action_dim=action_dim,
                                   **variant['policy_params'])

    # set up the AIRL algorithm
    if variant['threeway']:
        alg_class = ThreewayStateMarginalMatchingAlg
    else:
        alg_class = StateMarginalMatchingAlg
    algorithm = alg_class(env,
                          policy,
                          disc_model,
                          policy_optimizer,
                          expert_buffer,
                          training_env=training_env,
                          **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.policy_optimizer.policy_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf1_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf2_optimizer.defaults['lr'])
    print(algorithm.disc_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Ejemplo n.º 10
0
def experiment(variant):
    """Train ``GAIL`` with the Fetch custom discriminator and a SAC policy.

    Looks up the expert run in the YAML listing at
    ``EXPERT_LISTING_YAML_PATH``, loads its ``ExpertReplayBuffer`` from
    ``extra_data.pkl``, builds and seeds the eval/training envs, builds the
    discriminator (plus an optional target copy), the obs-preprocessed
    Q/V/policy networks and runs ``GAIL.train()``.

    Keys read from ``variant``: ``expert_name``, ``expert_seed_run_idx``,
    ``gail_params``, ``env_specs``, ``seed``, ``disc_clamp_magnitude``,
    ``policy_net_size``, ``num_hidden_layers`` and ``policy_params``.

    Returns:
        1 after training returns.

    Raises:
        NotImplementedError: if ``'num_expert_trajs'`` is present in
            ``variant``, or for a ``Dict`` observation space when the policy
            uses pixels, or uses task params without concatenating them.
    """
    # NEW WAY OF DOING EXPERT REPLAY BUFFERS USING ExpertReplayBuffer class
    # safe_load instead of yaml.load: PyYAML >= 6 raises TypeError when
    # yaml.load() is called without a Loader. Assumes the listing holds only
    # plain YAML mappings/lists/scalars.
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.safe_load(f)
    print(listings.keys())
    expert_listing = listings[variant['expert_name']]
    expert_dir = expert_listing['exp_dir']
    specific_run = expert_listing['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    expert_replay_buffer = joblib.load(file_to_load)['replay_buffer']

    # this script is for the non-meta-learning GAIL
    gail_params = variant['gail_params']
    expert_replay_buffer.policy_uses_task_params = gail_params[
        'policy_uses_task_params']
    expert_replay_buffer.concat_task_params_to_policy_obs = gail_params[
        'concat_task_params_to_policy_obs']

    # Now determine how many trajectories you want to use
    if 'num_expert_trajs' in variant:
        raise NotImplementedError(
            'Not implemented during the transition away from '
            'ExpertReplayBuffer')

    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])

    print(env.observation_space)

    obs_space = env.observation_space
    if isinstance(obs_space, Dict):
        if gail_params['policy_uses_pixels']:
            raise NotImplementedError()
        obs_dim = int(np.prod(obs_space.spaces['obs'].shape))
        if gail_params['policy_uses_task_params']:
            if not gail_params['concat_task_params_to_policy_obs']:
                raise NotImplementedError()
            obs_dim += int(
                np.prod(obs_space.spaces['obs_task_params'].shape))
    else:
        obs_dim = int(np.prod(obs_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # These dimensions are only printed; the networks below use hard-coded
    # sizes.
    print(obs_dim, action_dim)
    sleep(3)

    wrap_absorbing = gail_params['wrap_absorbing']
    if gail_params['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    disc_model = ThirdVersionSingleColorFetchCustomDisc(
        clamp_magnitude=variant['disc_clamp_magnitude'],
        state_only=gail_params['state_only'],
        wrap_absorbing=wrap_absorbing
    )
    if gail_params['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # All networks share one observation processor: the target
    # discriminator's when it exists, else the main discriminator's.
    if target_disc is not None:
        obs_processor = target_disc.obs_processor
    else:
        obs_processor = disc_model.obs_processor
    # One extra input when absorbing-state wrapping is on.
    absorbing_dims = 1 * wrap_absorbing

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    # NOTE(review): 6 + 4 (+ 4) are hard-coded sizes, presumably processed
    # observation and action dims for the Fetch task — confirm.
    qf1 = ObsPreprocessedQFunc(
        obs_processor,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + absorbing_dims,
        output_size=1,
        wrap_absorbing=wrap_absorbing
    )
    qf2 = ObsPreprocessedQFunc(
        obs_processor,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + absorbing_dims,
        output_size=1,
        wrap_absorbing=wrap_absorbing
    )
    vf = ObsPreprocessedVFunc(
        obs_processor,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + absorbing_dims,
        output_size=1,
        wrap_absorbing=wrap_absorbing
    )
    policy = ObsPreprocessedReparamTanhMultivariateGaussianPolicy(
        obs_processor,
        hidden_sizes=hidden_sizes,
        obs_dim=6 + 4,
        action_dim=4,
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=wrap_absorbing,
        **variant['policy_params']
    )
    algorithm = GAIL(
        env,
        policy,
        disc_model,
        policy_optimizer,
        expert_replay_buffer,
        training_env=training_env,
        target_disc=target_disc,
        **gail_params
    )
    print(algorithm.use_target_disc)
    print(algorithm.soft_target_disc_tau)
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.policy_optimizer.policy_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf1_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf2_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.vf_optimizer.defaults['lr'])
    print(algorithm.disc_optimizer.defaults['lr'])

    if wrap_absorbing:
        print('\n\nWRAP ABSORBING\n\n')

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Ejemplo n.º 11
0
def experiment(variant):
    """Train a behavioural-cloning (``BC``) policy from a stored expert buffer.

    Looks up the expert run in the YAML listing at
    ``EXPERT_LISTING_YAML_PATH``, loads its ``extra_data.pkl``, builds the
    evaluation and training environments (optionally wrapped in ``ScaledEnv``
    with the mean/std statistics stored next to the demonstrations), creates a
    ``ReparamTanhMultivariateGaussianPolicy`` and runs ``BC.train()``.

    Args:
        variant: experiment configuration. Keys read here: ``expert_name``,
            ``expert_seed_run_idx``, ``env_specs``,
            ``scale_env_with_given_demo_stats``, ``seed``, ``algo_params``,
            ``policy_net_size``, ``policy_num_hidden_layers``,
            ``policy_uses_bn``, ``policy_uses_layer_norm`` and
            ``wrap_absorbing_state``.

    Returns:
        ``1`` after training has finished.

    Raises:
        NotImplementedError: for a ``Dict`` observation space when the policy
            uses pixels, or uses task params without concatenating them to
            the observation.
        AssertionError: if demo-stat scaling is requested while
            ``env_specs['normalized']`` is truthy.
    """
    # get the expert data
    # safe_load parses plain data only; calling yaml.load() without a Loader
    # is rejected by PyYAML >= 6 and is unsafe on older releases.
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.safe_load(f)
    expert_listing = listings[variant['expert_name']]
    expert_dir = expert_listing['exp_dir']
    specific_run = expert_listing['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        # two independent instances built from the same specs
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        demo_stats = dict(
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        env = ScaledEnv(env, **demo_stats)
        training_env = ScaledEnv(training_env, **demo_stats)

    # seed the env
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)

    # NOTE(review): fixed pause kept from the original script; presumably it
    # leaves time to read the dimensions printed above -- confirm before removing.
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']

    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        batch_norm=variant['policy_uses_bn'],
        layer_norm=variant['policy_uses_layer_norm'])

    # set up the BC algorithm
    algorithm = BC(env,
                   policy,
                   expert_buffer,
                   training_env=training_env,
                   wrap_absorbing=variant['wrap_absorbing_state'],
                   **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Ejemplo n.º 12
0
def experiment(variant):
    """Train (meta-)``NewSoftActorCritic`` on the environment in ``variant``.

    Builds the evaluation/training environments, derives the flat observation
    and action sizes, creates two Q-functions, a value function and a policy
    (either the custom ant models or plain MLPs), then trains
    ``MetaNewSoftActorCritic`` when ``algo_params['meta']`` is set and
    ``NewSoftActorCritic`` otherwise.

    Args:
        variant: experiment configuration. Keys read here: ``env_specs``,
            ``algo_params``, ``net_size``, ``num_hidden_layers``,
            ``use_custom_ant_models`` and, for the custom ant models,
            ``goal_embed_dim``.

    Returns:
        ``1`` after training has finished.

    Raises:
        NotImplementedError: for a ``Dict`` observation space when the policy
            uses pixels.
    """
    env_specs = variant['env_specs']
    algo_params = variant['algo_params']
    is_meta = algo_params['meta']

    if is_meta:
        env, training_env = get_meta_env(env_specs)
        # task-parameter samplers are only needed by the meta algorithm
        train_task_params_sampler, test_task_params_sampler = (
            get_meta_env_params_iters(env_specs))
    elif env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)

    print(env.observation_space)

    # flat observation size seen by the policy
    obs_space = env.observation_space
    if not isinstance(obs_space, Dict):
        obs_dim = int(np.prod(obs_space.shape))
    elif algo_params['policy_uses_pixels']:
        raise NotImplementedError()
    else:
        obs_dim = int(np.prod(obs_space.spaces['obs'].shape))
        if algo_params['policy_uses_task_params']:
            obs_dim += int(
                np.prod(obs_space.spaces['obs_task_params'].shape))
    action_dim = int(np.prod(env.action_space.shape))

    hidden_sizes = [variant['net_size']] * variant['num_hidden_layers']

    if variant['use_custom_ant_models']:
        assert isinstance(obs_space, Dict)
        print('CUSTOM ANT WITH LINEAR EMBEDDING OF THE TARGET POSITION')
        # the custom models take the task params separately from 'obs'
        task_params_dim = int(
            np.prod(obs_space.spaces['obs_task_params'].shape))
        goal_embed_dim = variant['goal_embed_dim']
        raw_obs_dim = int(np.prod(obs_space.spaces['obs'].shape))

        qf1 = AntRandGoalCustomQFunc(
            task_params_dim,
            goal_embed_dim,
            hidden_sizes=hidden_sizes,
            input_size=raw_obs_dim + action_dim,
            output_size=1,
        )
        qf2 = AntRandGoalCustomQFunc(
            task_params_dim,
            goal_embed_dim,
            hidden_sizes=hidden_sizes,
            input_size=raw_obs_dim + action_dim,
            output_size=1,
        )
        vf = AntRandGoalCustomVFunc(
            task_params_dim,
            goal_embed_dim,
            hidden_sizes=hidden_sizes,
            input_size=raw_obs_dim,
            output_size=1,
        )
        policy = AntRandGoalCustomReparamTanhMultivariateGaussianPolicy(
            task_params_dim,
            goal_embed_dim,
            hidden_sizes=hidden_sizes,
            obs_dim=raw_obs_dim,
            action_dim=action_dim,
        )
    else:
        print('Using simple model')
        q_input_size = obs_dim + action_dim
        qf1 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=q_input_size,
            output_size=1,
        )
        qf2 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=q_input_size,
            output_size=1,
        )
        vf = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim,
            output_size=1,
        )
        policy = ReparamTanhMultivariateGaussianPolicy(
            hidden_sizes=hidden_sizes,
            obs_dim=obs_dim,
            action_dim=action_dim,
        )

    # arguments shared by both algorithm variants
    common_kwargs = dict(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
    )
    if is_meta:
        algorithm = MetaNewSoftActorCritic(
            train_task_params_sampler=train_task_params_sampler,
            test_task_params_sampler=test_task_params_sampler,
            true_env_obs_dim=int(
                np.prod(env.observation_space.spaces['obs'].shape)),
            **common_kwargs,
            **algo_params)
    else:
        algorithm = NewSoftActorCritic(**common_kwargs, **algo_params)

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Ejemplo n.º 13
0
def experiment(variant):
    """Train an energy model (``EBMLearn``) on a buffer of target states.

    Loads the demonstration file selected from ``expert_demos_listing.yaml``,
    divides its ``'data'`` entry by ``variant['rescale']``, builds the
    evaluation/training environments and an exploration policy, then trains
    either an ``MLPEBM`` (mode ``'deen'``) or an ``MLPAE`` (mode ``'ae'``).

    Args:
        variant: experiment configuration. Keys read here: ``expert_name``,
            ``expert_idx``, ``rescale``, ``state_indices``, ``env_specs``,
            ``policy_net_size``, ``policy_num_hidden_layers``,
            ``ebm_params`` (including ``mode``), ``ebm_num_blocks``,
            ``ebm_hid_dim``, ``ebm_hid_act``, ``ebm_use_bn`` and, for
            ``'deen'``, ``ebm_clamp_magnitude`` and ``sigma``.

    Returns:
        ``1`` after training has finished.

    Raises:
        NotImplementedError: if ``ebm_params['mode']`` is neither ``'deen'``
            nor ``'ae'``.
    """
    # safe_load parses plain data only; calling yaml.load() without a Loader
    # is rejected by PyYAML >= 6 and is unsafe on older releases.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    # in-place division of the loaded buffer
    target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # the energy model only sees the selected state components
    input_dim = len(state_indices)

    # build the energy model
    mode = variant['ebm_params']['mode']
    if mode == 'deen':
        ebm_model = MLPEBM(input_dim,
                           num_layer_blocks=variant['ebm_num_blocks'],
                           hid_dim=variant['ebm_hid_dim'],
                           hid_act=variant['ebm_hid_act'],
                           use_bn=variant['ebm_use_bn'],
                           clamp_magnitude=variant['ebm_clamp_magnitude'])

        algorithm = EBMLearn(env=env,
                             training_env=training_env,
                             ebm=ebm_model,
                             input_dim=input_dim,
                             exploration_policy=policy,
                             sigma=variant['sigma'],
                             target_state_buffer=target_state_buffer,
                             state_indices=state_indices,
                             **variant['ebm_params'])
    elif mode == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )

        algorithm = EBMLearn(env=env,
                             training_env=training_env,
                             ebm=ebm_model,
                             input_dim=input_dim,
                             exploration_policy=policy,
                             sigma=None,
                             rescale=variant['rescale'],
                             target_state_buffer=target_state_buffer,
                             state_indices=state_indices,
                             **variant['ebm_params'])
    else:
        # fail here instead of with an unbound `algorithm` further down
        raise NotImplementedError(
            'unsupported ebm_params mode: {!r}'.format(mode))

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Ejemplo n.º 14
0
def experiment(variant):
    """Train ``NewSoftActorCritic`` with two-hidden-layer MLP networks.

    Creates separate evaluation and training environments from
    ``variant['env_specs']``, derives the flat observation/action sizes,
    builds two Q-functions, a value function and a tanh-Gaussian policy
    (each with two hidden layers of ``variant['net_size']`` units) and
    trains the algorithm.

    Args:
        variant: experiment configuration. Keys read here: ``env_specs``,
            ``algo_params`` and ``net_size``.

    Returns:
        ``1`` after training has finished.

    Raises:
        NotImplementedError: for a ``Dict`` observation space when the policy
            uses pixels, or uses task params without concatenating them to
            the observation.
    """
    env_specs = variant['env_specs']
    env, _ = get_env(env_specs)
    training_env, _ = get_env(env_specs)

    print(env.observation_space)

    obs_space = env.observation_space
    if isinstance(env.observation_space, Dict):
        # possible keys: pixel, obs, obs_task_params
        algo_params = variant['algo_params']
        if algo_params['policy_uses_pixels']:
            raise NotImplementedError()
        obs_dim = int(np.prod(obs_space.spaces['obs'].shape))

        if algo_params['policy_uses_task_params']:
            if not algo_params['concat_task_params_to_policy_obs']:
                raise NotImplementedError
            obs_dim += int(
                np.prod(obs_space.spaces['obs_task_params'].shape))
    else:
        # single, non-Dict observation space
        obs_dim = int(np.prod(env.observation_space.shape))

    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']

    def build_mlp(input_size):
        """Return a scalar-output MLP with two hidden layers of ``net_size``."""
        return FlattenMlp(
            hidden_sizes=[net_size, net_size],
            input_size=input_size,
            output_size=1,
        )

    qf1 = build_mlp(obs_dim + action_dim)
    qf2 = build_mlp(obs_dim + action_dim)
    vf = build_mlp(obs_dim)
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = NewSoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Ejemplo n.º 15
0
def experiment(specs):
    """Roll out an expert policy and save train/test demonstration buffers.

    Loads a saved policy (or a scripted one), optionally wraps it in
    ``MakeDeterministic``, fills two ``EnvReplayBuffer`` instances via
    ``fill_buffer`` and stores them with ``logger.save_extra_data`` under the
    name ``expert_demos.pkl``.

    Args:
        specs: experiment configuration. Keys read here:
            ``use_scripted_policy``, ``expert_path`` or
            ``scripted_policy_name``, ``use_deterministic_expert``,
            ``env_specs`` (including ``env_seed``), ``max_path_length``,
            ``num_rollouts``, ``subsample_factor``, ``render``,
            ``render_kwargs``, ``check_for_success``, ``no_terminal`` and the
            optional ``wrap_absorbing``.

    Returns:
        ``1`` after the buffers have been saved.

    Raises:
        NotImplementedError: if ``specs['wrap_absorbing']`` is present and
            truthy.
    """
    if not specs['use_scripted_policy']:
        policy_is_scripted = False
        policy = joblib.load(specs['expert_path'])['policy']
    else:
        policy_is_scripted = True
        policy = get_scripted_policy(specs['scripted_policy_name'])

    if specs['use_deterministic_expert']:
        policy = MakeDeterministic(policy)
    if ptu.gpu_enabled():
        policy.to(ptu.device)

    env = get_env(specs['env_specs'])
    env.seed(specs['env_specs']['env_seed'])

    # make the replay buffers
    max_path_length = specs['max_path_length']
    if 'wrap_absorbing' in specs and specs['wrap_absorbing']:
        # There was an initial implementation for this in v1.0
        # in gen_irl_expert_trajs.py
        raise NotImplementedError(
            'wrap_absorbing is not supported by this script')
    # capacity after keeping one transition per `subsample_factor` steps
    _max_buffer_size = int(
        np.ceil(max_path_length * specs['num_rollouts'] /
                float(specs['subsample_factor'])))

    train_buffer = EnvReplayBuffer(_max_buffer_size, env)
    test_buffer = EnvReplayBuffer(_max_buffer_size, env)

    # keyword arguments shared by both fill_buffer calls
    fill_kwargs = dict(
        no_terminal=specs['no_terminal'],
        policy_is_scripted=policy_is_scripted,
        render=specs['render'],
        render_kwargs=specs['render_kwargs'],
        check_for_success=specs['check_for_success'],
        wrap_absorbing=False,
        subsample_factor=specs['subsample_factor'],
    )

    print('\n')
    # fill the train buffer first, then the test buffer
    for buffer in (train_buffer, test_buffer):
        fill_buffer(buffer,
                    env,
                    policy,
                    specs['num_rollouts'],
                    max_path_length,
                    **fill_kwargs)

    # save the replay buffers
    logger.save_extra_data(
        {
            'train': train_buffer,
            'test': test_buffer
        },
        name='expert_demos.pkl')

    return 1
Ejemplo n.º 16
0
def experiment(variant):
    """Load a trained energy model and save a contour plot of its output.

    Locates the most recently modified run directory of the matching EBM
    experiment under ``config.LOCAL_LOG_DIR``, loads the requested checkpoint,
    evaluates the model on a 1000 x 1000 grid of 2-D points (divided by
    ``variant['rescale']``) and writes a filled contour plot to ``./figs/``.

    Args:
        variant: experiment configuration. Keys read here: ``env_specs``
            (``env_name``, ``task_name``, ``env_kwargs``, ``eval_env_seed``,
            ``training_env_seed``), ``ebil_params`` (``mode``),
            ``ebm_sigma``, ``ebm_epoch``, ``expert_traj_num`` (mode ``'ae'``),
            ``test`` and ``rescale``.

    Returns:
        ``1`` after the figure has been saved.

    Raises:
        NotImplementedError: if the mode is neither ``'deen'`` nor ``'ae'``,
            or (mode ``'deen'``) no run id is registered for ``ebm_sigma``.
        FileNotFoundError: if the EBM experiment directory has no entries.
    """
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    # pick the experiment directory and the candidate run directories
    mode = variant['ebil_params']['mode']
    if mode == 'deen':
        ebm_exp_name = 'ebm-deen-smm-implementation-' + env_specs['task_name']
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)
        ebm_id_dirs = os.listdir(ebm_dir)

        ebm_id_dic = ebm_id_dics[env_specs['env_name'] + '_' +
                                 env_specs['task_name']]
        sigma_key = str(variant['ebm_sigma'])
        if sigma_key not in ebm_id_dic['sigma']:
            raise NotImplementedError
        ebm_id = ebm_id_dic['sigma'][sigma_key]

        # prefer runs whose directory name contains the registered id;
        # fall back to every run when none matches
        matching = [d for d in ebm_id_dirs if ebm_id in d]
        if matching:
            ebm_id_dirs = matching
    elif mode == 'ae':
        ebm_exp_name = 'ebm-ae-' + env_specs['env_name'] + '-' + str(
            variant['expert_traj_num']) + '-train'
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)
        ebm_id_dirs = os.listdir(ebm_dir)
    else:
        raise NotImplementedError

    if not ebm_id_dirs:
        raise FileNotFoundError(
            'no EBM runs found in {}'.format(ebm_dir))

    # Choose the most recently modified run as the load ebm dir
    ebm_id_dirs = sorted(
        ebm_id_dirs,
        key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
    load_ebm_dir = os.path.join(ebm_dir, ebm_id_dirs[-1])

    load_epoch = variant['ebm_epoch']
    load_name = 'itr_{}.pkl'.format(load_epoch)
    if load_epoch == 'best':
        load_name = 'best.pkl'
    load_ebm_path = os.path.join(load_ebm_dir, load_name)

    load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
    ebm_model = load_ebm_pkl['ebm']

    print("loaded EBM from {}".format(load_ebm_path))

    # Test
    if variant['test']:
        # NOTE(review): `target_state_buffer` and `acts` are not defined
        # anywhere in this function, so unless they exist as module-level
        # globals this branch fails with NameError -- confirm and either
        # load the data here or drop the branch.
        batch_data = target_state_buffer
        obs = torch.Tensor(batch_data[:100])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    x = np.linspace(-1.25, 20, 1000)
    y = np.linspace(-1.25, 10, 1000)

    # evaluate the model one grid row (fixed y, all x) at a time
    rewards = []
    for y_val in y:
        row = np.column_stack((x, np.full_like(x, y_val)))
        row = row / variant['rescale']
        row = torch.Tensor(row).to(ptu.device)
        rewards.append(ebm_model(row).squeeze().detach().cpu().numpy())

    rewards = np.array(rewards)
    print(rewards.shape)

    fig, ax = plt.subplots(figsize=(6, 6))
    h = plt.contourf(rewards, cmap=plt.cm.hot_r)
    plt.colorbar(h)
    # contourf is drawn in grid-index coordinates; the ticks are placed at
    # grid indices and labelled with the corresponding x / y values
    ax.set_xticks([58, 293, 528, 764, 999])
    ax.set_xticklabels(['0', '5', '10', '15', '20'])
    ax.set_yticks([111, 555, 999])
    ax.set_yticklabels(['0', '5', '10'])

    # savefig does not create missing directories
    os.makedirs('./figs/', exist_ok=True)
    plt.savefig('./figs/' + variant['env_specs']['env_name'] + '_' +
                variant['env_specs']['task_name'] + '_' +
                str(variant['ebm_sigma']) + '_' + str(variant['ebm_epoch']) +
                '.pdf',
                bbox_inches='tight')

    return 1
Ejemplo n.º 17
0
from os import path as osp
import joblib
import yaml
import matplotlib
matplotlib.use('Agg')  # select the backend before pyplot is imported
import matplotlib.pyplot as plt

import rlkit.torch.pytorch_util as ptu
from rlkit.envs import get_env
from rlkit.core import eval_util
from rlkit.samplers import PathSampler
from rlkit.torch.sac.policies import MakeDeterministic
from rlkit.envs.wrappers import ScaledEnv


# Build and seed the evaluation environment.
env_specs = {'env_name': 'halfcheetah', 'env_kwargs': {}, 'eval_env_seed': 3562}
env = get_env(env_specs)
env.seed(env_specs['eval_env_seed'])

# Only the YAML parsing needs the file handle. safe_load parses plain data;
# calling yaml.load() without a Loader is rejected by PyYAML >= 6.
with open('expert_demos_listing.yaml', 'r') as f:
    listings = yaml.safe_load(f)

# Wrap the env with the mean/std statistics stored with the demonstrations.
expert_demos_path = listings['norm_halfcheetah_32_demos_sub_20']['file_paths'][0]
buffer_save_dict = joblib.load(expert_demos_path)
env = ScaledEnv(
    env,
    obs_mean=buffer_save_dict['obs_mean'],
    obs_std=buffer_save_dict['obs_std'],
    acts_mean=buffer_save_dict['acts_mean'],
    acts_std=buffer_save_dict['acts_std'],
)

# Load the saved exploration policy and wrap it in MakeDeterministic.
bc_policy = joblib.load('/scratch/hdd001/home/kamyar/output/paper-version-hc-bc/paper_version_hc_bc_2019_05_19_00_32_05_0000--s-0/params.pkl')['exploration_policy']
bc_policy = MakeDeterministic(bc_policy)
Ejemplo n.º 18
0
def experiment(variant):
    """Train ``SoftActorCritic`` through ``TorchRLAlgorithm``.

    Creates separately seeded evaluation and training environments, checks
    that observation and action spaces are flat (1-D, non-``Dict``), builds
    two Q-functions, a value function and a tanh-Gaussian policy, and runs
    training.

    Args:
        variant: experiment configuration. Keys read here: ``env_specs``
            (``env_name``, ``env_kwargs``, ``eval_env_seed``,
            ``training_env_seed``), ``net_size``, ``num_hidden_layers``,
            ``sac_params`` and ``rl_alg_params``.

    Returns:
        ``1`` after training has finished.
    """
    env_specs = variant['env_specs']

    # evaluation and training envs are separate instances with their own seeds
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space, act_space = env.observation_space, env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim, action_dim = obs_space.shape[0], act_space.shape[0]

    net_size = variant['net_size']
    num_hidden = variant['num_hidden_layers']

    def build_mlp(input_size):
        """Return a scalar-output MLP with ``num_hidden`` layers of ``net_size``."""
        return FlattenMlp(
            hidden_sizes=num_hidden * [net_size],
            input_size=input_size,
            output_size=1,
        )

    qf1 = build_mlp(obs_dim + action_dim)
    qf2 = build_mlp(obs_dim + action_dim)
    vf = build_mlp(obs_dim)
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    trainer = SoftActorCritic(
        policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['sac_params'])
    algorithm = TorchRLAlgorithm(
        trainer=trainer,
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        **variant['rl_alg_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Ejemplo n.º 19
0
def experiment(variant):
    """Train ``AdvSMM`` with a ``SoftActorCritic`` policy trainer.

    Loads the target-state buffer selected from ``expert_demos_listing.yaml``,
    builds separately seeded evaluation/training environments, the SAC
    networks and a discriminator over the selected state indices, then runs
    training.

    Args:
        variant: experiment configuration. Keys read here: ``expert_name``,
            ``expert_idx``, ``state_indices``, ``env_specs``,
            ``policy_net_size``, ``policy_num_hidden_layers``,
            ``disc_model_type``, ``disc_num_blocks``, ``disc_hid_dim``,
            ``disc_hid_act``, ``disc_use_bn``, ``disc_clamp_magnitude``,
            ``sac_params`` and ``adv_smm_params``.

    Returns:
        ``1`` after training has finished.
    """
    # safe_load parses plain data only; calling yaml.load() without a Loader
    # is rejected by PyYAML >= 6 and is unsafe on older releases.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model; both classes take the same arguments,
    # so only the class depends on the configured type
    if variant['disc_model_type'] == 'resnet_disc':
        disc_cls = ResNetAIRLDisc
    else:
        disc_cls = MLPDisc
    disc_model = disc_cls(
        len(state_indices),
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvSMM(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       target_state_buffer=target_state_buffer,
                       state_indices=state_indices,
                       **variant['adv_smm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Ejemplo n.º 20
0
def experiment(variant):
    """Train ``EBIL`` with a pre-trained energy model, optionally after BC.

    Loads the expert buffer selected from ``expert_demos_listing.yaml``,
    builds the evaluation/training environments (optionally wrapped in
    ``ScaledEnv`` or ``MinmaxEnv`` using statistics stored with the
    demonstrations), the SAC networks, and loads the energy model from the
    most recently modified run directory of the matching EBM experiment under
    ``config.LOCAL_LOG_DIR``. If ``variant['pretrain']`` is set, ``BC`` is
    trained first; ``EBIL`` is then trained.

    Args:
        variant: experiment configuration. Keys read here: ``expert_name``,
            ``expert_idx``, ``env_specs``, ``scale_env_with_demo_stats``,
            the optional ``minmax_env_with_demo_stats``, ``policy_net_size``,
            ``policy_num_hidden_layers``, ``ebil_params`` (including
            ``mode``), ``expert_traj_num``, ``ebm_sigma``, ``ebm_epoch``,
            ``test``, ``sac_params``, ``bc_params``, ``rew_func``, ``cons``
            and ``pretrain``.

    Returns:
        ``1`` after training has finished. When ``variant['test']`` is set
        the process exits via ``exit(1)`` instead.

    Raises:
        NotImplementedError: if ``ebil_params['mode']`` is neither ``'deen'``
            nor ``'ae'``.
        FileNotFoundError: if the EBM experiment directory has no entries.
    """
    # safe_load parses plain data only; calling yaml.load() without a Loader
    # is rejected by PyYAML >= 6 and is unsafe on older releases.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    # The key is optional: read it once with a default so the env-wrapping
    # branch below cannot raise KeyError when it is absent.
    use_minmax = variant.get('minmax_env_with_demo_stats', False)
    if use_minmax:
        print('Use minmax envs')
        assert 'norm_train' in buffer_save_dict.keys()
        expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        scale_stats = dict(
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        env = ScaledEnv(env, **scale_stats)
        training_env = ScaledEnv(training_env, **scale_stats)
    elif use_minmax:
        minmax_stats = dict(
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        env = MinmaxEnv(env, **minmax_stats)
        training_env = MinmaxEnv(training_env, **minmax_stats)

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # load the energy model; the two modes differ only in the experiment
    # name prefix ('ebm-deen-' vs 'ebm-ae-')
    mode = variant['ebil_params']['mode']
    if mode not in ('deen', 'ae'):
        raise NotImplementedError
    ebm_exp_name = ('ebm-' + mode + '-' + variant['env_specs']['env_name'] +
                    '-' + str(variant['expert_traj_num']) +
                    '-train--sigma-' + str(variant['ebm_sigma']))
    ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

    ebm_id_dirs = os.listdir(ebm_dir)
    if not ebm_id_dirs:
        raise FileNotFoundError(
            'no EBM runs found in {}'.format(ebm_dir))
    # Choose the most recently modified run as the load ebm dir
    ebm_id_dirs = sorted(
        ebm_id_dirs,
        key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
    load_ebm_dir = os.path.join(ebm_dir, ebm_id_dirs[-1])

    load_epoch = variant['ebm_epoch']
    load_name = 'itr_{}.pkl'.format(load_epoch)
    if load_epoch == 'best':
        load_name = 'best.pkl'
    load_ebm_path = os.path.join(load_ebm_dir, load_name)

    load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
    ebm_model = load_ebm_pkl['ebm']

    print("loaded EBM from {}".format(load_ebm_path))

    # Test: compare the model output on an expert batch with the same batch
    # multiplied by 200, then exit
    if variant['test']:
        batch_data = expert_replay_buffer.random_batch(
            100, keys=['observations', 'actions'])
        print('ebm_obs: ', np.mean(batch_data['observations'], axis=0))
        obs = torch.Tensor(batch_data['observations'])
        acts = torch.Tensor(batch_data['actions'])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm_pretrain = BC(env=env,
                            training_env=training_env,
                            exploration_policy=policy,
                            expert_replay_buffer=expert_replay_buffer,
                            **variant['bc_params'])
    algorithm = EBIL(env=env,
                     training_env=training_env,
                     exploration_policy=policy,
                     rew_func=variant['rew_func'],
                     cons=variant['cons'],
                     ebm=ebm_model,
                     policy_trainer=trainer,
                     expert_replay_buffer=expert_replay_buffer,
                     **variant['ebil_params'])

    target_device = ptu.device if ptu.gpu_enabled() else 'cpu'
    algorithm_pretrain.to(target_device)
    algorithm.to(target_device)

    # both algorithms share `policy`, so BC pre-training updates the policy
    # object that EBIL trains afterwards
    if variant['pretrain']:
        algorithm_pretrain.train()

    algorithm.train()

    return 1
Ejemplo n.º 21
0
def experiment(variant):
    """Build and run an adversarial IRL (AdvIRL) training job from ``variant``.

    Steps performed:
      1. Look up the expert demonstration file in ``expert_demos_listing.yaml``
         (resolved relative to the current working directory) using
         ``variant['expert_name']`` and ``variant['expert_idx']``, and load it
         with ``joblib``.
      2. Create and seed separate evaluation and training environments from
         ``variant['env_specs']``; optionally wrap both in ``ScaledEnv`` using
         the statistics stored alongside the demonstrations.
      3. Build the SAC networks (two Q-functions, a value function, a policy)
         and the discriminator.
      4. Assemble ``SoftActorCritic`` + ``AdvIRL`` and call ``train()``.

    Args:
        variant: Experiment configuration dict. Keys read here:
            ``expert_name``, ``expert_idx``, ``env_specs`` (with ``env_name``,
            ``env_kwargs``, ``eval_env_seed``, ``training_env_seed``),
            ``scale_env_with_demo_stats``, ``policy_net_size``,
            ``policy_num_hidden_layers``, ``disc_num_blocks``,
            ``disc_hid_dim``, ``disc_hid_act``, ``disc_use_bn``,
            ``disc_clamp_magnitude``, ``sac_params`` and ``adv_irl_params``
            (which must contain ``state_only``).

    Returns:
        The constant ``1`` once training has finished.

    Raises:
        AssertionError: If the observation space is a ``Dict`` space or the
            observation/action spaces are not one-dimensional.
    """
    # safe_load: calling yaml.load() without an explicit Loader raises a
    # TypeError on PyYAML >= 6 and is unsafe on older versions. The listing
    # only needs plain mappings/lists/strings, which safe_load handles.
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f)
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only point the listing at trusted demonstration files.
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    # Separate env instances (with independent seeds) for evaluation and
    # for collecting training rollouts.
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        # Both envs are wrapped with the same statistics saved with the demos.
        scaling_stats = {
            key: buffer_save_dict[key]
            for key in ('obs_mean', 'obs_std', 'acts_mean', 'acts_std')
        }
        env = ScaledEnv(env, **scaling_stats)
        training_env = ScaledEnv(training_env, **scaling_stats)

    obs_space = env.observation_space
    act_space = env.action_space
    # Only flat (1-D) observation and action spaces are supported here.
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    # The input width depends on the mode: 2 * obs_dim when state-only
    # (presumably an (obs, next_obs) pair -- confirm against AdvIRL),
    # otherwise obs_dim + action_dim.
    if variant['adv_irl_params']['state_only']:
        disc_input_dim = 2 * obs_dim
    else:
        disc_input_dim = obs_dim + action_dim
    disc_model = MLPDisc(
        disc_input_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvIRL(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       expert_replay_buffer=expert_replay_buffer,
                       **variant['adv_irl_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1