def experiment(variant):
    task_mode = variant['task_mode']  # 'train' or 'test'; anything else raises NotImplementedError below
    task_idx = variant['task_idx']

    if task_mode == 'train':
        task_sampler = WalkerTrainParamsSampler()
    elif task_mode == 'test':
        task_sampler = WalkerTestParamsSampler()
    else:
        raise NotImplementedError()
    task_params = task_sampler.get_task(task_idx)
    obs_task_params = task_sampler.get_obs_task_params(task_params)
    env = SingleTaskWalkerEnv(task_params, obs_task_params)
    training_env = SingleTaskWalkerEnv(task_params, obs_task_params)

    print(env.observation_space)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden_sizes = [net_size] * variant['num_hidden_layers']
    print('Using simple model')
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = NewSoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']
    )
    
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
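For reference, a minimal variant dict that the example above expects might look like the sketch below; the key names are exactly the ones read in the code, while the concrete values and the contents of algo_params are purely illustrative assumptions rather than settings from the original configs.

# Hypothetical config sketch for the single-task Walker experiment above.
# Values are illustrative; 'algo_params' must match NewSoftActorCritic's signature.
variant = {
    'task_mode': 'train',      # 'train' or 'test' (see the sampler branch above)
    'task_idx': 0,
    'net_size': 256,           # hypothetical hidden width
    'num_hidden_layers': 2,    # hypothetical depth
    'algo_params': {},         # forwarded verbatim via **variant['algo_params']
}
experiment(variant)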
Code Example #2
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert meta-train and meta-test context/test buffers
    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # load the expert
    expert_policy = joblib.load(variant['expert_policy'])['algorithm']
    expert_policy.replay_buffer = None

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
    print(meta_train_env)
    print(meta_test_env)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']

    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # Make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'])

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = MetaDagger(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        expert_policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params'])

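    # Pre-fill the algorithm's replay buffer with the expert context data: for each
    # task, copy the first erb._size transitions (observations, next observations,
    # actions, rewards, terminals, absorbing flags) from the expert buffer and set
    # the size/top pointers to match.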
    for task_id in train_context_buffer.task_replay_buffers:
        erb = train_context_buffer.task_replay_buffers[task_id]
        rb = algorithm.replay_buffer.task_replay_buffers[task_id]
        erb_size = erb._size
        print(erb_size)
        for k in erb._observations:
            rb._observations[k][:erb_size] = erb._observations[k][:erb_size]
            rb._next_obs[k][:erb_size] = erb._next_obs[k][:erb_size]
        rb._actions[:erb_size] = erb._actions[:erb_size]
        rb._rewards[:erb_size] = erb._rewards[:erb_size]
        rb._terminals[:erb_size] = erb._terminals[:erb_size]
        rb._absorbing[:erb_size] = erb._absorbing[:erb_size]
        rb._size = erb_size
        rb._top = erb_size

    # print('\n\n')
    # for task_id in algorithm.replay_buffer.task_replay_buffers:
    #     rb = algorithm.replay_buffer.task_replay_buffers[task_id]
    #     print(rb._size)
    #     print(rb._top)
    #     print(rb._max_replay_buffer_size)

    if ptu.gpu_enabled():
        expert_policy.cuda()
        algorithm.cuda()
    algorithm.train()

    return 1
Code Example #3
File: old_meta_sac.py  Project: yifan-you-37/rl_swiss
def experiment(variant):
    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env_specs_vg = VariantGenerator()
    env_spec_constants = {}
    env_spec_ranges = {}
    for k, v in env_specs.items():
        if isinstance(v, list):
            env_specs_vg.add(k, v)
            env_spec_ranges[k] = v
        else:
            env_spec_constants[k] = v

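    # Hypothetical illustration: given env_specs = {'gravity': [9.0, 11.0], 'frame_skip': 5},
    # the list-valued 'gravity' would be swept by the VariantGenerator while the scalar
    # 'frame_skip' would be copied into every resulting spec, so env_specs_list ends up
    # with one complete spec per swept combination.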
    env_specs_list = []
    for es in env_specs_vg.variants():
        del es['_hidden_keys']
        es.update(env_spec_constants)
        env_specs_list.append(es)

    env_sampler = EnvSampler(env_specs_list)

    # make the normalizer function for the env_params
    mean = []
    half_diff = []
    for k in sorted(env_spec_ranges.keys()):
        r = env_spec_ranges[k]
        if len(r) == 1:
            mean.append(0)
            half_diff.append(r[0])
        else:
            mean.append((r[0] + r[1]) / 2.0)
            half_diff.append((r[1] - r[0]) / 2.0)
    mean = np.array(mean)
    half_diff = np.array(half_diff)

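    # The normalizer below maps each swept parameter from its range [lo, hi] onto
    # [-1, 1]; parameters whose recorded range has a single entry are simply
    # divided by that value.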
    def env_params_normalizer(params):
        return (params - mean) / half_diff

    variant['algo_params']['env_params_normalizer'] = env_params_normalizer

    # set up similar to non-meta version
    sample_env, _ = env_sampler()
    if variant['algo_params']['concat_env_params_to_obs']:
        meta_params_dim = sample_env.env_meta_params.shape[0]
    else:
        meta_params_dim = 0
    obs_dim = int(np.prod(sample_env.observation_space.shape))
    action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=1,
    )
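    # NOTE: 'exp_specs' below is presumably a module-level global in the original run
    # script (typically the same loaded config dict as 'variant'); it is not defined
    # in this function.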
    if exp_specs['use_new_sac']:
        qf1 = FlattenMlp(
            hidden_sizes=[net_size, net_size],
            input_size=obs_dim + action_dim + meta_params_dim,
            output_size=1,
        )
        qf2 = FlattenMlp(
            hidden_sizes=[net_size, net_size],
            input_size=obs_dim + action_dim + meta_params_dim,
            output_size=1,
        )
        policy = ReparamTanhMultivariateGaussianPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim + meta_params_dim,
            action_dim=action_dim,
        )
        algorithm = NewMetaSoftActorCritic(env_sampler=env_sampler,
                                           policy=policy,
                                           qf1=qf1,
                                           qf2=qf2,
                                           vf=vf,
                                           **variant['algo_params'])
    else:
        policy = TanhGaussianPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim + meta_params_dim,
            action_dim=action_dim,
        )
        qf = FlattenMlp(
            hidden_sizes=[net_size, net_size],
            input_size=obs_dim + action_dim + meta_params_dim,
            output_size=1,
        )
        algorithm = MetaSoftActorCritic(env_sampler=env_sampler,
                                        policy=policy,
                                        qf=qf,
                                        vf=vf,
                                        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code Example #4
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = BC(env=env,
                   training_env=training_env,
                   exploration_policy=policy,
                   expert_replay_buffer=expert_replay_buffer,
                   **variant['bc_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Code Example #5
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert meta-train and meta-test context/test buffers
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
    print(meta_train_env)
    print(meta_test_env)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']

    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # Make the encoder
    encoder = TimestepBasedEncoder(
        2*obs_dim + action_dim, #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg']
    )
    # ---------------
    # encoder = WeightShareTimestepBasedEncoder(
    #     obs_dim,
    #     action_dim,
    #     64,
    #     variant['algo_params']['r_dim'],
    #     variant['algo_params']['z_dim'],
    #     variant['algo_params']['enc_hid_dim'],
    #     variant['algo_params']['r2z_hid_dim'],
    #     variant['algo_params']['num_enc_layer_blocks'],
    #     hid_act='relu',
    #     use_bn=True,
    #     within_traj_agg=variant['algo_params']['within_traj_agg']
    # )
    # ---------------
    # traj_enc = ConvTrajEncoder(
    #     variant['algo_params']['np_params']['traj_enc_params']['num_conv_layers'],
    #     # obs_dim + action_dim,
    #     obs_dim + action_dim + obs_dim,
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['traj_enc_params']['kernel'],
    #     variant['algo_params']['np_params']['traj_enc_params']['stride'],
    # )
    # Dc2R_map = Dc2RMap(
    #     variant['algo_params']['np_params']['Dc2r_params']['agg_type'],
    #     traj_enc,
    #     state_only=False
    # )
    # r2z_map = R2ZMap(
    #     variant['algo_params']['np_params']['r2z_params']['num_layers'],
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['r2z_params']['hid_dim'],
    #     variant['algo_params']['z_dim']
    # )
    # encoder = NPEncoder(
    #     Dc2R_map,
    #     r2z_map,
    # )

    
    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessBC(
        meta_test_env, # env is the test env, training_env is the training env (following rlkit original setup)
        
        policy,

        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,

        encoder,

        training_env=meta_train_env, # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,

        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code Example #6
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            print('Use minmax envs')
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))
    
    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )


    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    
    input_dim = obs_dim + action_dim if not variant['ebm_params']['state_only'] else 2*obs_dim

    # build the energy model
    if (variant['ebm_params']['mode']) == 'deen':
        ebm_model = MLPEBM(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude']
        )

        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=variant['sigma'],

            expert_replay_buffer=expert_replay_buffer,
            **variant['ebm_params']
        )
    
    # build the energy model
    elif (variant['ebm_params']['mode']) == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )

        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=None,

            expert_replay_buffer=expert_replay_buffer,
            **variant['ebm_params']
        )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Code Example #7
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            print('Use minmax envs')
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-' + variant['env_specs'][
            'env_name'] + '-' + str(
                variant['expert_traj_num']) + '-train--sigma-' + str(
                    variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))

        load_ebm_dir = os.path.join(
            ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    elif variant['ebil_params']['mode'] == 'ae':
        ebm_exp_name = 'ebm-ae-' + variant['env_specs']['env_name'] + '-' + str(
            variant['expert_traj_num']) + '-train--sigma-' + str(
                variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))

        load_ebm_dir = os.path.join(
            ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    else:
        raise NotImplementedError

    print("loaded EBM from {}".format(load_ebm_path))

    # Test
    if variant['test']:
        batch_data = expert_replay_buffer.random_batch(
            100, keys=['observations', 'actions'])
        print('ebm_obs: ', np.mean(batch_data['observations'], axis=0))
        obs = torch.Tensor(batch_data['observations'])
        acts = torch.Tensor(batch_data['actions'])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
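        # Sanity check (interpretation): scaling the expert batch by 200 yields clearly
        # off-distribution inputs, which should score very differently (presumably
        # higher energy) than the unscaled expert data printed below.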
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm_pretrain = BC(env=env,
                            training_env=training_env,
                            exploration_policy=policy,
                            expert_replay_buffer=expert_replay_buffer,
                            **variant['bc_params'])
    algorithm = EBIL(env=env,
                     training_env=training_env,
                     exploration_policy=policy,
                     rew_func=variant['rew_func'],
                     cons=variant['cons'],
                     ebm=ebm_model,
                     policy_trainer=trainer,
                     expert_replay_buffer=expert_replay_buffer,
                     **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm_pretrain.to(ptu.device)
        algorithm.to(ptu.device)
    else:
        algorithm_pretrain.to('cpu')
        algorithm.to('cpu')
    if variant['pretrain']:
        algorithm_pretrain.train()

    algorithm.train()

    return 1
Code Example #8
def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    net_size = variant['net_size']
    num_hidden = variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params']
    )
    algorithm = TorchRLAlgorithm(
        trainer=trainer,
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        **variant['rl_alg_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
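Several of these examples read the same env_specs keys (env_name, env_kwargs, eval_env_seed, training_env_seed). A hypothetical sketch of such a spec follows; the key names come from the reads in the code, while every value is invented for illustration.

# Hypothetical env_specs sketch; values are illustrative only.
env_specs = {
    'env_name': 'halfcheetah',    # illustrative only
    'env_kwargs': {},
    'eval_env_seed': 78236,       # illustrative only
    'training_env_seed': 24495,   # illustrative only
}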
Code Example #9
def experiment(variant):
    # get the expert data
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # seed the env
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)

    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']

    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        batch_norm=variant['policy_uses_bn'],
        layer_norm=variant['policy_uses_layer_norm'])
    # policy = MlpPolicy(
    #     hidden_sizes=hidden_sizes,
    #     obs_dim=obs_dim,
    #     action_dim=action_dim,
    #     batch_norm=variant['policy_uses_bn'],
    #     layer_norm=variant['policy_uses_layer_norm']
    # )

    # set up the AIRL algorithm
    algorithm = BC(env,
                   policy,
                   expert_buffer,
                   training_env=training_env,
                   wrap_absorbing=variant['wrap_absorbing_state'],
                   **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code Example #10
def experiment(variant):
    env_specs = variant['env_specs']
    if variant['algo_params']['meta']:
        env, training_env = get_meta_env(env_specs)
    else:
        if env_specs['train_test_env']:
            env, training_env = get_env(env_specs)
        else:
            env, _ = get_env(env_specs)
            training_env, _ = get_env(env_specs)

    if variant['algo_params']['meta']:
        train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
            env_specs)

    print(env.observation_space)

    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                obs_dim += int(
                    np.prod(
                        env.observation_space.spaces['obs_task_params'].shape))
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden_sizes = [net_size] * variant['num_hidden_layers']
    if variant['use_custom_ant_models']:
        assert isinstance(env.observation_space, Dict)
        print('CUSTOM ANT WITH LINEAR EMBEDDING OF THE TARGET POSITION')
        qf1 = AntRandGoalCustomQFunc(
            int(np.prod(
                env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape))
            + action_dim,
            output_size=1,
        )
        qf2 = AntRandGoalCustomQFunc(
            int(np.prod(
                env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape))
            + action_dim,
            output_size=1,
        )
        vf = AntRandGoalCustomVFunc(
            int(np.prod(
                env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape)),
            output_size=1,
        )
        policy = AntRandGoalCustomReparamTanhMultivariateGaussianPolicy(
            int(np.prod(
                env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            obs_dim=int(np.prod(env.observation_space.spaces['obs'].shape)),
            action_dim=action_dim,
        )

        # CUSTOM ANT WITH GATING ACTIVATIONS OF EACH LAYER
        # qf1 = AntCustomGatingQFuncV1()
        # qf2 = AntCustomGatingQFuncV1()
        # vf = AntCustomGatingVFuncV1()
        # policy = AntCustomGatingV1ReparamTanhMultivariateGaussianPolicy()
    else:
        print('Using simple model')
        qf1 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        qf2 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        vf = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim,
            output_size=1,
        )
        policy = ReparamTanhMultivariateGaussianPolicy(
            hidden_sizes=hidden_sizes,
            obs_dim=obs_dim,
            action_dim=action_dim,
        )

    if variant['algo_params']['meta']:
        algorithm = MetaNewSoftActorCritic(
            env=env,
            training_env=training_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            train_task_params_sampler=train_task_params_sampler,
            test_task_params_sampler=test_task_params_sampler,
            true_env_obs_dim=int(
                np.prod(env.observation_space.spaces['obs'].shape)),
            **variant['algo_params'])
    else:
        algorithm = NewSoftActorCritic(env=env,
                                       training_env=training_env,
                                       policy=policy,
                                       qf1=qf1,
                                       qf2=qf2,
                                       vf=vf,
                                       **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code Example #11
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

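    # In this variant the EBM is fit only over the selected state dimensions
    # (state_indices) rather than over full (s, a) pairs as in the demo-based
    # example earlier.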
    input_dim = len(state_indices)

    # build the energy model
    if (variant['ebm_params']['mode']) == 'deen':
        ebm_model = MLPEBM(input_dim,
                           num_layer_blocks=variant['ebm_num_blocks'],
                           hid_dim=variant['ebm_hid_dim'],
                           hid_act=variant['ebm_hid_act'],
                           use_bn=variant['ebm_use_bn'],
                           clamp_magnitude=variant['ebm_clamp_magnitude'])

        algorithm = EBMLearn(env=env,
                             training_env=training_env,
                             ebm=ebm_model,
                             input_dim=input_dim,
                             exploration_policy=policy,
                             sigma=variant['sigma'],
                             target_state_buffer=target_state_buffer,
                             state_indices=state_indices,
                             **variant['ebm_params'])

    # build the energy model
    elif (variant['ebm_params']['mode']) == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )

        algorithm = EBMLearn(env=env,
                             training_env=training_env,
                             ebm=ebm_model,
                             input_dim=input_dim,
                             exploration_policy=policy,
                             sigma=None,
                             rescale=variant['rescale'],
                             target_state_buffer=target_state_buffer,
                             state_indices=state_indices,
                             **variant['ebm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Code Example #12
File: meta_sac.py  Project: yifan-you-37/rl_swiss
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # ---------
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))

    # env = ReacherEnv()
    # training_env = ReacherEnv()

    # env = NormalizedBoxEnv(ReacherEnv())
    # training_env = NormalizedBoxEnv(ReacherEnv())

    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env, _ = get_env(env_specs)
    training_env, _ = get_env(env_specs)

    print(env.observation_space)

    obs_space = env.observation_space
    if isinstance(env.observation_space, Dict):
        # possible keys: pixel, obs, obs_task_params
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(obs_space.spaces['obs'].shape))
        else:
            raise NotImplementedError()

        if variant['algo_params']['policy_uses_task_params']:
            if variant['algo_params']['concat_task_params_to_policy_obs']:
                obs_dim += int(
                    np.prod(obs_space.spaces['obs_task_params'].shape))
            else:
                raise NotImplementedError
    else:
        # OpenAI Gym Env or DMCS Env with only one obs
        obs_dim = int(np.prod(env.observation_space.shape))

    action_dim = int(np.prod(env.action_space.shape))

    # if variant['reload_policy_from'] != '':
    # params = joblib.load(variant['reload_policy_from'])
    # qf1, qf2, vf, policy = params['qf1'], params['qf2'], params['vf'], params['policy']
    # else:
    net_size = variant['net_size']
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = NewSoftActorCritic(env=env,
                                   training_env=training_env,
                                   policy=policy,
                                   qf1=qf1,
                                   qf2=qf2,
                                   vf=vf,
                                   **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code Example #13
def experiment(variant):
    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'],
        state_only=False)

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = PEARL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        qf1,
        qf2,
        vf,
        encoder,
        # z_dim,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code Example #14
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert meta-train and meta-test context/test buffers
    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
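    # Disc input is presumably the concatenation (s, a, s', z), i.e.
    # 2*obs_dim + action_dim + z_dim, or (s, s', z) when state_only.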
    disc_model = StandardMetaDisc(
        (2 * obs_dim + action_dim + variant['algo_params']['z_dim']
         if not variant['algo_params']['state_only']
         else 2 * obs_dim + variant['algo_params']['z_dim']),
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)
    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )
    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params'])

    # make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = MetaFAIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code Example #15
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    print(demos_path)
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    # target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-' + variant['env_specs'][
            'env_name'] + '-' + str(
                variant['expert_traj_num']) + '-train--sigma-' + str(
                    variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        load_ebm_dir = ebm_dir
        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    else:
        raise NotImplementedError

    # Test
    if variant['test']:
        batch_data = target_state_buffer / variant['rescale']
        obs = torch.Tensor(batch_data[:1000]).to(ptu.device)
        print("Not expert data", ebm_model(obs * 200).mean().item())
        print("Expert data", ebm_model(obs).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = EBIL(env=env,
                     training_env=training_env,
                     exploration_policy=policy,
                     rew_func=variant['rew_func'],
                     cons=variant['cons'],
                     rescale=variant['rescale'],
                     ebm=ebm_model,
                     policy_trainer=trainer,
                     target_state_buffer=target_state_buffer,
                     state_indices=state_indices,
                     **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Code Example #16
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning GAIL
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
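
        # ScaledEnv presumably standardizes observations and actions with the
        # expert-demo statistics loaded above (e.g. obs -> (obs - obs_mean) / obs_std),
        # so the learner sees data on the same scale as the demonstrations.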

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)

    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # set up the discriminator models
    disc_model = StandardAIRLDisc(
        obs_dim + action_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the AdvBC algorithm
    algorithm = AdvBC(env,
                      policy,
                      disc_model,
                      expert_buffer,
                      training_env=training_env,
                      wrap_absorbing=variant['wrap_absorbing_state'],
                      **variant['algo_params'])
    print(algorithm.use_target_disc)
    print(algorithm.soft_target_disc_tau)
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.disc_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code example #17
def experiment(variant):
    print('RUNNING')
    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env_sampler = OnTheFlyEnvSampler(env_specs)

    # Build the normalizer for the env params
    env_spec_ranges = {}
    for k, v in env_specs.items():
        if isinstance(v, list):
            env_spec_ranges[k] = v

    mean = []
    half_diff = []
    for k in sorted(env_spec_ranges.keys()):
        r = env_spec_ranges[k]
        mean.append((r[0] + r[1]) / 2.0)
        half_diff.append((r[1] - r[0]) / 2.0)
    mean = np.array(mean)
    half_diff = np.array(half_diff)

    def env_params_normalizer(params):
        return (params - mean) / half_diff
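
    # each list-valued env spec defines a [low, high] range; the normalizer maps a
    # parameter in that range to [-1, 1]: (param - (low + high) / 2) / ((high - low) / 2)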

    variant['algo_params']['env_params_normalizer'] = env_params_normalizer

    # set up similar to non-meta version
    sample_env, _ = env_sampler()
    if variant['algo_params']['concat_env_params_to_obs']:
        meta_params_dim = sample_env.env_meta_params.shape[0]
    else:
        meta_params_dim = 0
    obs_dim = int(np.prod(sample_env.observation_space.shape))
    action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + meta_params_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + meta_params_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + meta_params_dim,
        action_dim=action_dim,
    )
    algorithm = NewMetaSoftActorCritic(env_sampler=env_sampler,
                                       policy=policy,
                                       qf1=qf1,
                                       qf2=qf2,
                                       vf=vf,
                                       **variant['algo_params'])
    # assert False, "Have not added new sac yet!"
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
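
# A small illustration (hypothetical key names) of the env-spec convention used in the
# experiment above: any list-valued entry is read as a [low, high] range, and
# OnTheFlyEnvSampler presumably samples env parameters from those ranges per task.
example_meta_env_specs = {
    'base_env_name': 'hopper',   # fixed (non-ranged) entry, hypothetical
    'gravity': [8.0, 12.0],      # ranged entry -> normalized to [-1, 1]
    'friction': [0.5, 1.5],      # ranged entry -> normalized to [-1, 1]
}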
Code example #18
def experiment(variant):
    expert_buffer = joblib.load(variant['xy_data_path'])['xy_data']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        # this branch is disabled: `extra_data` is never loaded in this script
        assert False
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)

    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        # policy = ReparamMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        # std=0.1
    )

    # set up the discriminator models
    disc_model_class = (ThreeWayResNetAIRLDisc
                        if variant['threeway'] else ResNetAIRLDisc)
    disc_model = disc_model_class(
        2,  # obs is just x-y pos
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the RL algorithm used to train the policy
    policy_optimizer = EntConstSAC(policy=policy,
                                   qf1=qf1,
                                   qf2=qf2,
                                   target_qf1=target_qf1,
                                   target_qf2=target_qf2,
                                   action_dim=action_dim,
                                   **variant['policy_params'])

    # set up the AIRL algorithm
    alg_class = (ThreewayStateMarginalMatchingAlg
                 if variant['threeway'] else StateMarginalMatchingAlg)
    algorithm = alg_class(env,
                          policy,
                          disc_model,
                          policy_optimizer,
                          expert_buffer,
                          training_env=training_env,
                          **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.policy_optimizer.policy_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf1_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf2_optimizer.defaults['lr'])
    print(algorithm.disc_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code example #19
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    state_indices = torch.LongTensor(variant['state_indices'])
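    # e.g. variant['state_indices'] = [0, 1] would restrict matching to the first two
    # state dimensions (such as an x-y position)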

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
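    # its input dimension is len(state_indices): the discriminator scores only the
    # selected state dimensions, not the full (state, action) pair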
    if variant['disc_model_type'] == 'resnet_disc':
        disc_model = ResNetAIRLDisc(
            len(state_indices),
            num_layer_blocks=variant['disc_num_blocks'],
            hid_dim=variant['disc_hid_dim'],
            hid_act=variant['disc_hid_act'],
            use_bn=variant['disc_use_bn'],
            clamp_magnitude=variant['disc_clamp_magnitude'])
    else:
        disc_model = MLPDisc(len(state_indices),
                             num_layer_blocks=variant['disc_num_blocks'],
                             hid_dim=variant['disc_hid_dim'],
                             hid_act=variant['disc_hid_act'],
                             use_bn=variant['disc_use_bn'],
                             clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvSMM(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       target_state_buffer=target_state_buffer,
                       state_indices=state_indices,
                       **variant['adv_smm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Code example #20
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.safe_load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the meta-learning setting (neural process AIRL)
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the policy and policy optimizer
    hidden_sizes = [variant['algo_params']['policy_net_size']] * variant['algo_params']['policy_num_layers']
    z_dim = variant['algo_params']['np_params']['z_dim']
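    # the critics and the policy condition on the task latent z by concatenating it
    # with the observation, hence the "+ z_dim" in the input sizes below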
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )
    
    # disc_model = GAILDiscModel(obs_dim + action_dim + z_dim, hid_dim=variant['algo_params']['disc_net_size'])
    disc_model = MlpGAILDisc(
        hidden_sizes=variant['disc_hidden_sizes'],
        output_size=1,
        input_size=obs_dim + action_dim + z_dim,
        hidden_activation=torch.tanh,  # torch.nn.functional.tanh is deprecated
        layer_norm=variant['disc_uses_layer_norm']
        # output_activation=identity,
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']['policy_params']
    )

    # Make the neural process
    # in the initial version we are assuming all trajectories have the same length
    timestep_enc_params = variant['algo_params']['np_params']['traj_enc_params']['timestep_enc_params']
    traj_enc_params = variant['algo_params']['np_params']['traj_enc_params']['traj_enc_params']
    timestep_enc_params['input_size'] = obs_dim + action_dim
    
    traj_samples, _ = train_context_buffer.sample_trajs(1, num_tasks=1)
    # len_context_traj = traj_samples[0][0]['observations'].shape[0]
    len_context_traj = 5  # hard-coded context-trajectory length instead of inferring it from the sampled trajectory
    traj_enc_params['input_size'] = timestep_enc_params['output_size'] * len_context_traj
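    # the timestep encoder embeds each (obs, action) pair; the trajectory encoder then
    # takes len_context_traj of these embeddings concatenated together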

    traj_enc = TrivialTrajEncoder(
        timestep_enc_params,
        traj_enc_params
    )

    trunk_params = variant['algo_params']['np_params']['r2z_map_params']['trunk_params']
    trunk_params['input_size'] = traj_enc.output_size
    
    split_params = variant['algo_params']['np_params']['r2z_map_params']['split_heads_params']
    split_params['input_size'] = trunk_params['output_size']
    split_params['output_size'] = variant['algo_params']['np_params']['z_dim']
    
    r2z_map = TrivialR2ZMap(
        trunk_params,
        split_params
    )
    
    np_enc = TrivialNPEncoder(
        variant['algo_params']['np_params']['np_enc_params']['agg_type'],
        traj_enc,
        r2z_map
    )
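    # the NP encoder presumably aggregates per-trajectory embeddings (via agg_type)
    # into the parameters of a distribution over the task latent z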
    
    # debugging scaffolding (disabled): a stub NP encoder whose output distribution
    # always has a zero mean
    # class StupidDistFormat():
    #     def __init__(self, var):
    #         self.mean = var
    # class ZeroModule(nn.Module):
    #     def __init__(self, z_dim):
    #         super().__init__()
    #         self.z_dim = z_dim
    #         self.fc = nn.Linear(10,10)
        
    #     def forward(self, context):
    #         c_len = len(context)
    #         return StupidDistFormat(Variable(torch.zeros(c_len, self.z_dim), requires_grad=False))
    # np_enc = ZeroModule(variant['algo_params']['np_params']['z_dim'])


    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)
    algorithm = NeuralProcessAIRL(
        meta_test_env, # env is the test env, training_env is the training env (following rlkit original setup)
        
        policy,
        disc_model,

        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,

        np_enc,

        policy_optimizer,

        training_env=meta_train_env, # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Code example #21
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    disc_model = MLPDisc(
        obs_dim + action_dim
        if not variant['adv_irl_params']['state_only'] else 2 * obs_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
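    # with state_only=True the discriminator input is 2 * obs_dim, presumably a
    # (state, next_state) pair; otherwise it scores (state, action) pairs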

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvIRL(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       expert_replay_buffer=expert_replay_buffer,
                       **variant['adv_irl_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1