Example #1
import os.path as osp
from collections import deque

import joblib
import numpy as np
import tensorflow as tf

# `utils`, `ppo2`, and `VariationalAutoEncoder` come from the surrounding
# project (or its dependencies); their import paths are assumed here.

def autoencode(*, tf_cfg, env_cfg):
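    """Train a VariationalAutoEncoder on observations collected by taking
    random actions in the configured environments."""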
    with utils.TfEnvContext(tf_cfg, env_cfg) as context:
        utils.logger.configure()
        vae = VariationalAutoEncoder(
            obs_shape=context.env_context.environments.observation_space.shape,
            d_classes=20,
            d_embedding=30,
            embedding_weight=0.01)
        LR = 1e-4

        tf.get_default_session().run(tf.local_variables_initializer())
        tf.get_default_session().run(tf.global_variables_initializer())

        buffer = deque(maxlen=500)

        env = context.env_context.environments
        env.reset()
        num_timesteps = 10000
        for i in range(num_timesteps):
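            # Linearly decay the learning rate from LR towards 0 over training.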
            lr = LR * (num_timesteps - i) * 1.0 / num_timesteps
            obs = []
            for t in range(128):
                acts = [env.action_space.sample() for _ in range(env.num_envs)]
                obs.append(env.step(acts)[0])
            obs = np.array(obs).astype(np.uint8)
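            # Blank out the top 10 pixel rows of every frame (likely the on-screen score).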
            obs[:, :, :10, :, :] = 87.0
            obs_batch = ppo2.sf01(obs)

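            # First iteration: burn the VAE in on a fixed batch and dump a
            # reconstruction comparison as an overfitting sanity check.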
            if i == 0:
                initial_obs = obs[:, 0, :, :, :]
                for n in range(500):
                    loss = vae.train_step(lr=2.5e-4, obs=initial_obs[:100])
                    if n % 100 == 0:
                        print(f"Initial burn in {n}/1000: {loss}")
                joblib.dump(
                    vae.compare(initial_obs[:120], disp_p=1),
                    osp.join(utils.logger.get_dir(), 'overfit_check.pkl'))

            if i % 100 == 0:
                joblib.dump(vae.compare(obs_batch),
                            osp.join(utils.logger.get_dir(), f'img_{i}.pkl'))
                #for epoch in range(4):
                #    for idx in np.random.permutation([i for i in range(len(buffer))]):
                #        vae.train_step(lr=lr, obs=buffer[idx])
                if i < 1000 or i % 1000 == 0:
                    vae.save(osp.join(utils.logger.get_dir(), f'vae_{i}.pkl'))

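            # Store the batch and train on one batch sampled uniformly from the replay buffer.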
            buffer.append(obs_batch)
            utils.logger.logkv(
                'score',
                vae.train_step(lr=lr,
                               obs=buffer[np.random.randint(len(buffer))]))

            utils.logger.dumpkvs()
Example #2
tf_cfg = tf.ConfigProto(  # earlier ConfigProto options are cut off in this listing
    log_device_placement=False
)
tf_cfg.gpu_options.allow_growth = True

env_cfg = {
    'env_name': args.env,
    'n_envs': args.num_envs,
    'seed': args.seed,
    'one_hot_code': False
}

extra_args = {}
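# Pong with the non-pixel classifier encoder gets an extra flag that (by its name) trims the on-screen score.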
if args.encoder_type == 'non_pixel_class' and 'Pong' in args.env:
    extra_args['trim_score'] = True
    
with utils.TfEnvContext(tf_cfg, env_cfg) as context:
    utils.logger.configure()
    dirname = utils.logger.get_dir() 
    print(f"logging in {dirname}")
    vae = encoder(
        obs_shape=context.env_context.environments.observation_space.shape,
        d_embedding=40,
        d_classes=30,
        embedding_weight=0.001,
        **extra_args
    )
    LR = 1e-4
    
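    # Helper that appears to build keyword arguments for the encoder's train
    # step; the rest of this example is cut off in the source listing.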
    def args(*, lr, obs, noise_scale=.2):
        return {
            'lr': lr,
Example #3
import joblib
import numpy as np
import tensorflow as tf

# `utils`, `irl`, `sampling`, `CnnPolicy`, `MlpPolicy`, `cnn_net`, and
# `relu_net` come from the surrounding project (or its dependencies);
# their import paths are assumed here.

def train_airl(args):
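    """Build an AIRL runner from cached expert trajectories, annotate them
    with policy log-probs and next-state fields, then flatten the
    per-trajectory arrays and cache them at args.cache_path."""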
    tf_cfg = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=args.n_cpu,
                            inter_op_parallelism_threads=args.n_cpu,
                            device_count={'GPU': 1},
                            log_device_placement=False)
    tf_cfg.gpu_options.allow_growth = True
    env_config = {
        'env_name': args.env,
        'n_envs': args.num_envs,
        'seed': args.seed,
        'one_hot_code': args.one_hot_code
    }
    with utils.TfEnvContext(tf_cfg, env_config) as context:
        ts = joblib.load(open(args.trajectories_file, 'rb'))
        training_kwargs, _, _, _ = irl.get_training_kwargs(
            venv=context.env_context.environments,
            training_cfg={
                'n_itr': args.n_iter,
                'batch_size': args.batch_size,
                'entropy_weight': args.entropy_wt
            },
            policy_cfg={
                'init_location':
                None if args.init_location == 'none' else args.init_location,
                'policy_model':
                CnnPolicy if args.policy_type == 'cnn' else MlpPolicy
            },
            reward_model_cfg={
                'expert_trajs': ts,
                'state_only': args.state_only,
                'drop_framestack': args.drop_discriminator_framestack,
                'only_show_scores': args.only_show_discriminator_scores,
                'reward_arch':
                cnn_net if args.policy_type == 'cnn' else relu_net,
                'value_fn_arch':
                cnn_net if args.policy_type == 'cnn' else relu_net
            },
            ablation=args.ablation)
        algo = irl.IRLRunner(
            **training_kwargs,
            sampler_cls=sampling.PPOBatchSampler,
        )

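        # Annotate expert trajectories in place with the policy's action
        # log-probabilities and the shifted next-state/next-action fields.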
        def fill_trajectories(paths):
            algo.irl_model.eval_expert_probs(paths, algo.policy, insert=True)
            algo.irl_model._insert_next_state(paths)

        fill_trajectories(ts)
        for t in ts:
            del t['agent_infos']

        T = len(ts)
        ans = []
        keys = ('observations', 'observations_next', 'actions', 'actions_next',
                'a_logprobs')
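        # Concatenate each field across trajectories into one float32 array,
        # deleting per-trajectory copies as we go to keep peak memory down.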
        for key in keys:
            print(key)
            batch = []
            for i in range(T):
                batch.append(ts[i][key].copy())
                del ts[i][key]
            ans.append(np.concatenate(batch).astype(np.float32))
            for i in reversed(range(len(batch))):
                del batch[i]
            del batch
        joblib.dump(ans, open(args.cache_path, 'wb'))