# Imports assumed by this excerpt (the module header is not shown);
# `utils`, `ppo2`, and `VariationalAutoEncoder` come from the surrounding project.
from collections import deque
import os.path as osp

import joblib
import numpy as np
import tensorflow as tf


def autoencode(*, tf_cfg, env_cfg):
    with utils.TfEnvContext(tf_cfg, env_cfg) as context:
        utils.logger.configure()
        vae = VariationalAutoEncoder(
            obs_shape=context.env_context.environments.observation_space.shape,
            d_classes=20,
            d_embedding=30,
            embedding_weight=0.01)
        LR = 1e-4
        tf.get_default_session().run(tf.local_variables_initializer())
        tf.get_default_session().run(tf.global_variables_initializer())

        buffer = deque(maxlen=500)
        env = context.env_context.environments
        env.reset()
        num_timesteps = 10000
        for i in range(num_timesteps):
            # Linearly anneal the learning rate to zero over training.
            lr = LR * (num_timesteps - i) * 1.0 / num_timesteps

            # Collect 128 steps of observations under a uniform-random policy.
            obs = []
            for t in range(128):
                acts = [env.action_space.sample() for _ in range(env.num_envs)]
                obs.append(env.step(acts)[0])
            obs = np.array(obs).astype(np.uint8)
            # Overwrite the top 10 pixel rows of every frame with a constant
            # value (presumably to blank out the on-screen score).
            obs[:, :, :10, :, :] = 87.0
            # Swap and flatten the (time, env) axes into one batch axis.
            obs_batch = ppo2.sf01(obs)

            if i == 0:
                # Burn in on the first batch and dump a reconstruction
                # comparison as an overfitting sanity check.
                initial_obs = obs[:, 0, :, :, :]
                for n in range(500):
                    loss = vae.train_step(lr=2.5e-4, obs=initial_obs[:100])
                    if n % 100 == 0:
                        print(f"Initial burn in {n}/500: {loss}")
                joblib.dump(
                    vae.compare(initial_obs[:120], disp_p=1),
                    osp.join(utils.logger.get_dir(), 'overfit_check.pkl'))

            if i % 100 == 0:
                joblib.dump(vae.compare(obs_batch),
                            osp.join(utils.logger.get_dir(), f'img_{i}.pkl'))
            # for epoch in range(4):
            #     for idx in np.random.permutation([i for i in range(len(buffer))]):
            #         vae.train_step(lr=lr, obs=buffer[idx])
            if i < 1000 or i % 1000 == 0:
                vae.save(osp.join(utils.logger.get_dir(), f'vae_{i}.pkl'))

            # Train on a batch sampled uniformly from the replay buffer.
            buffer.append(obs_batch)
            utils.logger.logkv(
                'score',
                vae.train_step(lr=lr, obs=buffer[np.random.randint(len(buffer))]))
            utils.logger.dumpkvs()
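# Usage sketch (not part of the original file): how autoencode() might be
# invoked. The environment name, env count, and seed are placeholder values;
# the env_cfg keys mirror the configs built elsewhere in this excerpt.
def _example_autoencode_run():
    tf_cfg = tf.ConfigProto(allow_soft_placement=True)
    tf_cfg.gpu_options.allow_growth = True
    env_cfg = {
        'env_name': 'PongNoFrameskip-v4',  # placeholder
        'n_envs': 8,                       # placeholder
        'seed': 0,                         # placeholder
        'one_hot_code': False,
    }
    autoencode(tf_cfg=tf_cfg, env_cfg=env_cfg)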
        log_device_placement=False)
    tf_cfg.gpu_options.allow_growth = True
    env_cfg = {
        'env_name': args.env,
        'n_envs': args.num_envs,
        'seed': args.seed,
        'one_hot_code': False
    }
    extra_args = {}
    # Presumably trims the on-screen score out of Pong frames for the
    # non-pixel classifier encoder.
    if args.encoder_type == 'non_pixel_class' and 'Pong' in args.env:
        extra_args['trim_score'] = True
    with utils.TfEnvContext(tf_cfg, env_cfg) as context:
        utils.logger.configure()
        dirname = utils.logger.get_dir()
        print(f"logging in {dirname}")
        vae = encoder(
            obs_shape=context.env_context.environments.observation_space.shape,
            d_embedding=40,
            d_classes=30,
            embedding_weight=0.001,
            **extra_args)
        LR = 1e-4

        # NB: this helper shadows the enclosing function's `args` argument.
        def args(*, lr, obs, noise_scale=.2):
            return {
                'lr': lr,
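# Hypothetical completion sketch: the `args` helper above is truncated after
# 'lr': lr, so its remaining keys are unknown. The noise_scale parameter
# suggests observations are perturbed before training; one plausible shape
# for such a helper is sketched below (an assumption, not the repo's code).
def _noisy_train_kwargs(*, lr, obs, noise_scale=0.2):
    # Perturb observations with zero-mean Gaussian noise, then pair them with
    # the learning rate for a call like vae.train_step(**kwargs).
    noisy_obs = obs.astype(np.float32) + np.random.normal(
        scale=noise_scale, size=obs.shape)
    return {'lr': lr, 'obs': noisy_obs}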
def train_airl(args):
    tf_cfg = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=args.n_cpu,
        inter_op_parallelism_threads=args.n_cpu,
        device_count={'GPU': 1},
        log_device_placement=False)
    tf_cfg.gpu_options.allow_growth = True
    env_config = {
        'env_name': args.env,
        'n_envs': args.num_envs,
        'seed': args.seed,
        'one_hot_code': args.one_hot_code
    }
    with utils.TfEnvContext(tf_cfg, env_config) as context:
        ts = joblib.load(open(args.trajectories_file, 'rb'))
        training_kwargs, _, _, _ = irl.get_training_kwargs(
            venv=context.env_context.environments,
            training_cfg={
                'n_itr': args.n_iter,
                'batch_size': args.batch_size,
                'entropy_weight': args.entropy_wt
            },
            policy_cfg={
                'init_location': None if args.init_location == 'none'
                                 else args.init_location,
                'policy_model': CnnPolicy if args.policy_type == 'cnn'
                                else MlpPolicy
            },
            reward_model_cfg={
                'expert_trajs': ts,
                'state_only': args.state_only,
                'drop_framestack': args.drop_discriminator_framestack,
                'only_show_scores': args.only_show_discriminator_scores,
                'reward_arch': cnn_net if args.policy_type == 'cnn' else relu_net,
                'value_fn_arch': cnn_net if args.policy_type == 'cnn' else relu_net
            },
            ablation=args.ablation)
        algo = irl.IRLRunner(
            **training_kwargs,
            sampler_cls=sampling.PPOBatchSampler,
        )

        # Annotate the expert paths with the quantities the discriminator
        # needs: action log-probs under the current policy and next states.
        def fill_trajectories(paths):
            algo.irl_model.eval_expert_probs(paths, algo.policy, insert=True)
            algo.irl_model._insert_next_state(paths)

        fill_trajectories(ts)
        for t in ts:
            del t['agent_infos']

        T = len(ts)
        ans = []
        keys = ('observations', 'observations_next', 'actions',
                'actions_next', 'a_logprobs')
        for key in keys:
            print(key)
            batch = []
            for i in range(T):
                batch.append(ts[i][key].copy())
                del ts[i][key]
            ans.append(np.concatenate(batch).astype(np.float32))
            # Free the per-trajectory copies eagerly to bound peak memory.
            for i in reversed(range(len(batch))):
                del batch[i]
            del batch
        joblib.dump(ans, open(args.cache_path, 'wb'))
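# Read-back sketch (an assumption, not in the original file): train_airl
# caches one float32 array per key, in the order given by `keys`, so a
# consumer can rebuild a dict of arrays as below. `_load_trajectory_cache`
# is a hypothetical helper name.
def _load_trajectory_cache(cache_path):
    keys = ('observations', 'observations_next', 'actions',
            'actions_next', 'a_logprobs')
    return dict(zip(keys, joblib.load(cache_path)))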