def run_linear_ocm_exp(variant):
    from rlkit.tf.ddpg import DDPG
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.qfunctions.memory.mlp_memory_qfunction import MlpMemoryQFunction
    # Import path assumed to mirror the other memory policies in this codebase.
    from rlkit.tf.policies.memory.action_aware_memory_policy import (
        ActionAwareMemoryPolicy
    )
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from rlkit.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly,
    )
    from rlkit.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']
    memory_dim = variant['memory_dim']
    algo_params = variant['algo_params']

    set_seed(seed)
    onehot_dim = num_values + 1
    env_action_dim = num_values + 1

    """
    Code for running the experiment.
    """
    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    # env = FlattenedProductBox(env)

    # qf = FeedForwardCritic(
    #     name_or_scope="critic",
    #     env_spec=env.spec,
    # )
    qf = MlpMemoryQFunction(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = ActionAwareMemoryPolicy(
        name_or_scope="noisy_policy",
        action_dim=env_action_dim,
        memory_dim=memory_dim,
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec)
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )
    algorithm.train()
def main(): # noinspection PyTypeChecker set_seed(args.seed) variant = dict( algo_params=dict( num_epochs=1000, num_steps_per_epoch=args.steps_per_epoch, num_steps_per_eval=1000, batch_size=args.batch_size, max_path_length=999, discount=0.99, reward_scale=args.reward_scale, soft_target_tau=0.001, policy_lr=3E-4, qf_lr=3E-4, vf_lr=3E-4, collection_mode=args.train_mode, num_updates_per_epoch=args.updates_per_epoch, num_threads=args.num_threads, ), net_size=args.net_size, ) setup_logger(args.env_name, variant=variant, exp_id=args.exp_name, seed=args.seed) ptu.set_gpu_mode(not args.cpu, gpu_id=args.gpu_id) experiment(variant)
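# A hypothetical argparse setup matching the attributes the main() above reads
# off a module-level `args` object; flag names and default values are
# assumptions, not taken from the source.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
parser.add_argument('--exp_name', type=str, default='dev')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--steps_per_epoch', type=int, default=1000)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--reward_scale', type=float, default=1.0)
parser.add_argument('--train_mode', type=str, default='batch')
parser.add_argument('--updates_per_epoch', type=int, default=1000)
parser.add_argument('--num_threads', type=int, default=1)
parser.add_argument('--net_size', type=int, default=256)
parser.add_argument('--cpu', action='store_true')
parser.add_argument('--gpu_id', type=int, default=0)
args = parser.parse_args()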
def main(): n_seeds = 1 mode = "here" exp_prefix = "dev-sl" # n_seeds = 10 # mode = "ec2" exp_prefix = "paper-6-14-HL-sl-H25" H = 25 # noinspection PyTypeChecker variant = dict( H=H, exp_prefix=exp_prefix, algo_params=dict( num_batches_per_epoch=100, num_epochs=30, learning_rate=1e-3, batch_size=1000, eval_num_episodes=64, lstm_state_size=10, # rnn_cell_class=LSTMCell, # rnn_cell_params=dict( # use_peepholes=True, # ), rnn_cell_class=SeparateLstmLinearCell, rnn_cell_params=dict( use_peepholes=True, env_noise_std=0, memory_noise_std=0, output_nonlinearity=tf.nn.tanh, # output_nonlinearity=tf.nn.softmax, env_hidden_sizes=[], output_dim=1, ), softmax=False, ), version='Supervised Learning', env_class=HighLow, env_params=dict(horizon=H, ) # env_class=OneCharMemory, ) exp_id = -1 for _ in range(n_seeds): seed = random.randint(0, 999999) exp_id += 1 set_seed(seed) variant['seed'] = seed variant['exp_id'] = exp_id run_experiment( bptt_launcher, exp_prefix=exp_prefix, seed=seed, mode=mode, variant=variant, exp_id=exp_id, )
def run_linear_ocm_exp(variant): from rlkit.tf.ddpg import DDPG from rlkit.envs.flattened_product_box import FlattenedProductBox from rlkit.exploration_strategies.ou_strategy import OUStrategy from rlkit.tf.policies.nn_policy import FeedForwardPolicy from rlkit.qfunctions.nn_qfunction import FeedForwardCritic from rlkit.envs.memory.continuous_memory_augmented import ( ContinuousMemoryAugmented ) from rlkit.launchers.launcher_util import ( set_seed, ) """ Set up experiment variants. """ seed = variant['seed'] algo_params = variant['algo_params'] env_class = variant['env_class'] env_params = variant['env_params'] memory_dim = variant['memory_dim'] ou_params = variant['ou_params'] set_seed(seed) """ Code for running the experiment. """ env = env_class(**env_params) env = ContinuousMemoryAugmented( env, num_memory_states=memory_dim, ) env = FlattenedProductBox(env) qf = FeedForwardCritic( name_or_scope="critic", env_spec=env.spec, ) policy = FeedForwardPolicy( name_or_scope="policy", env_spec=env.spec, ) es = OUStrategy( env_spec=env.spec, **ou_params ) algorithm = DDPG( env, es, policy, qf, **algo_params ) algorithm.train()
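# A hypothetical `variant` illustrating the keys the launcher above reads; the
# env class and every value here are placeholder assumptions, not taken from
# the source (HighLow is the toy env used by the supervised-learning launchers
# elsewhere in this collection).
example_variant = dict(
    seed=0,
    memory_dim=20,
    env_class=HighLow,
    env_params=dict(horizon=25),
    ou_params=dict(max_sigma=1.0, min_sigma=None),
    algo_params=dict(
        batch_size=64,
        n_epochs=50,
        max_path_length=26,
    ),
)
# run_linear_ocm_exp(example_variant)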
def exp_fn(variant):
    exp_id = variant['exp_id']
    print(variant.keys())
    exp_prefix = variant['exp_name']
    set_seed(variant['seed'])
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=variant)

    # run the experiment
    exp_return = experiment(variant)
    return exp_return
def run_linear_ocm_exp(variant): from sandbox.rocky.tf.algos.trpo import TRPO from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ( ConjugateGradientOptimizer, FiniteDifferenceHvp, ) from rlkit.envs.flattened_product_box import FlattenedProductBox from rlkit.envs.memory.continuous_memory_augmented import ( ContinuousMemoryAugmented) from rlkit.envs.memory.one_char_memory import ( OneCharMemoryEndOnly, ) from rlkit.launchers.launcher_util import ( set_seed, ) """ Set up experiment variants. """ H = variant['H'] seed = variant['seed'] num_values = variant['num_values'] set_seed(seed) onehot_dim = num_values + 1 """ Code for running the experiment. """ env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True) env = ContinuousMemoryAugmented( env, num_memory_states=onehot_dim, ) env = FlattenedProductBox(env) policy = GaussianMLPPolicy( name="policy", env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32), ) baseline = LinearFeatureBaseline(env_spec=env.spec) optimizer_params = variant['optimizer_params'] trpo_params = variant['trpo_params'] algo = TRPO(env=env, policy=policy, baseline=baseline, optimizer=ConjugateGradientOptimizer( hvp_approach=FiniteDifferenceHvp(**optimizer_params)), **trpo_params) algo.train()
def run_linear_ocm_exp(variant): from rlkit.tf.ddpg_ocm import DdpgOcm from rlkit.qfunctions.memory.mlp_memory_qfunction import MlpMemoryQFunction from rlkit.exploration_strategies.noop import NoopStrategy from rlkit.exploration_strategies.onehot_sampler import OneHotSampler from rlkit.exploration_strategies.product_strategy import ProductStrategy from rlkit.envs.memory.continuous_memory_augmented import ( ContinuousMemoryAugmented) from rlkit.envs.memory.one_char_memory import OneCharMemoryEndOnly from rlkit.tf.policies.memory.linear_ocm_policy import LinearOcmPolicy from rlkit.launchers.launcher_util import ( set_seed, ) """ Set up experiment variants. """ H = variant['H'] seed = variant['seed'] num_values = variant['num_values'] ddpg_params = variant['ddpg_params'] onehot_dim = num_values + 1 set_seed(seed) """ Code for running the experiment. """ env = OneCharMemoryEndOnly(n=num_values, num_steps=H) env = ContinuousMemoryAugmented( env, num_memory_states=onehot_dim, ) policy = LinearOcmPolicy( name_or_scope="policy", memory_and_action_dim=onehot_dim, env_spec=env.spec, ) es = ProductStrategy([OneHotSampler(), NoopStrategy()]) qf = MlpMemoryQFunction( name_or_scope="critic", env_spec=env.spec, ) algorithm = DdpgOcm(env, es, policy, qf, **ddpg_params) algorithm.train()
def main(env_name, exp_name, seed, horizon, episodes, cpu, stochastic): if not cpu: set_gpu_mode(True) set_seed(seed) env = gym.make(env_name) env.seed(seed) env.set_eval() log_dir = settings.log_dir() if exp_name: policy = utils.load(log_dir, exp_name, cpu, stochastic) if stochastic: num_params = policy.num_params() else: num_params = policy.stochastic_policy.num_params() print(f"num params: {num_params}") else: policy = RandomPolicy(env) render = episodes == 0 reset_kwargs = {} def rollout_fn(): return multitask_rollout( env, policy, horizon, render, observation_key="observation", desired_goal_key="desired_goal", representation_goal_key="representation_goal", **reset_kwargs, ) if render: paths = utils.render(env, rollout_fn) else: success_rate, n_col, paths_states = utils.evaluate( rollout_fn, episodes) print(f"Success rate: {success_rate} - Collisions: {n_col}")
def run_linear_ocm_exp(variant): from rlkit.tf.ddpg import DDPG from rlkit.launchers.launcher_util import ( set_seed, ) from rlkit.exploration_strategies.ou_strategy import OUStrategy from rlkit.tf.policies.nn_policy import FeedForwardPolicy from rlkit.qfunctions.nn_qfunction import FeedForwardCritic """ Set up experiment variants. """ H = variant['H'] seed = variant['seed'] algo_params = variant['algo_params'] env_class = variant['env_class'] env_params = variant['env_params'] ou_params = variant['ou_params'] set_seed(seed) """ Code for running the experiment. """ env = env_class(**env_params) qf = FeedForwardCritic( name_or_scope="critic", env_spec=env.spec, ) policy = FeedForwardPolicy( name_or_scope="policy", env_spec=env.spec, ) es = OUStrategy(env_spec=env.spec, **ou_params) algorithm = DDPG(env, es, policy, qf, **algo_params) algorithm.train()
def main( env_name, exp_dir, seed, resume, mode, archi, epochs, reward_scale, intrinsic_reward_scale, hidden_dim, batch_size, learning_rate, n_layers, soft_target_tau, auto_alpha, alpha, frac_goal_replay, horizon, replay_buffer_size, snapshot_mode, snapshot_gap, cpu, ): valid_modes = ["vanilla", "her", "her+icm", "icm"] valid_archi = [ "mlp", "cnn", "pointnet", ] if mode not in valid_modes: raise ValueError(f"Unknown mode: {mode}") if archi not in valid_archi: raise ValueError(f"Unknown network archi: {archi}") machine_log_dir = settings.log_dir() exp_dir = os.path.join(machine_log_dir, exp_dir, f"seed{seed}") # multi-gpu and batch size scaling replay_buffer_size = replay_buffer_size num_expl_steps_per_train_loop = 1000 num_eval_steps_per_epoch = 1000 min_num_steps_before_training = 1000 num_trains_per_train_loop = 1000 # learning rate and soft update linear scaling policy_lr = learning_rate qf_lr = learning_rate variant = dict( env_name=env_name, algorithm="sac", version="normal", seed=seed, resume=resume, mode=mode, archi=archi, replay_buffer_kwargs=dict(max_replay_buffer_size=replay_buffer_size, ), algorithm_kwargs=dict( batch_size=batch_size, num_epochs=epochs, num_eval_steps_per_epoch=num_eval_steps_per_epoch, num_expl_steps_per_train_loop=num_expl_steps_per_train_loop, num_trains_per_train_loop=num_trains_per_train_loop, min_num_steps_before_training=min_num_steps_before_training, max_path_length=horizon, ), trainer_kwargs=dict( discount=0.99, soft_target_tau=soft_target_tau, target_update_period=1, policy_lr=policy_lr, qf_lr=qf_lr, reward_scale=reward_scale, use_automatic_entropy_tuning=auto_alpha, alpha=alpha, ), qf_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers), policy_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers), icm_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers), log_dir=exp_dir, ) if mode in ["her", "her+icm"]: variant["replay_buffer_kwargs"].update( dict( fraction_goals_rollout_goals=1 - frac_goal_replay, # equal to k = 4 in HER paper fraction_goals_env_goals=0, )) if mode in ["her+icm", "icm"]: #TODO: Add here ICM specific actions variant['trainer_kwargs'][ 'intrinsic_reward_scale'] = intrinsic_reward_scale if archi != "pointnet": raise Exception("ICM can only handle pointnet architecture") set_seed(seed) setup_logger_kwargs = { "exp_prefix": exp_dir, "variant": variant, "log_dir": exp_dir, "snapshot_mode": snapshot_mode, "snapshot_gap": snapshot_gap, } setup_logger(**setup_logger_kwargs) ptu.set_gpu_mode(not cpu, distributed_mode=False) print(f"Start training...") sac(variant)
# Arguments
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--experiment', help='experiment specification file')
args = parser.parse_args()
with open(args.experiment, 'r') as spec_file:
    spec_string = spec_file.read()
    exp_specs = yaml.load(spec_string, Loader=yaml.FullLoader)

exp_path = exp_specs['exp_path']
sub_exp = exp_specs['sub_exp']
sample_from_prior = exp_specs['sample_from_prior']

print('\n\nUSING GPU\n\n')
ptu.set_gpu_mode(True)

# seed
set_seed(EVAL_SEED)

# load the expert replay buffer
expert_buffer = joblib.load(EXPERT_BUFFER_PATH)['meta_train']['context']

# do eval
all_stats = []
try:
    alg = joblib.load(osp.join(exp_path, sub_exp, 'best_meta_test.pkl'))['algorithm']
    print('\nLOADED ALGORITHM\n')
    if exp_specs['evaluating_np_airl']:
        alg.cuda()
        alg.main_policy.preprocess_model.cuda()
    else:
        alg.cuda()
except Exception as e:
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- env_specs = { 'flat_repr': False, 'one_hot_repr': False, 'maze_h': 9, 'maze_w': 9, 'obs_h': 5, 'obs_w': 5, 'scale': 1, 'num_objs': 10 } maze_constructor = lambda: PartiallyObservedGrid(env_specs) data_loader = VerySpecificOnTheFLyDataLoader( maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) val_data_loader = VerySpecificOnTheFLyDataLoader( maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- model = NewVRNN( next_obs_array[0].shape, acts_array[0].shape[0], exp_specs['vrnn_specs']['z_dim'], exp_specs['vrnn_specs']['x_encoder_specs'], exp_specs['vrnn_specs']['lstm_dim'], exp_specs['vrnn_specs']['decoder_part_specs'], ) if ptu.gpu_enabled(): model.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam(model.parameters(), lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- freq_bptt = exp_specs['freq_bptt'] MSE_losses = [] KL_losses = [] for iter_num in range(int(float(exp_specs['max_iters']))): if iter_num % freq_bptt == 0: if iter_num > 0: # loss = loss / freq_bptt total_ELBO.backward() model_optim.step() prev_h_batch = prev_h_batch.detach() prev_c_batch = prev_c_batch.detach() total_ELBO.detach() if iter_num % episode_length == 0: total_ELBO = 0. total_MSE = 0. total_KL = 0. prev_h_batch = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim)) prev_c_batch = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim)) if ptu.gpu_enabled(): prev_h_batch = prev_h_batch.cuda() prev_c_batch = prev_c_batch.cuda() train_mse_print = '\t'.join(MSE_losses) train_kl_print = '\t'.join(KL_losses) MSE_losses = [] KL_losses = [] obs_batch, act_batch = data_loader.get_next_batch() prior_mean, prior_log_cov, post_mean, post_log_cov, cur_z_sample, recon_mean, recon_log_cov, prev_h_batch, prev_c_batch = model(obs_batch, act_batch, prev_h_batch, prev_c_batch) elbo, KL = model.compute_ELBO(prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, obs_batch, average_over_batch=True) mse = ((recon_mean - obs_batch)**2).mean() total_elbo = total_ELBO + elbo total_MSE = total_MSE + mse MSE_losses.append(mse) KL_losses.append(KL) if iter_num % exp_specs['freq_val'] == 0: print('\nValidating Iter %d...' % iter_num) model.eval() val_prev_h_batch = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim)) val_prev_c_batch = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim)) if ptu.gpu_enabled(): val_prev_h_batch = val_prev_h_batch.cuda() val_prev_c_batch = val_prev_c_batch.cuda() val_total_ELBO = 0. val_total_KL = 0. val_total_MSE = 0. 
            val_MSE_losses = []
            val_KL_losses = []
            prior_imgs = []
            post_imgs = []
            obs_imgs = []
            while val_data_loader.cur_t != val_data_loader.episode_length:
                # validation draws from the validation loader and carries its own hidden state
                obs_batch, act_batch = val_data_loader.get_next_batch()
                prior_mean, prior_log_cov, post_mean, post_log_cov, cur_z_sample, recon_mean, recon_log_cov, val_prev_h_batch, val_prev_c_batch = model(
                    obs_batch, act_batch, val_prev_h_batch, val_prev_c_batch)
                val_elbo, val_KL = model.compute_ELBO(
                    prior_mean, prior_log_cov, post_mean, post_log_cov,
                    recon_mean, recon_log_cov, obs_batch,
                    average_over_batch=True
                )
                val_mse = ((recon_mean - obs_batch)**2).mean()

                # accumulate per-timestep validation statistics (once per step)
                val_total_ELBO = val_total_ELBO + val_elbo
                val_total_MSE = val_total_MSE + val_mse
                val_total_KL = val_total_KL + val_KL
                val_MSE_losses.append('%.4f' % val_mse)
                val_KL_losses.append('%.4f' % val_KL)

                # reconstructions from the prior and posterior means, for the gif
                prior_recon_mean, _ = model.get_obs_recon_dist(prior_mean, val_prev_h_batch)
                prior_recon_mean = np.transpose(prior_recon_mean[0].data.cpu().numpy(), (1, 2, 0))
                prior_imgs.append(prior_recon_mean)

                post_recon_mean, _ = model.get_obs_recon_dist(post_mean, val_prev_h_batch)
                post_recon_mean = np.transpose(post_recon_mean[0].data.cpu().numpy(), (1, 2, 0))
                post_imgs.append(post_recon_mean)

                obs = np.transpose(obs_batch[0].data.cpu().numpy(), (1, 2, 0))
                obs_imgs.append(obs)

            val_mse_print = '\t'.join(val_MSE_losses)
            val_kl_print = '\t'.join(val_KL_losses)
            print('Avg Timestep MSE:\t%.4f' % (val_total_MSE))
            print('Avg Timestep KL:\t%.4f' % (val_total_KL))
            print('MSE:\t%s' % val_mse_print)
            print('KL:\t%s' % val_kl_print)

            # generate the gifs
            generate_gif(
                [prior_imgs, post_imgs, obs_imgs],
                ['Prior', 'Posterior', 'True Obs'],
                'junk_vis/tiny_vrnn/%d.gif' % iter_num
            )
            model.train()
def run_trained_policy(path): ptu.set_gpu_mode(True) variant = json.load(open(osp.join(path, "variant.json"), "r")) set_seed(variant["seed"]) variant = preprocess_variant_llraps(variant) env_suite = variant.get("env_suite", "kitchen") env_kwargs = variant["env_kwargs"] num_low_level_actions_per_primitive = variant[ "num_low_level_actions_per_primitive"] low_level_action_dim = variant["low_level_action_dim"] env_name = variant["env_name"] make_env_lambda = lambda: make_env(env_suite, env_name, env_kwargs) eval_envs = [make_env_lambda() for _ in range(1)] eval_env = DummyVecEnv(eval_envs, pass_render_kwargs=variant.get( "pass_render_kwargs", False)) discrete_continuous_dist = variant["actor_kwargs"][ "discrete_continuous_dist"] num_primitives = eval_envs[0].num_primitives continuous_action_dim = eval_envs[0].max_arg_len discrete_action_dim = num_primitives if not discrete_continuous_dist: continuous_action_dim = continuous_action_dim + discrete_action_dim discrete_action_dim = 0 action_dim = continuous_action_dim + discrete_action_dim obs_dim = eval_env.observation_space.low.size primitive_model = Mlp( output_size=variant["low_level_action_dim"], input_size=variant["model_kwargs"]["stochastic_state_size"] + variant["model_kwargs"]["deterministic_state_size"] + eval_env.envs[0].action_space.low.shape[0] + 1, hidden_activation=nn.ReLU, num_embeddings=eval_envs[0].num_primitives, embedding_dim=eval_envs[0].num_primitives, embedding_slice=eval_envs[0].num_primitives, **variant["primitive_model_kwargs"], ) world_model = LowlevelRAPSWorldModel( low_level_action_dim, image_shape=eval_envs[0].image_shape, primitive_model=primitive_model, **variant["model_kwargs"], ) actor = ActorModel( variant["model_kwargs"]["model_hidden_size"], world_model.feature_size, hidden_activation=nn.ELU, discrete_action_dim=discrete_action_dim, continuous_action_dim=continuous_action_dim, **variant["actor_kwargs"], ) actor.load_state_dict(torch.load(osp.join(path, "actor.ptc"))) world_model.load_state_dict(torch.load(osp.join(path, "world_model.ptc"))) actor.to(ptu.device) world_model.to(ptu.device) eval_policy = DreamerLowLevelRAPSPolicy( world_model, actor, obs_dim, action_dim, num_low_level_actions_per_primitive=num_low_level_actions_per_primitive, low_level_action_dim=low_level_action_dim, exploration=False, expl_amount=0.0, discrete_action_dim=discrete_action_dim, continuous_action_dim=continuous_action_dim, discrete_continuous_dist=discrete_continuous_dist, ) with torch.no_grad(): with torch.cuda.amp.autocast(): for step in range( 0, variant["algorithm_kwargs"]["max_path_length"] + 1): if step == 0: observation = eval_env.envs[0].reset() eval_policy.reset(observation.reshape(1, -1)) policy_o = (None, observation.reshape(1, -1)) reward = 0 else: high_level_action, _ = eval_policy.get_action(policy_o, ) observation, reward, done, info = eval_env.envs[0].step( high_level_action[0], ) low_level_obs = np.expand_dims( np.array(info["low_level_obs"]), 0) low_level_action = np.expand_dims( np.array(info["low_level_action"]), 0) policy_o = (low_level_action, low_level_obs) return reward
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- replay_dict = joblib.load(exp_specs['replay_dict_path']) next_obs_array = replay_dict['next_observations'] acts_array = replay_dict['actions'] data_loader = BasicDataLoader(next_obs_array[:40000], acts_array[:40000], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) val_data_loader = BasicDataLoader(next_obs_array[40000:], acts_array[40000:], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- conv_encoder = nn.Sequential( nn.Conv2d(3, 32, 1, stride=1, padding=0, bias=False), nn.BatchNorm2d(32), nn.ReLU(), nn.Conv2d(32, 32, 1, stride=1, padding=0, bias=False), nn.BatchNorm2d(32), nn.ReLU()) ae_dim = 128 z_dim = 128 pre_gru = nn.Sequential(nn.Linear(288 + z_dim + 4, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU()) post_fc = nn.Sequential(nn.Linear(ae_dim + 288 + 4, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU()) post_mean_fc = nn.Linear(ae_dim, z_dim, bias=True) post_log_cov_fc = nn.Linear(ae_dim, z_dim, bias=True) prior_fc = nn.Sequential(nn.Linear(ae_dim + 4, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU()) prior_mean_fc = nn.Linear(ae_dim, z_dim, bias=True) prior_log_cov_fc = nn.Linear(ae_dim, z_dim, bias=True) gru = nn.GRUCell(ae_dim, ae_dim, bias=True) fc_decoder = nn.Sequential( nn.Linear(ae_dim + z_dim + 4, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, 288, bias=False), nn.BatchNorm1d(288), nn.ReLU(), ) conv_decoder = nn.Sequential( nn.ConvTranspose2d(32, 32, 1, stride=1, padding=0, output_padding=0, bias=False), nn.BatchNorm2d(32), nn.ReLU(), nn.ConvTranspose2d(32, 32, 1, stride=1, padding=0, output_padding=0, bias=False), nn.BatchNorm2d(32), nn.ReLU(), nn.Conv2d(32, 3, 1, stride=1, padding=0, bias=True), nn.Sigmoid()) if ptu.gpu_enabled(): conv_encoder.cuda() pre_gru.cuda() post_fc.cuda() post_mean_fc.cuda() post_log_cov_fc.cuda() prior_fc.cuda() prior_mean_fc.cuda() prior_log_cov_fc.cuda() gru.cuda() fc_decoder.cuda() conv_decoder.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam([ item for sublist in map(lambda x: list(x.parameters()), [ pre_gru, conv_encoder, gru, fc_decoder, conv_decoder, post_fc, post_log_cov_fc, post_mean_fc, prior_fc, prior_log_cov_fc, prior_mean_fc ]) for item in sublist ], lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- freq_bptt = exp_specs['freq_bptt'] episode_length = exp_specs['episode_length'] losses = [] KLs = [] for iter_num in range(int(float(exp_specs['max_iters']))): if iter_num % freq_bptt == 0: if iter_num > 0: # loss = loss / freq_bptt loss = loss + total_KL loss.backward() model_optim.step() loss = 0 total_KL = 0 prev_h_batch = 
Variable( torch.zeros(exp_specs['batch_size'], ae_dim)) if ptu.gpu_enabled(): prev_h_batch = prev_h_batch.cuda() if iter_num % exp_specs['freq_val'] == 0: train_loss_print = '\t'.join(losses) train_KLs_print = '\t'.join(KLs) losses = [] KLs = [] obs_batch, act_batch = data_loader.get_next_batch() enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1) hidden = post_fc(torch.cat([prev_h_batch, enc, act_batch], 1)) post_mean = post_mean_fc(hidden) post_log_cov = post_log_cov_fc(hidden) hidden = prior_fc(torch.cat([prev_h_batch, act_batch], 1)) prior_mean = prior_mean_fc(hidden) prior_log_cov = prior_log_cov_fc(hidden) recon = fc_decoder(torch.cat([prev_h_batch, act_batch, post_mean], 1)).view(obs_batch.size(0), 32, 3, 3) recon = conv_decoder(recon) hidden = pre_gru(torch.cat([enc, post_mean, act_batch], 1)) prev_h_batch = gru(hidden, prev_h_batch) KL = compute_KL(prior_mean, prior_log_cov, post_mean, post_log_cov) if iter_num % episode_length != 0: loss = loss + torch.sum( (obs_batch.view(obs_batch.size(0), -1) - recon.view(obs_batch.size(0), -1))**2, 1).mean() total_KL = total_KL + KL losses.append('%.4f' % ((obs_batch - recon)**2).mean()) KLs.append('%.4f' % KL) if iter_num % (50 * exp_specs['episode_length']) in range( 2 * exp_specs['episode_length']): save_pytorch_tensor_as_img( recon[0].data.cpu(), 'junk_vis/full_KL_mem_grid_%d_recon.png' % iter_num) save_pytorch_tensor_as_img( obs_batch[0].data.cpu(), 'junk_vis/full_KL_mem_grid_%d_obs.png' % iter_num) if iter_num % exp_specs['freq_val'] == 0: print('\nValidating Iter %d...' % iter_num) list( map(lambda x: x.eval(), [ pre_gru, conv_encoder, gru, fc_decoder, conv_decoder, post_fc, post_log_cov_fc, post_mean_fc, prior_fc, prior_log_cov_fc, prior_mean_fc ])) val_prev_h_batch = Variable( torch.zeros(exp_specs['batch_size'], ae_dim)) if ptu.gpu_enabled(): val_prev_h_batch = val_prev_h_batch.cuda() val_losses = [] val_KLs = [] for i in range(freq_bptt): obs_batch, act_batch = data_loader.get_next_batch() enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1) hidden = post_fc(torch.cat([prev_h_batch, enc, act_batch], 1)) post_mean = post_mean_fc(hidden) post_log_cov = post_log_cov_fc(hidden) hidden = prior_fc(torch.cat([prev_h_batch, act_batch], 1)) prior_mean = prior_mean_fc(hidden) prior_log_cov = prior_log_cov_fc(hidden) recon = fc_decoder( torch.cat([prev_h_batch, act_batch, post_mean], 1)).view(obs_batch.size(0), 32, 3, 3) recon = conv_decoder(recon) hidden = pre_gru(torch.cat([enc, post_mean, act_batch], 1)) prev_h_batch = gru(hidden, prev_h_batch) val_losses.append('%.4f' % ((obs_batch - recon)**2).mean()) val_KL = compute_KL(prior_mean, prior_log_cov, post_mean, post_log_cov) val_KLs.append('%.4f' % val_KL) val_loss_print = '\t'.join(val_losses) val_KLs_print = '\t'.join(val_KLs) print('Val MSE:\t' + val_loss_print) print('Train MSE:\t' + train_loss_print) print('Val KL:\t\t' + val_KLs_print) print('Train KL:\t' + train_KLs_print) list( map(lambda x: x.train(), [ pre_gru, conv_encoder, gru, fc_decoder, conv_decoder, post_fc, post_log_cov_fc, post_mean_fc, prior_fc, prior_log_cov_fc, prior_mean_fc ]))
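# compute_KL is called by the recurrent autoencoder scripts above but not
# defined in these snippets. A minimal sketch, assuming diagonal Gaussians
# parameterized by mean and log-covariance and a batch-averaged
# KL(posterior || prior); the argument order matches the call sites.
import torch

def compute_KL(prior_mean, prior_log_cov, post_mean, post_log_cov):
    # closed-form KL between diagonal Gaussians, summed over latent dims,
    # averaged over the batch
    kl_per_dim = 0.5 * (
        prior_log_cov - post_log_cov
        + (torch.exp(post_log_cov) + (post_mean - prior_mean) ** 2)
        / torch.exp(prior_log_cov)
        - 1.0
    )
    return kl_per_dim.sum(dim=1).mean()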
exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs'], ) algorithm.to(ptu.device) algorithm.train() if __name__ == "__main__": parser = argparse.ArgumentParser(description='Soft Actor Critic') parser.add_argument('--config', type=str, default="configs/lunarlander.yaml") parser.add_argument('--gpu', type=int, default=0, help="using cpu with -1") parser.add_argument('--seed', type=int, default=0) args = parser.parse_args() with open(args.config, 'r', encoding="utf-8") as f: variant = yaml.load(f, Loader=yaml.FullLoader) variant["seed"] = args.seed log_prefix = "_".join( ["sac", variant["env"][:-3].lower(), str(variant["version"])]) setup_logger(log_prefix, variant=variant, seed=args.seed) if args.gpu >= 0: ptu.set_gpu_mode(True, args.gpu) set_seed(args.seed) experiment(variant)
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- replay_dict = joblib.load(exp_specs['replay_dict_path']) next_obs_array = replay_dict['next_observations'] acts_array = replay_dict['actions'] data_loader = BasicDataLoader(next_obs_array[:40000], acts_array[:40000], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) val_data_loader = BasicDataLoader(next_obs_array[40000:], acts_array[40000:], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- ae_dim = 128 encoder = nn.Sequential(nn.Linear(48, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU()) gru = nn.GRUCell(ae_dim, ae_dim, bias=True) decoder = nn.Sequential(nn.Linear(ae_dim + 4, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, 48), nn.Sigmoid()) if ptu.gpu_enabled(): encoder.cuda() gru.cuda() decoder.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam(list(encoder.parameters()) + list(decoder.parameters()) + list(gru.parameters()), lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- freq_bptt = exp_specs['freq_bptt'] losses = [] for iter_num in range(int(float(exp_specs['max_iters']))): if iter_num % freq_bptt == 0: if iter_num > 0: # loss = loss / freq_bptt loss.backward() model_optim.step() loss = 0 prev_h_batch = Variable( torch.zeros(exp_specs['batch_size'], ae_dim)) if ptu.gpu_enabled(): prev_h_batch = prev_h_batch.cuda() if iter_num % exp_specs['freq_val'] == 0: train_loss_print = '\t'.join(losses) losses = [] obs_batch, act_batch = data_loader.get_next_batch() recon = decoder(torch.cat([prev_h_batch, act_batch], 1)).view(obs_batch.size()) enc = encoder(obs_batch.view(obs_batch.size(0), -1)) prev_h_batch = gru(enc, prev_h_batch) losses.append('%.4f' % ((obs_batch - recon)**2).mean()) if iter_num % freq_bptt != 0: loss = loss + ( (obs_batch - recon)**2).sum() / float(exp_specs['batch_size']) if iter_num % 250 in range(10): save_pytorch_tensor_as_img( recon[0].data.cpu(), 'junk_vis/with_wd_1e-3_ae_recon_%d.png' % iter_num) save_pytorch_tensor_as_img( obs_batch[0].data.cpu(), 'junk_vis/with_wd_1e-3_ae_obs_%d.png' % iter_num) if iter_num % exp_specs['freq_val'] == 0: print('\nValidating Iter %d...' 
% iter_num) list(map(lambda x: x.eval(), [encoder, decoder, gru])) val_prev_h_batch = Variable( torch.zeros(exp_specs['batch_size'], ae_dim)) if ptu.gpu_enabled(): val_prev_h_batch = val_prev_h_batch.cuda() losses = [] for i in range(freq_bptt): obs_batch, act_batch = val_data_loader.get_next_batch() recon = decoder(torch.cat([val_prev_h_batch, act_batch], 1)).view(obs_batch.size()) enc = encoder(obs_batch.view(obs_batch.size(0), -1)) val_prev_h_batch = gru(enc, val_prev_h_batch) losses.append('%.4f' % ((obs_batch - recon)**2).mean()) loss_print = '\t'.join(losses) print('Val MSE:\t' + loss_print) print('Train MSE:\t' + train_loss_print) list(map(lambda x: x.train(), [encoder, decoder, gru]))
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) img_save_path = 'junk_vis/debug_more_proper' # Prep the data ----------------------------------------------------------- data_path = 'junk_vis/multi_mnist_data' canvas_size = 36 (X_train, _), (X_test, _) = multi_mnist(data_path, max_digits=1, canvas_size=canvas_size, seed=42, use_max=True) X_train = X_train[:, None, ...] X_test = X_test[:, None, ...] X_train, X_test = torch.FloatTensor(X_train) / 255.0, torch.FloatTensor( X_test) / 255.0 # np_imgs = np.load('/u/kamyar/dsprites-dataset/dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz')['imgs'] # np_imgs = None X_train = torch.clamp(X_train, 0.05, 0.95) X_test = torch.clamp(X_test, 0.05, 0.95) train_ds = TensorDataset(X_train) val_ds = TensorDataset(X_test) # Model Definition -------------------------------------------------------- if exp_specs['masked']: model = MaskedVAE( [1, canvas_size, canvas_size], exp_specs['vae_specs']['z_dim'], exp_specs['vae_specs']['encoder_specs'], exp_specs['vae_specs']['decoder_specs'], ) else: model = VAE( [1, canvas_size, canvas_size], exp_specs['vae_specs']['z_dim'], exp_specs['vae_specs']['encoder_specs'], exp_specs['vae_specs']['decoder_specs'], ) if ptu.gpu_enabled(): model.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam(model.parameters(), lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- global_iter = 0 for epoch in range(exp_specs['epochs']): train_loader = DataLoader(train_ds, batch_size=exp_specs['batch_size'], shuffle=True, num_workers=4, pin_memory=True, drop_last=True) for iter_num, img_batch in enumerate(train_loader): img_batch = img_batch[0] if ptu.gpu_enabled(): img_batch = img_batch.cuda() z_mean, z_log_cov, recon_mean, recon_log_cov, enc_mask, dec_mask = model( img_batch) elbo, KL = model.compute_ELBO(z_mean, z_log_cov, recon_mean, recon_log_cov, img_batch, average_over_batch=True) loss = -1. * elbo loss.backward() model_optim.step() if global_iter % 1000 == 0: mse = ((recon_mean - img_batch)**2).mean() print('\nTraining Iter %d...' % global_iter) print('ELBO:\t%.4f' % elbo) print('MSE:\t%.4f' % mse) print('KL:\t%.4f' % KL) save_pytorch_tensor_as_img( img_batch[0].data.cpu(), os.path.join(img_save_path, '%d_train_img.png' % (global_iter))) save_pytorch_tensor_as_img( recon_mean[0].data.cpu(), os.path.join(img_save_path, '%d_train_recon.png' % (global_iter))) if exp_specs['masked']: save_pytorch_tensor_as_img( enc_mask[0].data.cpu(), os.path.join(img_save_path, '%d_train_enc_mask.png' % (global_iter))) # save_pytorch_tensor_as_img(dec_mask[0].data.cpu(), os.path.join(img_save_path, '%d_train_dec_mask.png'%(global_iter))) if global_iter % exp_specs['freq_val'] == 0: with torch.no_grad(): print('Validating Iter %d...' 
% global_iter) model.eval() idxs = np.random.choice(int(X_test.size(0)), size=exp_specs['batch_size'], replace=False) img_batch = X_test[idxs] if ptu.gpu_enabled(): img_batch = img_batch.cuda() z_mean, z_log_cov, recon_mean, recon_log_cov, enc_mask, dec_mask = model( img_batch) elbo, KL = model.compute_ELBO(z_mean, z_log_cov, recon_mean, recon_log_cov, img_batch, average_over_batch=True) mse = ((recon_mean - img_batch)**2).mean() print('ELBO:\t%.4f' % elbo) print('MSE:\t%.4f' % mse) print('KL:\t%.4f' % KL) for i in range(1): save_pytorch_tensor_as_img( img_batch[i].data.cpu(), os.path.join(img_save_path, '%d_%d_img.png' % (global_iter, i))) save_pytorch_tensor_as_img( recon_mean[i].data.cpu(), os.path.join(img_save_path, '%d_%d_recon.png' % (global_iter, i))) if exp_specs['masked']: save_pytorch_tensor_as_img( enc_mask[i].data.cpu(), os.path.join( img_save_path, '%d_%d_enc_mask.png' % (global_iter, i))) # save_pytorch_tensor_as_img(dec_mask[i].data.cpu(), os.path.join(img_save_path, '%d_%d_dec_mask.png'%(global_iter, i))) model.train() global_iter += 1
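# A hypothetical exp_specs dict covering the keys the multi-MNIST VAE
# experiment above reads; all values (and the encoder/decoder spec contents)
# are placeholder assumptions, not taken from the source.
example_exp_specs = dict(
    use_gpu=True,
    exp_id=0,
    exp_name='masked_vae_multi_mnist',
    seed=9783,
    masked=True,
    model_lr=1e-3,
    model_wd=1e-5,
    epochs=100,
    batch_size=32,
    freq_val=1000,
    vae_specs=dict(
        z_dim=64,
        encoder_specs={},   # assumed: architecture spec consumed by MaskedVAE / VAE
        decoder_specs={},
    ),
)
# experiment(example_exp_specs)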
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- replay_dict = joblib.load(exp_specs['replay_dict_path']) next_obs_array = replay_dict['next_observations'] acts_array = replay_dict['actions'] data_loader = BasicDataLoader(next_obs_array[:40000], acts_array[:40000], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) val_data_loader = BasicDataLoader(next_obs_array[40000:], acts_array[40000:], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- ae_dim = 128 model = nn.Sequential(nn.Linear(48, ae_dim, bias=False), nn.BatchNorm1d(ae_dim, affine=False), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim, affine=False), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim, affine=False), nn.ReLU(), nn.Linear(ae_dim, 48), nn.Sigmoid()) if ptu.gpu_enabled(): model.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam(model.parameters(), lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- freq_bptt = exp_specs['freq_bptt'] for iter_num in range(int(float(exp_specs['max_iters']))): if iter_num % freq_bptt == 0: if iter_num > 0: loss.backward() model_optim.step() loss = 0 obs_batch, act_batch = data_loader.get_next_batch() recon = model(obs_batch.view(obs_batch.size(0), -1)).view(obs_batch.size()) loss = loss + ( (obs_batch - recon)**2).sum() / float(exp_specs['batch_size']) if iter_num % 50 == 0: save_pytorch_tensor_as_img(recon[0].data.cpu(), 'junk_vis/ae_recon_%d.png' % iter_num) save_pytorch_tensor_as_img(obs_batch[0].data.cpu(), 'junk_vis/ae_obs_%d.png' % iter_num) if iter_num % exp_specs['freq_val'] == 0: print('\nValidating Iter %d...' % iter_num) model.eval() obs_batch, act_batch = val_data_loader.get_next_batch() recon = model(obs_batch.view(obs_batch.size(0), -1)).view(obs_batch.size()) print('MSE:\t%.4f' % ((obs_batch - recon)**2).mean()) model.train()
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- env_specs = { 'flat_repr': False, 'one_hot_repr': False, 'maze_h': 9, 'maze_w': 9, 'obs_h': 5, 'obs_w': 5, 'scale': 4, 'num_objs': 10 } maze_constructor = lambda: PartiallyObservedGrid(env_specs) data_loader = VerySpecificOnTheFLyDataLoader( maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) val_data_loader = VerySpecificOnTheFLyDataLoader( maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- model = NewVRNN( [3, env_specs['obs_h']*env_specs['scale'], env_specs['obs_w']*env_specs['scale']], exp_specs['vrnn_specs']['act_proc_dim'], exp_specs['vrnn_specs']['z_dim'], exp_specs['vrnn_specs']['pre_post_gru_dim'], exp_specs['vrnn_specs']['x_encoder_specs'], exp_specs['vrnn_specs']['decoder_part_specs'], ) if ptu.gpu_enabled(): model.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam(model.parameters(), lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- freq_bptt = exp_specs['freq_bptt'] episode_length = exp_specs['episode_length'] MSE_losses = [] KL_losses = [] for iter_num in range(int(float(exp_specs['max_iters']))): if iter_num % freq_bptt == 0: if iter_num > 0: # loss = loss / freq_bptt loss = -1. * total_ELBO loss.backward() model_optim.step() prev_z.detach() total_ELBO.detach() if iter_num % episode_length == 0: total_ELBO = 0. total_MSE = 0. total_KL = 0. prev_z = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim)) if ptu.gpu_enabled(): prev_z = prev_z.cuda() train_mse_print = '\t'.join(MSE_losses) train_kl_print = '\t'.join(KL_losses) MSE_losses = [] KL_losses = [] obs_batch, act_batch = data_loader.get_next_batch() prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, prev_z = model(obs_batch, act_batch, prev_z) elbo, KL = model.compute_ELBO(prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, obs_batch, average_over_batch=True) mse = ((recon_mean - obs_batch)**2).mean() # # Trying something with the prior # eps = Variable(torch.randn(prior_mean.size())) # if prior_mean.is_cuda: eps = eps.cuda() # prior_z_sample = prior_mean + eps*torch.exp(0.5 * prior_log_cov) # prior_recon_mean, _ = model.get_obs_recon_dist(prior_z_sample, act_batch) # prior_recon_log_prob = -0.5 * torch.sum((prior_recon_mean - obs_batch)**2) / float(obs_batch.size(0)) # elbo = elbo + prior_recon_log_prob # save_pytorch_tensor_as_img(obs_batch[0].data.cpu(), 'junk_vis/tiny_vrnn_larger_cov_range_1_KL/obs_%d.png' % iter_num) MSE_losses.append('%.4f' % mse) KL_losses.append('%.4f' % KL) if iter_num % episode_length != 0: total_ELBO = total_ELBO + elbo total_MSE = total_MSE + mse if iter_num % exp_specs['freq_val'] == 0: with torch.no_grad(): print('\nValidating Iter %d...' % iter_num) model.eval() val_prev_z = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim)) if ptu.gpu_enabled(): val_prev_z = val_prev_z.cuda() val_total_ELBO = 0. 
val_total_KL = 0. val_total_MSE = 0. val_total_prior_MSE = 0. val_MSE_losses = [] val_prior_MSE_losses = [] val_KL_losses = [] prior_imgs = [] post_imgs = [] post_sample_imgs = [] obs_imgs = [] for _ in range(episode_length): obs_batch, act_batch = val_data_loader.get_next_batch() # save_pytorch_tensor_as_img(obs_batch[0].data.cpu(), 'junk_vis/tiny_vrnn_larger_cov_range_1_KL/val_obs_%d.png' % i) prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, val_prev_z = model(obs_batch, act_batch, val_prev_z) val_elbo, val_KL = model.compute_ELBO(prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, obs_batch, average_over_batch=True) val_mse = ((recon_mean - obs_batch)**2).mean() print('Mean:') print(torch.exp(post_mean)[0,:8].data.cpu().numpy()) print(torch.exp(prior_mean)[0,:8].data.cpu().numpy()) print('-----') print('Cov') print(torch.exp(post_log_cov)[0,:8].data.cpu().numpy()) print(torch.exp(prior_log_cov)[0,:8].data.cpu().numpy()) print('-----------------------') val_total_elbo = val_total_ELBO + val_elbo val_total_MSE = val_total_MSE + val_mse val_MSE_losses.append('%.4f' % val_mse) val_KL_losses.append('%.4f' % val_KL) prior_recon_mean, _ = model.get_obs_recon_dist(prior_mean, act_batch) val_prior_mse = ((prior_recon_mean - obs_batch)**2).mean() val_total_prior_MSE = val_total_prior_MSE + val_prior_mse val_prior_MSE_losses.append('%.4f' % val_prior_mse) prior_recon_mean = np.transpose(prior_recon_mean[0].data.cpu().numpy(), (1,2,0)) prior_imgs.append(prior_recon_mean) post_recon_mean, _ = model.get_obs_recon_dist(post_mean, act_batch) post_recon_mean = np.transpose(post_recon_mean[0].data.cpu().numpy(), (1,2,0)) post_imgs.append(post_recon_mean) sample_recon_mean = recon_mean sample_recon_mean = np.transpose(sample_recon_mean[0].data.cpu().numpy(), (1,2,0)) post_sample_imgs.append(sample_recon_mean) obs = np.transpose(obs_batch[0].data.cpu().numpy(), (1,2,0)) obs_imgs.append(obs) post_prior_KL = model.compute_KL(prior_mean, prior_log_cov, post_mean, post_log_cov) val_elbo, val_KL = model.compute_ELBO( prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, obs_batch, average_over_batch=True ) val_mse_print = '\t'.join(val_MSE_losses) val_prior_mse_print = '\t'.join(val_prior_MSE_losses) val_kl_print = '\t'.join(val_KL_losses) print('Avg Timestep MSE:\t\t%.4f' % (val_total_MSE)) print('Avg Timestep Prior MSE:\t%.4f' % (val_total_prior_MSE)) print('Avg Timestep KL:\t\t%.4f' % (val_total_KL)) print('MSE:\t\t%s' % val_mse_print) print('Prior MSE:\t%s' % val_prior_mse_print) print('KL:\t\t%s' % val_kl_print) # generate the gifs generate_gif( [post_sample_imgs, prior_imgs, post_imgs, obs_imgs], ['Posterior Sample', 'Prior', 'Posterior', 'True Obs'], 'junk_vis/vrnn_kl_0/%d.gif' % iter_num ) model.train()
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- replay_dict = joblib.load(exp_specs['replay_dict_path']) next_obs_array = replay_dict['next_observations'] acts_array = replay_dict['actions'] data_loader = RandomDataLoader(next_obs_array[:4000], acts_array[:4000], use_gpu=ptu.gpu_enabled()) val_data_loader = RandomDataLoader(next_obs_array[4000:], acts_array[4000:], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- if exp_specs['use_masked_vae']: model = VAESeg() else: model = VAE() if ptu.gpu_enabled(): model.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam(model.parameters(), lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- for iter_num in range(int(float(exp_specs['max_iters']))): obs_batch, act_batch = data_loader.get_next_batch( exp_specs['batch_size']) if exp_specs['use_masked_vae']: recon_mean, recon_log_cov, z_mean, z_log_cov, mask = model( obs_batch) else: recon_mean, recon_log_cov, z_mean, z_log_cov = model(obs_batch) elbo = model.compute_ELBO(z_mean, z_log_cov, recon_mean, recon_log_cov, obs_batch) KL = model.compute_KL(z_mean, z_log_cov) neg_elbo = -1. * elbo neg_elbo.backward() model_optim.step() if iter_num % exp_specs['freq_val'] == 0: print('\nValidating Iter %d...' % iter_num) model.eval() obs_batch, act_batch = val_data_loader.get_next_batch( exp_specs['batch_size']) if exp_specs['use_masked_vae']: recon_mean, recon_log_cov, z_mean, z_log_cov, mask = model( obs_batch) mask = mask.repeat(1, 3, 1, 1) save_pytorch_tensor_as_img( mask[0].data.cpu(), 'junk_vis/mask_vae_%d.png' % iter_num) else: recon_mean, recon_log_cov, z_mean, z_log_cov = model(obs_batch) elbo = model.compute_ELBO(z_mean, z_log_cov, recon_mean, recon_log_cov, obs_batch) KL = model.compute_KL(z_mean, z_log_cov) print('\nELBO:\t%.4f' % elbo) print('KL:\t%.4f' % KL) print('MSE:\t%.4f' % ((recon_mean - obs_batch)**2).mean()) print(obs_batch[0][0, :4, :4]) print(recon_mean[0][0, :4, :4]) print(recon_log_cov[0][0, :4, :4]) print(z_mean[0, 1]) print(torch.exp(z_log_cov[0, 1])) save_pytorch_tensor_as_img(recon_mean[0].data.cpu(), 'junk_vis/recon_vae_%d.png' % iter_num) save_pytorch_tensor_as_img(obs_batch[0].data.cpu(), 'junk_vis/obs_vae_%d.png' % iter_num) model.train()
def main(): n_seeds = 1 mode = "here" exp_prefix = "dev-sl" # n_seeds = 10 # mode = "ec2" # exp_prefix = "6-2-sl-rwa-vs-lstm" env_noise_std = 0 memory_noise_std = 0 for rnn_cell_class, H in product( [SeparateRWALinearCell], [512], # [RWACell, LSTMCell, GRUCell], # [512, 256, 128, 64], ): # noinspection PyTypeChecker variant = dict( H=H, exp_prefix=exp_prefix, algo_params=dict( num_batches_per_epoch=10000 // 32, num_epochs=100, learning_rate=1e-3, batch_size=32, eval_num_episodes=64, lstm_state_size=10, rnn_cell_class=rnn_cell_class, rnn_cell_params=dict( # use_peepholes=True, state_is_flat_externally=False, output_dim=1, ), # rnn_cell_class=SeparateLstmLinearCell, # rnn_cell_params=dict( # use_peepholes=True, # env_noise_std=env_noise_std, # memory_noise_std=memory_noise_std, # output_nonlinearity=tf.nn.tanh, # # output_nonlinearity=tf.nn.softmax, # env_hidden_sizes=[], # ), softmax=False, ), version='Supervised Learning', env_class=HighLow, # env_class=OneCharMemory, ) exp_id = -1 for seed in range(n_seeds): exp_id += 1 set_seed(seed) variant['seed'] = seed variant['exp_id'] = exp_id run_experiment( bptt_launcher, exp_prefix=exp_prefix, seed=seed, mode=mode, variant=variant, exp_id=exp_id, )
def experiment(variant, seed): exp_id = "{}_{}".format(VARIANT_NAME, seed) print("\nExperiment: {}\nTask distribution: {}\n".format( exp_id, variant['env_name'])) # Randomization seed for reproducibility set_seed(seed) # create multi-task environment and sample tasks env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])) tasks = env.get_all_task_idx() obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) target_entropy = -action_dim reward_dim = 1 # instantiate networks latent_dim = variant['latent_size'] context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[ 'algo_params'][ 'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder context_encoder = encoder_model( hidden_sizes=[200, 200, 200], input_size=context_encoder_input_dim, output_size=context_encoder_output_dim, ) qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = PEARLAgent(latent_dim, context_encoder, policy, **variant['algo_params']) algorithm = PEARLSoftActorCritic( env=env, train_tasks=list(tasks[:variant['n_train_tasks']]), eval_tasks=list(tasks[-variant['n_eval_tasks']:]), nets=[agent, qf1, qf2], latent_dim=latent_dim, target_entropy=target_entropy, **variant['algo_params']) # optionally load pre-trained weights if variant['path_to_weights'] is not None: path = variant['path_to_weights'] context_encoder.load_state_dict( torch.load(os.path.join(path, 'context_encoder.pth'))) qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth'))) qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth'))) algorithm.networks[-2].load_state_dict( torch.load(os.path.join(path, 'target_qf1.pth'))) algorithm.networks[-1].load_state_dict( torch.load(os.path.join(path, 'target_qf2.pth'))) policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth'))) # optional GPU mode ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id']) if ptu.gpu_enabled(): algorithm.to() # debugging triggers a lot of printing and logs to a debug directory DEBUG = variant['util_params']['debug'] os.environ['DEBUG'] = str(int(DEBUG)) # create logging directory experiment_log_dir = setup_logger( variant['env_name'], tasks={i: env.tasks[i] for i in range(len(env.tasks))}, variant=variant, exp_id=exp_id, base_log_dir=variant['util_params']['base_log_dir']) # optionally save eval trajectories as pkl files if variant['algo_params']['dump_eval_paths']: pickle_dir = experiment_log_dir + '/eval_trajectories' pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True) # run the algorithm algorithm.train()
def experiment(variant): domain = variant['domain'] seed = variant['seed'] exp_mode = variant['exp_mode'] max_path_length = variant['algo_params']['max_path_length'] bcq_interactions = variant['bcq_interactions'] num_tasks = variant['num_tasks'] filename = f'./goals/{domain}-{exp_mode}-goals.pkl' idx_list, train_goals, wd_goals, ood_goals = pickle.load( open(filename, 'rb')) idx_list = idx_list[:num_tasks] sub_buffer_dir = f"buffers/{domain}/{exp_mode}/max_path_length_{max_path_length}/interactions_{bcq_interactions}k/seed_{seed}" buffer_dir = os.path.join(variant['data_models_root'], sub_buffer_dir) print("Buffer directory: " + buffer_dir) # Load buffer bcq_buffers = [] buffer_loader_id_list = [] for i, idx in enumerate(idx_list): bname = f'goal_0{idx}.zip_pkl' if idx < 10 else f'goal_{idx}.zip_pkl' filename = os.path.join(buffer_dir, bname) rp_buffer = ReplayBuffer.remote( index=i, seed=seed, num_trans_context=variant['num_trans_context'], in_mdp_batch_size=variant['in_mdp_batch_size'], ) buffer_loader_id_list.append(rp_buffer.load_from_gzip.remote(filename)) bcq_buffers.append(rp_buffer) ray.get(buffer_loader_id_list) assert len(bcq_buffers) == len(idx_list) train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers, ) set_seed(variant['seed']) # create multi-task environment and sample tasks env = env_producer(variant['domain'], seed=0) obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) reward_dim = 1 # instantiate networks latent_dim = variant['latent_size'] context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[ 'algo_params'][ 'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder context_encoder = encoder_model( hidden_sizes=[200, 200, 200], input_size=context_encoder_input_dim, output_size=context_encoder_output_dim, ) qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + latent_dim, output_size=1, ) policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = PEARLAgent(latent_dim, context_encoder, policy, **variant['algo_params']) algorithm = PEARLSoftActorCritic(env=env, train_goals=train_goals, wd_goals=wd_goals, ood_goals=ood_goals, replay_buffers=train_buffer, nets=[agent, qf1, qf2, vf], latent_dim=latent_dim, **variant['algo_params']) # optionally load pre-trained weights if variant['path_to_weights'] is not None: path = variant['path_to_weights'] context_encoder.load_state_dict( torch.load(os.path.join(path, 'context_encoder.pth'))) qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth'))) qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth'))) vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth'))) # TODO hacky, revisit after model refactor algorithm.networks[-2].load_state_dict( torch.load(os.path.join(path, 'target_vf.pth'))) policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth'))) # optional GPU mode 
ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id']) if ptu.gpu_enabled(): algorithm.to() # debugging triggers a lot of printing and logs to a debug directory DEBUG = variant['util_params']['debug'] os.environ['DEBUG'] = str(int(DEBUG)) # create logging directory # TODO support Docker exp_id = 'debug' if DEBUG else None experiment_log_dir = setup_logger( variant['domain'], variant=variant, exp_id=exp_id, base_log_dir=variant['util_params']['base_log_dir']) # optionally save eval trajectories as pkl files if variant['algo_params']['dump_eval_paths']: pickle_dir = experiment_log_dir + '/eval_trajectories' pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True) # run the algorithm algorithm.train()
def experiment(variant, args): # expl_env = NormalizedBoxEnv(gym.make(str(args.env))) # eval_env = NormalizedBoxEnv(gym.make(str(args.env))) expl_env = NormalizedBoxEnv(Mani2dEnv()) eval_env = NormalizedBoxEnv(Mani2dEnv()) setup_logger('DIAYNMUSIC_' + str(args.skill_dim) + '_' + args.env, variant=variant, snapshot_mode="last") ptu.set_gpu_mode(True) # optionally set the GPU (default=False) set_seed(args.seed) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size skill_dim = args.skill_dim M = variant['layer_size'] qf1 = FlattenMlp( input_size=obs_dim + action_dim + skill_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim + skill_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim + skill_dim, output_size=1, hidden_sizes=[M, M], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim + skill_dim, output_size=1, hidden_sizes=[M, M], ) df = FlattenMlp( input_size=obs_dim, output_size=skill_dim, hidden_sizes=[M, M], ) # smile estimator mi_etimator = ConcatCritic(obs_dim, M, 2, "relu") smile_clip = 1.0 policy = SkillTanhGaussianPolicy(obs_dim=obs_dim + skill_dim, action_dim=action_dim, hidden_sizes=[M, M], skill_dim=skill_dim) eval_policy = MakeDeterministic(policy) eval_path_collector = DIAYNMdpPathCollector( eval_env, eval_policy, ) expl_step_collector = MdpStepCollector( expl_env, policy, ) replay_buffer = DIAYNEnvReplayBuffer( variant['replay_buffer_size'], expl_env, skill_dim, ) trainer = DIAYNMUSICTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, df=df, target_qf1=target_qf1, target_qf2=target_qf2, mi_estimator=mi_etimator, smile_clip=smile_clip, prio_extrio_bound=6, **variant['trainer_kwargs']) algorithm = DIAYNTorchOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_step_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def take_step_in_env_per_thread(pid, queue, env, policy, render, reward_scale,
                                steps, max_path_length, n_env_steps_total):
    set_seed(pid)

    n_rollouts_total = 0
    current_path_builder = PathBuilder()
    exploration_paths = []
    replay_samples = {
        'observations': [],
        'actions': [],
        'rewards': [],
        'next_observations': [],
        'terminals': [],
        'agent_infos': [],
        'env_infos': [],
    }

    policy.reset()
    observation = env.reset()
    policy.set_num_steps_total(n_env_steps_total)

    for _ in range(steps):
        action, agent_info = policy.get_action(observation)
        if pid == 0 and render:
            env.render()
        next_ob, raw_reward, terminal, env_info = env.step(action)
        reward = np.array([raw_reward * reward_scale])
        terminal = np.array([terminal])

        replay_samples['observations'].append(observation)
        replay_samples['actions'].append(action)
        replay_samples['rewards'].append(reward)
        replay_samples['next_observations'].append(next_ob)
        replay_samples['terminals'].append(terminal)
        replay_samples['agent_infos'].append(agent_info)
        replay_samples['env_infos'].append(env_info)

        current_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )

        if terminal or len(current_path_builder) >= max_path_length:
            # cannot let replay buffer terminate episode
            n_rollouts_total += 1
            if len(current_path_builder) > 0:
                exploration_paths.append(
                    current_path_builder.get_all_stacked())
            current_path_builder = PathBuilder()
            policy.reset()
            observation = env.reset()
        else:
            observation = next_ob

    if queue is None:
        return exploration_paths, replay_samples, n_rollouts_total
    else:
        queue.put([pid, exploration_paths, replay_samples, n_rollouts_total])
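# A sketch of how `take_step_in_env_per_thread` could be fanned out across worker
# processes with a shared queue, matching its `queue.put([pid, paths, samples,
# n_rollouts])` protocol. The multiprocessing wiring here is an illustration under
# the assumption that env and policy are picklable; it is not part of the original
# collection code.
def collect_steps_in_parallel(env, policy, num_workers, steps_per_worker,
                              max_path_length, reward_scale=1.0,
                              n_env_steps_total=0):
    import copy
    import multiprocessing as mp

    queue = mp.Queue()
    workers = []
    for pid in range(num_workers):
        p = mp.Process(
            target=take_step_in_env_per_thread,
            args=(pid, queue, copy.deepcopy(env), copy.deepcopy(policy),
                  False, reward_scale, steps_per_worker, max_path_length,
                  n_env_steps_total),
        )
        p.start()
        workers.append(p)

    results = [queue.get() for _ in range(num_workers)]  # one message per worker
    for p in workers:
        p.join()
    # each entry is [pid, exploration_paths, replay_samples, n_rollouts_total]
    return sorted(results, key=lambda r: r[0])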
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- env_specs = { 'flat_repr': False, 'one_hot_repr': False, 'maze_h': 9, 'maze_w': 9, 'obs_h': 5, 'obs_w': 5, 'scale': 4, 'num_objs': 10 } maze_constructor = lambda: PartiallyObservedGrid(env_specs) data_loader = VerySpecificOnTheFLyDataLoader(maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) val_data_loader = VerySpecificOnTheFLyDataLoader( maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- conv_channels = 32 conv_encoder = nn.Sequential( nn.Conv2d(3, conv_channels, 4, stride=2, padding=1, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), nn.Conv2d(conv_channels, conv_channels, 4, stride=2, padding=1, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU()) ae_dim = 256 gru_dim = 512 img_h = 5 flat_inter_img_dim = img_h * img_h * conv_channels act_dim = 64 act_proc = nn.Linear(4, act_dim, bias=True) fc_encoder = nn.Sequential( nn.Linear(flat_inter_img_dim + act_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), # nn.Linear(ae_dim, ae_dim, bias=False), # nn.BatchNorm1d(ae_dim), # nn.ReLU(), # nn.Linear(ae_dim, ae_dim, bias=False), # nn.BatchNorm1d(ae_dim), # nn.ReLU(), # nn.Linear(ae_dim, ae_dim, bias=False), # nn.BatchNorm1d(ae_dim), # nn.ReLU() ) gru = nn.LSTMCell(ae_dim, gru_dim, bias=True) fc_decoder = nn.Sequential( nn.Linear(gru_dim + act_dim, 256, bias=False), nn.BatchNorm1d(256), nn.ReLU(), nn.Linear(256, 2 * flat_inter_img_dim, bias=False), nn.BatchNorm1d(2 * flat_inter_img_dim), nn.ReLU(), # # nn.Linear(ae_dim, ae_dim, bias=False), # # nn.BatchNorm1d(ae_dim), # # nn.ReLU(), # # nn.Linear(ae_dim, ae_dim, bias=False), # # nn.BatchNorm1d(ae_dim), # # nn.ReLU(), # nn.Linear(ae_dim, flat_inter_img_dim, bias=False), # nn.BatchNorm1d(flat_inter_img_dim), # nn.ReLU(), ) conv_decoder = nn.Sequential( nn.Conv2d(64, 64, 3, stride=1, padding=1, bias=False), nn.BatchNorm2d(64), nn.ReLU(), nn.ConvTranspose2d(64, 64, 4, stride=2, padding=1, output_padding=0, bias=False), nn.BatchNorm2d(64), nn.ReLU(), nn.ConvTranspose2d(64, 64, 4, stride=2, padding=1, output_padding=0, bias=False), nn.BatchNorm2d(64), nn.ReLU(), # nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False), # nn.BatchNorm2d(conv_channels), # nn.ReLU(), ) mean_decoder = nn.Sequential( nn.Conv2d(64, 3, 1, stride=1, padding=0, bias=True), nn.Sigmoid()) log_cov_decoder = nn.Sequential( nn.Conv2d(64, 3, 1, stride=1, padding=0, bias=True), ) if ptu.gpu_enabled(): conv_encoder.cuda() fc_encoder.cuda() gru.cuda() fc_decoder.cuda() conv_decoder.cuda() mean_decoder.cuda() log_cov_decoder.cuda() act_proc.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam([ item for sublist in map(lambda x: list(x.parameters()), [ fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder, 
mean_decoder, log_cov_decoder ]) for item in sublist ], lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- freq_bptt = exp_specs['freq_bptt'] episode_length = exp_specs['episode_length'] losses = [] for iter_num in range(int(float(exp_specs['max_iters']))): if iter_num % freq_bptt == 0: if iter_num > 0: # loss = loss / freq_bptt loss.backward() model_optim.step() prev_h_batch = prev_h_batch.detach() prev_c_batch = prev_c_batch.detach() loss = 0 if iter_num % episode_length == 0: prev_h_batch = Variable( torch.zeros(exp_specs['batch_size'], gru_dim)) prev_c_batch = Variable( torch.zeros(exp_specs['batch_size'], gru_dim)) if ptu.gpu_enabled(): prev_h_batch = prev_h_batch.cuda() prev_c_batch = prev_c_batch.cuda() train_loss_print = '\t'.join(losses) losses = [] obs_batch, act_batch = data_loader.get_next_batch() act_batch = act_proc(act_batch) hidden = fc_decoder(torch.cat([prev_h_batch, act_batch], 1)).view(obs_batch.size(0), 64, img_h, img_h) hidden = conv_decoder(hidden) recon = mean_decoder(hidden) log_cov = log_cov_decoder(hidden) log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX) enc = conv_encoder(obs_batch) enc = enc.view(obs_batch.size(0), -1) enc = fc_encoder(torch.cat([enc, act_batch], 1)) prev_h_batch, prev_c_batch = gru(enc, (prev_h_batch, prev_c_batch)) losses.append('%.4f' % ((obs_batch - recon)**2).mean()) if iter_num % episode_length != 0: loss = loss + ( (obs_batch - recon)**2).sum() / float(exp_specs['batch_size']) # loss = loss - compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size']) if iter_num % (500 * episode_length) in range(2 * episode_length): save_pytorch_tensor_as_img( recon[0].data.cpu(), 'junk_vis/debug_2_good_acts_on_the_fly_pogrid_len_8_scale_4/rnn_recon_%d.png' % iter_num) save_pytorch_tensor_as_img( obs_batch[0].data.cpu(), 'junk_vis/debug_2_good_acts_on_the_fly_pogrid_len_8_scale_4/rnn_obs_%d.png' % iter_num) if iter_num % exp_specs['freq_val'] == 0: print('\nValidating Iter %d...' % iter_num) list( map(lambda x: x.eval(), [ fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder, mean_decoder, log_cov_decoder, act_proc ])) val_prev_h_batch = Variable( torch.zeros(exp_specs['batch_size'], gru_dim)) val_prev_c_batch = Variable( torch.zeros(exp_specs['batch_size'], gru_dim)) if ptu.gpu_enabled(): val_prev_h_batch = val_prev_h_batch.cuda() val_prev_c_batch = val_prev_c_batch.cuda() losses = [] for i in range(episode_length): obs_batch, act_batch = val_data_loader.get_next_batch() act_batch = act_proc(act_batch) hidden = fc_decoder(torch.cat([val_prev_h_batch, act_batch], 1)).view(obs_batch.size(0), 64, img_h, img_h) hidden = conv_decoder(hidden) recon = mean_decoder(hidden) log_cov = log_cov_decoder(hidden) log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX) enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1) enc = fc_encoder(torch.cat([enc, act_batch], 1)) val_prev_h_batch, val_prev_c_batch = gru( enc, (val_prev_h_batch, val_prev_c_batch)) # val_loss = compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size']) losses.append('%.4f' % ((obs_batch - recon)**2).mean()) loss_print = '\t'.join(losses) print('Val MSE:\t' + loss_print) print('Train MSE:\t' + train_loss_print) list( map(lambda x: x.train(), [ fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder, mean_decoder, log_cov_decoder, act_proc ]))
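# The commented-out alternative loss above calls `compute_diag_log_prob`, which is
# not defined in this file. A plausible implementation, assuming it returns the
# total log-likelihood of the batch under a diagonal Gaussian with mean `recon`
# and log-variance `log_cov` (constants included), is sketched below; the repo's
# actual helper may differ.
import math

import torch


def compute_diag_log_prob(mean, log_cov, target):
    # Elementwise Gaussian log-density, summed over pixels and batch entries.
    log_prob = -0.5 * (
        log_cov
        + (target - mean) ** 2 / torch.exp(log_cov)
        + math.log(2 * math.pi)
    )
    return log_prob.sum()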
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- replay_dict = joblib.load(exp_specs['replay_dict_path']) next_obs_array = replay_dict['next_observations'] acts_array = replay_dict['actions'] data_loader = BasicDataLoader( next_obs_array[:40000], acts_array[:40000], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) val_data_loader = BasicDataLoader( next_obs_array[40000:], acts_array[40000:], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- conv_channels = 64 conv_encoder = nn.Sequential( nn.Conv2d(3, conv_channels, 1, stride=1, padding=0, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), nn.Conv2d(conv_channels, conv_channels, 1, stride=1, padding=0, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU() ) ae_dim = 128 gru_dim = 512 img_h = 5 flat_inter_img_dim = img_h * img_h * conv_channels fc_encoder = nn.Sequential( nn.Linear(flat_inter_img_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU() ) gru = nn.GRUCell( ae_dim, gru_dim, bias=True ) fc_decoder = nn.Sequential( nn.Linear(gru_dim + 4, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, ae_dim, bias=False), nn.BatchNorm1d(ae_dim), nn.ReLU(), nn.Linear(ae_dim, flat_inter_img_dim, bias=False), nn.BatchNorm1d(flat_inter_img_dim), nn.ReLU(), ) conv_decoder = nn.Sequential( nn.ConvTranspose2d(conv_channels, conv_channels, 1, stride=1, padding=0, output_padding=0, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), nn.ConvTranspose2d(conv_channels, conv_channels, 1, stride=1, padding=0, output_padding=0, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True), nn.Sigmoid() ) if ptu.gpu_enabled(): conv_encoder.cuda() fc_encoder.cuda() gru.cuda() fc_decoder.cuda() conv_decoder.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam( [ item for sublist in map( lambda x: list(x.parameters()), [fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder] ) for item in sublist ], lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd']) ) # ------------------------------------------------------------------------- freq_bptt = exp_specs['freq_bptt'] episode_length = exp_specs['episode_length'] losses = [] for iter_num in range(int(float(exp_specs['max_iters']))): if iter_num % freq_bptt == 0: if iter_num > 0: # loss = loss / freq_bptt loss.backward() model_optim.step() prev_h_batch = prev_h_batch.detach() loss = 0 if iter_num % episode_length == 0: prev_h_batch = Variable(torch.zeros(exp_specs['batch_size'], gru_dim)) if ptu.gpu_enabled(): prev_h_batch = prev_h_batch.cuda() if iter_num % exp_specs['freq_val'] == 0: train_loss_print = '\t'.join(losses) 
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()

        recon = fc_decoder(torch.cat([prev_h_batch, act_batch], 1)).view(
            obs_batch.size(0), conv_channels, img_h, img_h)
        recon = conv_decoder(recon)

        enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)
        enc = fc_encoder(enc)
        prev_h_batch = gru(enc, prev_h_batch)

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        if iter_num % episode_length != 0:
            loss = loss + ((obs_batch - recon)**2).sum() / float(exp_specs['batch_size'])

        if iter_num % (50 * episode_length) in range(2 * episode_length):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/fixed_colors_simple_maze_5_h/rnn_recon_%d.png' % iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/fixed_colors_simple_maze_5_h/rnn_obs_%d.png' % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            list(map(lambda x: x.eval(),
                     [fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder]))

            val_prev_h_batch = Variable(torch.zeros(exp_specs['batch_size'], gru_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()

            losses = []
            for i in range(episode_length):
                # validation must draw from the held-out loader, not the training one
                obs_batch, act_batch = val_data_loader.get_next_batch()

                recon = fc_decoder(torch.cat([val_prev_h_batch, act_batch], 1)).view(
                    obs_batch.size(0), conv_channels, img_h, img_h)
                recon = conv_decoder(recon)

                enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)
                enc = fc_encoder(enc)
                val_prev_h_batch = gru(enc, val_prev_h_batch)

                losses.append('%.4f' % ((obs_batch - recon)**2).mean())

            loss_print = '\t'.join(losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)

            list(map(lambda x: x.train(),
                     [fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder]))
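# The script above expects `exp_specs['replay_dict_path']` to point at a joblib
# file holding per-timestep image observations and actions. A hypothetical dump
# that would satisfy the two keys it reads might be produced as below; the shapes
# (5x5 RGB frames, 4-dimensional actions) are assumptions inferred from the
# encoder/decoder dimensions, not the original data pipeline.
import joblib
import numpy as np


def save_example_replay_dict(path, num_steps=50000, img_hw=5, act_dim=4):
    replay_dict = {
        # (N, C, H, W) float images in [0, 1], matching the 3-channel conv encoder
        'next_observations': np.random.rand(num_steps, 3, img_hw, img_hw).astype(np.float32),
        # 4-dim (e.g. one-hot) actions, matching the `gru_dim + 4` decoder input
        'actions': np.eye(act_dim)[np.random.randint(act_dim, size=num_steps)].astype(np.float32),
    }
    joblib.dump(replay_dict, path)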
def experiment(exp_specs): ptu.set_gpu_mode(exp_specs['use_gpu']) # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Prep the data ----------------------------------------------------------- replay_dict = joblib.load(exp_specs['replay_dict_path']) next_obs_array = replay_dict['next_observations'] acts_array = replay_dict['actions'] data_loader = BasicDataLoader(next_obs_array[:40000], acts_array[:40000], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) val_data_loader = BasicDataLoader(next_obs_array[40000:], acts_array[40000:], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled()) # Model Definition -------------------------------------------------------- conv_channels = 32 conv_encoder = nn.Sequential( nn.Conv2d(3, conv_channels, 4, stride=2, padding=1, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), nn.Conv2d(conv_channels, conv_channels, 4, stride=2, padding=1, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU()) gru_channels = 128 inter_h = 5 act_channels = 4 act_proc = nn.Linear(4, act_channels * inter_h * inter_h, bias=True) pre_gru_conv = nn.Sequential( nn.Conv2d(act_channels + conv_channels, conv_channels, 3, stride=1, padding=1, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), ) gru = ConvGRUCell(conv_channels, gru_channels, 3) post_gru_conv = nn.Sequential( nn.Conv2d(act_channels + gru_channels, conv_channels, 3, stride=1, padding=1, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), ) conv_decoder = nn.Sequential( nn.ConvTranspose2d(conv_channels, conv_channels, 4, stride=2, padding=1, output_padding=0, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), # nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False), # nn.BatchNorm2d(conv_channels), # nn.ReLU(), nn.ConvTranspose2d(conv_channels, conv_channels, 4, stride=2, padding=1, output_padding=0, bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(), # nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False), # nn.BatchNorm2d(conv_channels), # nn.ReLU(), ) mean_decoder = nn.Sequential( nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True), nn.Sigmoid()) log_cov_decoder = nn.Sequential( nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True), ) if ptu.gpu_enabled(): conv_encoder.cuda() pre_gru_conv.cuda() gru.cuda() post_gru_conv.cuda() conv_decoder.cuda() mean_decoder.cuda() log_cov_decoder.cuda() act_proc.cuda() # Optimizer --------------------------------------------------------------- model_optim = Adam([ item for sublist in map(lambda x: list(x.parameters()), [ conv_encoder, pre_gru_conv, gru, post_gru_conv, conv_decoder, mean_decoder, log_cov_decoder ]) for item in sublist ], lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])) # ------------------------------------------------------------------------- freq_bptt = exp_specs['freq_bptt'] episode_length = exp_specs['episode_length'] losses = [] for iter_num in range(int(float(exp_specs['max_iters']))): if iter_num % freq_bptt == 0: if iter_num > 0: # loss = loss / freq_bptt loss.backward() model_optim.step() prev_h_batch = prev_h_batch.detach() loss = 0 if iter_num % episode_length == 0: prev_h_batch = Variable( torch.zeros(exp_specs['batch_size'], gru_channels, inter_h, inter_h)) if ptu.gpu_enabled(): prev_h_batch = 
prev_h_batch.cuda() train_loss_print = '\t'.join(losses) losses = [] obs_batch, act_batch = data_loader.get_next_batch() act_batch = act_proc(act_batch).view(act_batch.size(0), act_channels, inter_h, inter_h) hidden = post_gru_conv(torch.cat([prev_h_batch, act_batch], 1)) hidden = conv_decoder(hidden) recon = mean_decoder(hidden) log_cov = log_cov_decoder(hidden) log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX) enc = conv_encoder(obs_batch) enc = pre_gru_conv(torch.cat([enc, act_batch], 1)) prev_h_batch = gru(enc, prev_h_batch) losses.append('%.4f' % ((obs_batch - recon)**2).mean()) if iter_num % episode_length != 0: loss = loss + ( (obs_batch - recon)**2).sum() / float(exp_specs['batch_size']) # loss = loss + compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size']) if iter_num % (500 * episode_length) in range(2 * episode_length): save_pytorch_tensor_as_img( recon[0].data.cpu(), 'junk_vis/conv_gru_pogrid_len_8_scale_4/rnn_recon_%d.png' % iter_num) save_pytorch_tensor_as_img( obs_batch[0].data.cpu(), 'junk_vis/conv_gru_pogrid_len_8_scale_4/rnn_obs_%d.png' % iter_num) if iter_num % exp_specs['freq_val'] == 0: print('\nValidating Iter %d...' % iter_num) list( map(lambda x: x.eval(), [ conv_encoder, pre_gru_conv, gru, post_gru_conv, conv_decoder, mean_decoder, log_cov_decoder, act_proc ])) val_prev_h_batch = Variable( torch.zeros(exp_specs['batch_size'], gru_channels, inter_h, inter_h)) if ptu.gpu_enabled(): val_prev_h_batch = val_prev_h_batch.cuda() losses = [] for i in range(episode_length): obs_batch, act_batch = val_data_loader.get_next_batch() act_batch = act_proc(act_batch).view(act_batch.size(0), act_channels, inter_h, inter_h) hidden = post_gru_conv( torch.cat([val_prev_h_batch, act_batch], 1)) hidden = conv_decoder(hidden) recon = mean_decoder(hidden) log_cov = log_cov_decoder(hidden) log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX) enc = conv_encoder(obs_batch) enc = pre_gru_conv(torch.cat([enc, act_batch], 1)) val_prev_h_batch = gru(enc, val_prev_h_batch) # val_loss = compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size']) losses.append('%.4f' % ((obs_batch - recon)**2).mean()) loss_print = '\t'.join(losses) print('Val MSE:\t' + loss_print) print('Train MSE:\t' + train_loss_print) list( map(lambda x: x.train(), [ conv_encoder, pre_gru_conv, gru, post_gru_conv, conv_decoder, mean_decoder, log_cov_decoder, act_proc ]))
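# `ConvGRUCell` is imported from elsewhere in the repo; its definition is not shown
# here. A minimal convolutional GRU cell matching the call signature used above,
# `ConvGRUCell(input_channels, hidden_channels, kernel_size)` with
# `h = gru(x, h)`, could be sketched as follows; this is an assumption about the
# interface, not the repo's implementation.
import torch
import torch.nn as nn


class ConvGRUCell(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size):
        super().__init__()
        padding = kernel_size // 2  # keep spatial resolution unchanged
        self.hidden_channels = hidden_channels
        # reset and update gates computed jointly from [input, hidden]
        self.gates = nn.Conv2d(input_channels + hidden_channels,
                               2 * hidden_channels, kernel_size, padding=padding)
        # candidate hidden state
        self.candidate = nn.Conv2d(input_channels + hidden_channels,
                                   hidden_channels, kernel_size, padding=padding)

    def forward(self, x, h):
        gates = torch.sigmoid(self.gates(torch.cat([x, h], dim=1)))
        reset, update = torch.chunk(gates, 2, dim=1)
        cand = torch.tanh(self.candidate(torch.cat([x, reset * h], dim=1)))
        return (1 - update) * h + update * cand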
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--experiment',
                        help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        exp_specs = yaml.load(spec_string)

    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    experiment(exp_specs)
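# The entry point above loads its configuration from a YAML file passed via
# `-e/--experiment`. A minimal spec containing only the keys read in that
# `__main__` block might look like the string below; the placeholder values are
# assumptions, and real spec files also carry the algorithm-specific keys
# (e.g. `algo_params`) that the experiment function consumes.
EXAMPLE_EXP_SPEC_YAML = """
exp_id: 0
exp_name: dev-meta-rl
seed: 9783
"""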
def experiment(exp_specs): # Set up logging ---------------------------------------------------------- exp_id = exp_specs['exp_id'] exp_prefix = exp_specs['exp_name'] seed = exp_specs['seed'] set_seed(seed) setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs) # Load the data ----------------------------------------------------------- extra_data_path = exp_specs['extra_data_path'] train_replay_buffer = joblib.load(extra_data_path)['replay_buffer'] train_replay_buffer.change_max_size_to_cur_size() train_replay_buffer._next_obs = train_replay_buffer._next_obs[:,exp_specs['extra_obs_dim']:] if exp_specs['remove_env_info']: train_replay_buffer._observations = train_replay_buffer._observations[:,exp_specs['extra_obs_dim']:] else: if exp_specs['normalize_env_info']: low, high = exp_specs['env_info_range'][0], exp_specs['env_info_range'][1] train_replay_buffer._observations[:,:exp_specs['extra_obs_dim']] -= (low + high)/2.0 train_replay_buffer._observations[:,:exp_specs['extra_obs_dim']] /= (high - low)/2.0 print('\nRewards: {} +/- {}'.format( np.mean(train_replay_buffer._rewards), np.std(train_replay_buffer._rewards) )) next_obs_mean = np.mean(train_replay_buffer._next_obs, 0) next_obs_std = np.std(train_replay_buffer._next_obs, 0) print('\nNext Obs:\n{}\n+/-\n{}'.format( next_obs_mean, next_obs_std )) print('\nAvg Next Obs Square Norm: {}'.format( np.mean(np.linalg.norm(train_replay_buffer._next_obs, axis=1)**2) )) sample_batch = train_replay_buffer.random_batch(exp_specs['train_batch_size']) obs_dim = sample_batch['observations'].shape[-1] act_dim = sample_batch['actions'].shape[-1] val_replay_buffer = SimpleReplayBuffer(exp_specs['val_set_size'], obs_dim, act_dim) val_replay_buffer.set_buffer_from_dict( train_replay_buffer.sample_and_remove(exp_specs['val_set_size']) ) if exp_specs['train_from_beginning_transitions']: trans_dict = dict( observations=train_replay_buffer._observations[:exp_specs['train_set_size']], actions=train_replay_buffer._actions[:exp_specs['train_set_size']], rewards=train_replay_buffer._rewards[:exp_specs['train_set_size']], terminals=train_replay_buffer._terminals[:exp_specs['train_set_size']], next_observations=train_replay_buffer._next_obs[:exp_specs['train_set_size']], ) train_replay_buffer.set_buffer_from_dict(trans_dict) else: train_replay_buffer.set_buffer_from_dict( train_replay_buffer.sample_and_remove(exp_specs['train_set_size']) ) # Model Definitions ------------------------------------------------------- if exp_specs['remove_env_info']: output_dim = [obs_dim + 1] else: output_dim = [obs_dim - exp_specs['extra_obs_dim'] + 1] model = GenericMap( [obs_dim + act_dim], output_dim, siamese_input=False, siamese_output=False, num_hidden_layers=exp_specs['num_hidden_layers'], hidden_dim=exp_specs['hidden_dim'], act='relu', use_bn=True, deterministic=True ) model_optim = Adam(model.parameters(), lr=float(exp_specs['lr'])) # Train ------------------------------------------------------------------- model.train() for iter_num in range(exp_specs['max_iters']): model_optim.zero_grad() batch = train_replay_buffer.random_batch(exp_specs['train_batch_size']) batch = convert_numpy_dict_to_pytorch(batch) inputs = Variable(torch.cat([batch['observations'], batch['actions']], -1)) outputs = Variable(torch.cat([batch['next_observations'], batch['rewards']], -1)) preds = model([inputs])[0] if exp_specs['residual']: # residual for observations preds = preds + Variable( torch.cat( [ batch['observations'][:,exp_specs['extra_obs_dim']:], 
                    torch.zeros(exp_specs['train_batch_size'], 1)
                ], 1)
            )

        loss = torch.mean(torch.sum((outputs - preds)**2, -1))
        loss.backward()
        model_optim.step()

        if iter_num % exp_specs['freq_val'] == 0:
            model.eval()

            val_batch = val_replay_buffer.random_batch(exp_specs['val_batch_size'])
            val_batch = convert_numpy_dict_to_pytorch(val_batch)
            inputs = Variable(torch.cat([val_batch['observations'], val_batch['actions']], -1))
            outputs = Variable(torch.cat([val_batch['next_observations'], val_batch['rewards']], -1))
            # print(exp_specs['remove_env_info'])
            # print(inputs)
            # print(outputs)
            # sleep(5)
            preds = model([inputs])[0]
            if exp_specs['residual']:
                # residual for observations; the zero reward column must match the
                # validation batch size, not the training batch size
                preds = preds + Variable(
                    torch.cat(
                        [
                            val_batch['observations'][:, exp_specs['extra_obs_dim']:],
                            torch.zeros(exp_specs['val_batch_size'], 1)
                        ], 1)
                )

            loss = torch.mean(torch.sum((outputs - preds)**2, -1))
            next_obs_loss = torch.mean(torch.sum((outputs[:, :-1] - preds[:, :-1])**2, -1))
            rew_loss = torch.mean(torch.sum((outputs[:, -1:] - preds[:, -1:])**2, -1))

            print('\n')
            print('-' * 20)
            logger.record_tabular('Iter', iter_num)
            logger.record_tabular('Loss', loss.data[0])
            logger.record_tabular('Obs Loss', next_obs_loss.data[0])
            logger.record_tabular('Rew Loss', rew_loss.data[0])
            logger.dump_tabular(with_prefix=False, with_timestamp=False)

            model.train()
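# `convert_numpy_dict_to_pytorch` is used by the training loop above but defined
# elsewhere in the repo. A minimal sketch consistent with how its output is used
# (each value is later concatenated with `torch.cat` and wrapped in `Variable`)
# is given below; the actual helper may differ.
import numpy as np
import torch


def convert_numpy_dict_to_pytorch(np_dict):
    # Convert every numpy array in the batch dict to a float tensor.
    return {k: torch.from_numpy(np.asarray(v)).float() for k, v in np_dict.items()}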