def run_task(vv):
    """Build and train the trajectory VAE, policy decoder, and explorer on the Swimmer goal-reaching task."""
    set_gpu_mode(vv['gpu'])
    env_name = None
    goals = np.array(vv['goals'])
    env = lambda: SwimmerEnv(vv['frame_skip'], goals=goals, include_rstate=False)
    obs_dim = int(env().observation_space.shape[0])
    action_dim = int(env().action_space.shape[0])
    vv['block_config'] = [env().reset().tolist(), vv['goals']]
    print(vv['block_config'])

    path_len = vv['path_len']
    data_path = vv['initial_data_path']
    use_actions = vv['use_actions']

    # Create a dummy dataset since we initialize with no data.
    dummy = np.zeros((1, path_len + 1, obs_dim + action_dim))
    train_data, test_data = dummy, dummy
    train_dataset = WheeledContDataset(
        data_path=data_path, raw_data=train_data,
        obs_dim=obs_dim, action_dim=action_dim, path_len=path_len,
        env_id='Playpen', normalize=False, use_actions=use_actions,
        batch_size=vv['batch_size'], buffer_size=vv['buffer_size'],
        pltidx=[-2, -1])
    # Validation set for VAE training.
    test_dataset = WheeledContDataset(
        data_path=data_path, raw_data=train_data,
        obs_dim=obs_dim, action_dim=action_dim, path_len=path_len,
        env_id='Playpen', normalize=False, use_actions=use_actions,
        batch_size=vv['batch_size'] // 9, buffer_size=vv['buffer_size'] // 9,
        pltidx=[-2, -1])
    # Holds the data from the latest iteration for joint training.
    dummy_dataset = WheeledContDataset(
        data_path=data_path, raw_data=train_data,
        obs_dim=obs_dim, action_dim=action_dim, path_len=path_len,
        env_id='Playpen', normalize=False, use_actions=use_actions,
        batch_size=vv['batch_size'], buffer_size=vv['buffer_size'],
        pltidx=[-2, -1])
    train_dataset.clear()
    test_dataset.clear()
    dummy_dataset.clear()

    latent_dim = vv['latent_dim']
    policy_rnn_hidden_dim = vv['policy_rnn_hidden_dim']
    rnn_hidden_dim = vv['decoder_rnn_hidden_dim']
    step_dim = obs_dim
    rnn_hidden_dim = 256  # hard-coded override of vv['decoder_rnn_hidden_dim']

    # Build encoder.
    if vv['encoder_type'] == 'mlp':
        encoder = GaussianNetwork(
            mean_network=MLP((path_len + 1) * step_dim, latent_dim,
                             hidden_sizes=vv['encoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=MLP((path_len + 1) * step_dim, latent_dim))
    elif vv['encoder_type'] == 'lstm':
        encoder = GaussianBidirectionalNetwork(
            input_dim=step_dim,
            hidden_dim=rnn_hidden_dim,
            num_layers=2,
            mean_network=MLP(2 * rnn_hidden_dim, latent_dim),
            log_var_network=MLP(2 * rnn_hidden_dim, latent_dim))

    # Build state decoder.
    if vv['decoder_var_type'] == 'param':
        decoder_log_var_network = Parameter(latent_dim, step_dim, init=np.log(0.1))
    else:
        decoder_log_var_network = MLP(rnn_hidden_dim, step_dim)

    if vv['decoder_type'] == 'grnn':
        decoder = GaussianRecurrentNetwork(
            recurrent_network=RNN(
                nn.LSTM(step_dim + latent_dim, rnn_hidden_dim), rnn_hidden_dim),
            mean_network=MLP(rnn_hidden_dim, step_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=decoder_log_var_network,
            path_len=path_len,
            output_dim=step_dim,
            min_var=1e-4,
        )
    elif vv['decoder_type'] == 'gmlp':
        decoder = GaussianNetwork(
            mean_network=MLP(latent_dim, path_len * step_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=Parameter(latent_dim, path_len * step_dim,
                                      init=np.log(0.1)),
            min_var=1e-4)
    elif vv['decoder_type'] == 'mixedrnn':
        gauss_output_dim = 10
        cat_output_dim = 5
        decoder = MixedRecurrentNetwork(
            recurrent_network=RNN(
                nn.LSTM(step_dim + latent_dim, rnn_hidden_dim), rnn_hidden_dim),
            mean_network=MLP(rnn_hidden_dim, gauss_output_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            prob_network=MLP(rnn_hidden_dim, cat_output_dim, final_act=nn.Softmax),
            log_var_network=Parameter(latent_dim, gauss_output_dim, init=np.log(0.1)),
            path_len=path_len,
            output_dim=step_dim,
            min_var=1e-4,
            gaussian_output_dim=gauss_output_dim,
            cat_output_dim=cat_output_dim)

    # Policy decoder (the MLP variants also build the explorer policy_ex).
    if vv['policy_type'] == 'grnn':
        policy = GaussianRecurrentPolicy(
            recurrent_network=RNN(
                nn.LSTM(obs_dim + latent_dim, policy_rnn_hidden_dim),
                policy_rnn_hidden_dim),
            mean_network=MLP(policy_rnn_hidden_dim, action_dim, hidden_act=nn.ReLU),
            log_var_network=Parameter(obs_dim + latent_dim, action_dim,
                                      init=np.log(1)),
            path_len=path_len,
            output_dim=action_dim)
    elif vv['policy_type'] == 'gmlp':
        policy = GaussianNetwork(
            mean_network=MLP(obs_dim + latent_dim, action_dim,
                             hidden_sizes=vv['policy_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=Parameter(obs_dim + latent_dim, action_dim,
                                      init=np.log(1)))
        policy_ex = GaussianNetwork(
            mean_network=MLP(obs_dim, action_dim,
                             hidden_sizes=vv['policy_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=Parameter(obs_dim, action_dim, init=np.log(1)))
    elif vv['policy_type'] == 'crnn':
        policy = RecurrentCategoricalPolicy(
            recurrent_network=RNN(
                nn.LSTM(obs_dim + latent_dim, policy_rnn_hidden_dim),
                policy_rnn_hidden_dim),
            prob_network=MLP(policy_rnn_hidden_dim, action_dim,
                             hidden_sizes=vv['policy_hidden_sizes'],
                             final_act=nn.Softmax),
            path_len=path_len,
            output_dim=action_dim)
    elif vv['policy_type'] == 'cmlp':
        policy = CategoricalNetwork(
            prob_network=MLP(obs_dim + latent_dim, action_dim,
                             hidden_sizes=(400, 300, 200),
                             hidden_act=nn.ReLU, final_act=nn.Softmax),
            output_dim=action_dim)
        policy_ex = CategoricalNetwork(
            prob_network=MLP(obs_dim, action_dim,
                             hidden_sizes=(400, 300, 200),
                             hidden_act=nn.ReLU, final_act=nn.Softmax),
            output_dim=action_dim)
    elif vv['policy_type'] == 'lstm':
        policy = LSTMPolicy(input_dim=obs_dim + latent_dim,
                            hidden_dim=rnn_hidden_dim,
                            num_layers=2,
                            output_dim=action_dim)

    # VAE with behavioral cloning.
    vae = TrajVAEBC(encoder=encoder, decoder=decoder, latent_dim=latent_dim,
                    step_dim=step_dim, feature_dim=train_dataset.obs_dim, env=env,
                    path_len=train_dataset.path_len,
                    init_kl_weight=vv['kl_weight'], max_kl_weight=vv['kl_weight'],
                    kl_mul=1.03, loss_type=vv['vae_loss_type'], lr=vv['vae_lr'],
                    obs_dim=obs_dim, act_dim=action_dim, policy=policy,
                    bc_weight=vv['bc_weight'])

    # Zero baseline due to constantly changing rewards.
    baseline = ZeroBaseline()
    # Policy optimization for the policy decoder.
    policy_algo = PPO(env, env_name, policy, baseline=baseline,
                      obs_dim=obs_dim, action_dim=action_dim,
                      max_path_length=path_len, center_adv=True,
                      optimizer=optim.Adam(policy.get_params(), vv['policy_lr'], eps=1e-5),
                      use_gae=vv['use_gae'], epoch=10, ppo_batch_size=200)

    # Baseline and policy optimization for the explorer.
    baseline_ex = ZeroBaseline()
    policy_ex_algo = PPO(env, env_name, policy_ex, baseline=baseline_ex,
                         obs_dim=obs_dim, action_dim=action_dim,
                         max_path_length=path_len, center_adv=True,
                         optimizer=optim.Adam(policy_ex.get_params(), vv['policy_lr'], eps=1e-5),
                         use_gae=vv['use_gae'], epoch=10, ppo_batch_size=200,
                         entropy_bonus=vv['entropy_bonus'])

    # For loading the models from a saved state.
    if vv['load_models_dir'] is not None:
        dir = getcwd() + "/research/lang/traj2vecv3_jd/" + vv['load_models_dir']
        itr = vv['load_models_idx']
        encoder.load_state_dict(torch.load(dir + '/encoder_%d.pkl' % itr))
        decoder.load_state_dict(torch.load(dir + '/decoder_%d.pkl' % itr))
        policy.load_state_dict(torch.load(dir + '/policy_%d.pkl' % itr))
        policy_ex.load_state_dict(torch.load(dir + '/policy_ex_%d.pkl' % itr))
        vae.optimizer.load_state_dict(
            torch.load(dir + '/vae_optimizer_%d.pkl' % itr))
        policy_algo.optimizer.load_state_dict(
            torch.load(dir + '/policy_optimizer_%d.pkl' % itr))

    # Reward function for MPC.
    rf = lambda obs, rstate: reward_fn(obs, rstate, goals, 3)

    mpc_explore = 4000
    if vv['path_len'] <= 50:
        mpc_explore *= 2

    # Main algorithm launcher, includes the MPC controller and exploration.
    vaepd = VAEPDEntropy(env, env_name, policy, policy_ex, encoder, decoder,
                         path_len, obs_dim, action_dim, step_dim,
                         policy_algo, policy_ex_algo, train_dataset,
                         latent_dim, vae,
                         batch_size=400,
                         block_config=vv['block_config'],
                         plan_horizon=vv['mpc_plan'],
                         max_horizon=vv['mpc_max'],
                         mpc_batch=vv['mpc_batch'],
                         rand_per_mpc_step=vv['mpc_explore_step'],
                         mpc_explore=mpc_explore,
                         mpc_explore_batch=1,
                         reset_ent=vv['reset_ent'],
                         vae_train_steps=vv['vae_train_steps'],
                         mpc_explore_len=vv['mpc_explore_len'],
                         true_reward_scale=vv['true_reward_scale'],
                         discount_factor=vv['discount_factor'],
                         reward_fn=(rf, init_rstate))

    vaepd.train(train_dataset, test_dataset=test_dataset,
                dummy_dataset=dummy_dataset, plot_step=10,
                max_itr=vv['max_itr'], record_stats=True, print_step=1000,
                save_step=2, start_itr=0,
                train_vae_after_add=vv['train_vae_after_add'],
                joint_training=vv['joint_training'])
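
# Hypothetical invocation sketch for the Swimmer launcher above. The launcher
# never shows how the variant dict `vv` is assembled, so this illustrates one
# possible call: every key is one that run_task actually reads, but every value
# (including the strings for fields like 'vae_loss_type') is an illustrative
# placeholder, not a setting taken from the experiment configs.
example_vv = dict(
    gpu=False,
    goals=[[1.0, 1.0], [-1.0, 1.0]],        # assumed 2D goal positions for SwimmerEnv
    frame_skip=5,
    path_len=50,
    initial_data_path=None,
    use_actions=True,
    batch_size=360,
    buffer_size=9000,
    latent_dim=8,
    policy_rnn_hidden_dim=256,
    decoder_rnn_hidden_dim=256,             # overridden to 256 inside run_task anyway
    encoder_type='lstm',                    # 'mlp' or 'lstm'
    encoder_hidden_sizes=(400, 300),
    decoder_type='grnn',                    # 'grnn', 'gmlp', or 'mixedrnn'
    decoder_var_type='param',
    decoder_hidden_sizes=(400, 300),
    policy_type='gmlp',                     # Gaussian MLP; also builds the explorer policy_ex
    policy_hidden_sizes=(400, 300),
    kl_weight=1.0,
    vae_loss_type='ll',                     # placeholder; whatever identifiers TrajVAEBC accepts
    vae_lr=1e-3,
    bc_weight=1.0,
    policy_lr=3e-4,
    use_gae=True,
    entropy_bonus=0.0,
    load_models_dir=None,                   # set to a checkpoint dir (with load_models_idx) to resume
    load_models_idx=0,
    mpc_plan=5,
    mpc_max=10,
    mpc_batch=10,
    mpc_explore_step=10,
    mpc_explore_len=5,
    reset_ent=0,
    vae_train_steps=100,
    true_reward_scale=0.0,
    discount_factor=0.99,
    max_itr=500,
    train_vae_after_add=1,
    joint_training=False,
)
# run_task(example_vv)  # uncomment to launch; training starts immediately
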
def run_task(vv):
    """Build and train the trajectory VAE, policy decoder, and explorer on the discrete-action block/Playpen task."""
    set_gpu_mode(vv['gpu'])
    env_name = vv['env_name']
    env = make_env(env_name, 1, 0, '/tmp/gym',
                   kwargs=dict(border=vv['block_config'][2]))
    obs_dim = int(env().observation_space.shape[0])
    action_dim = int(env().action_space.n)
    path_len = vv['path_len']
    data_path = None
    # True so that behavioral cloning has access to actions.
    use_actions = True

    # Create a dummy dataset since we initialize with no data.
    dummy = np.zeros((1, path_len + 1, obs_dim + action_dim))
    train_data, test_data = dummy, dummy
    train_dataset = PlayPenContDataset(
        data_path=data_path, raw_data=train_data,
        obs_dim=obs_dim, action_dim=action_dim, path_len=path_len,
        env_id='Playpen', normalize=False, use_actions=use_actions,
        batch_size=vv['batch_size'], buffer_size=vv['buffer_size'])
    # Validation set for VAE training.
    test_dataset = PlayPenContDataset(
        data_path=data_path, raw_data=train_data,
        obs_dim=obs_dim, action_dim=action_dim, path_len=path_len,
        env_id='Playpen', normalize=False, use_actions=use_actions,
        batch_size=vv['batch_size'] // 9, buffer_size=vv['buffer_size'] // 9)
    # Holds the data from the latest iteration for joint training.
    dummy_dataset = PlayPenContDataset(
        data_path=data_path, raw_data=train_data,
        obs_dim=obs_dim, action_dim=action_dim, path_len=path_len,
        env_id='Playpen', normalize=False, use_actions=use_actions,
        batch_size=vv['batch_size'], buffer_size=vv['buffer_size'])
    train_dataset.clear()
    test_dataset.clear()
    dummy_dataset.clear()

    latent_dim = vv['latent_dim']
    rnn_hidden_dim = vv['decoder_rnn_hidden_dim']
    step_dim = obs_dim

    # Build encoder.
    if vv['encoder_type'] == 'mlp':
        encoder = GaussianNetwork(
            mean_network=MLP((path_len + 1) * step_dim, latent_dim,
                             hidden_sizes=vv['encoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=MLP((path_len + 1) * step_dim, latent_dim))
    elif vv['encoder_type'] == 'lstm':
        encoder = GaussianBidirectionalNetwork(
            input_dim=step_dim,
            hidden_dim=rnn_hidden_dim,
            num_layers=2,
            mean_network=MLP(2 * rnn_hidden_dim, latent_dim),
            log_var_network=MLP(2 * rnn_hidden_dim, latent_dim))

    # Build state decoder.
    if vv['decoder_var_type'] == 'param':
        decoder_log_var_network = Parameter(latent_dim, step_dim, init=np.log(0.1))
    else:
        decoder_log_var_network = MLP(rnn_hidden_dim, step_dim)

    if vv['decoder_type'] == 'grnn':
        decoder = GaussianRecurrentNetwork(
            recurrent_network=RNN(
                nn.LSTM(step_dim + latent_dim, rnn_hidden_dim), rnn_hidden_dim),
            mean_network=MLP(rnn_hidden_dim, step_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            # log_var_network=Parameter(latent_dim, step_dim, init=np.log(0.1)),
            log_var_network=decoder_log_var_network,
            path_len=path_len,
            output_dim=step_dim,
            min_var=1e-4,
        )
    elif vv['decoder_type'] == 'gmlp':
        decoder = GaussianNetwork(
            mean_network=MLP(latent_dim, path_len * step_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=Parameter(latent_dim, path_len * step_dim,
                                      init=np.log(0.1)),
            min_var=1e-4)
    elif vv['decoder_type'] == 'mixedrnn':
        gauss_output_dim = 10
        cat_output_dim = 5
        decoder = MixedRecurrentNetwork(
            recurrent_network=RNN(
                nn.LSTM(step_dim + latent_dim, rnn_hidden_dim), rnn_hidden_dim),
            mean_network=MLP(rnn_hidden_dim, gauss_output_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            prob_network=MLP(rnn_hidden_dim, cat_output_dim, final_act=nn.Softmax),
            log_var_network=Parameter(latent_dim, gauss_output_dim, init=np.log(0.1)),
            path_len=path_len,
            output_dim=step_dim,
            min_var=1e-4,
            gaussian_output_dim=gauss_output_dim,
            cat_output_dim=cat_output_dim)

    # Policy decoder.
    policy = CategoricalNetwork(
        prob_network=MLP(obs_dim + latent_dim, action_dim,
                         hidden_sizes=(400, 300, 200),
                         hidden_act=nn.ReLU, final_act=nn.Softmax),
        output_dim=action_dim)
    # Explorer policy.
    policy_ex = CategoricalNetwork(
        prob_network=MLP(obs_dim, action_dim,
                         hidden_sizes=(400, 300, 200),
                         hidden_act=nn.ReLU, final_act=nn.Softmax),
        output_dim=action_dim)

    # VAE with behavioral cloning.
    vae = TrajVAEBC(encoder=encoder, decoder=decoder, latent_dim=latent_dim,
                    step_dim=step_dim, feature_dim=train_dataset.obs_dim, env=env,
                    path_len=train_dataset.path_len,
                    init_kl_weight=vv['kl_weight'], max_kl_weight=vv['kl_weight'],
                    kl_mul=1.03, loss_type=vv['vae_loss_type'], lr=vv['vae_lr'],
                    obs_dim=obs_dim, act_dim=action_dim, policy=policy,
                    bc_weight=vv['bc_weight'])

    # Zero baseline due to constantly changing rewards.
    baseline = ZeroBaseline()
    # Policy opt for the policy decoder.
    policy_algo = PPO(
        env, env_name, policy, baseline=baseline,
        obs_dim=obs_dim, action_dim=action_dim,
        max_path_length=path_len, center_adv=True,
        optimizer=optim.Adam(policy.get_params(), vv['policy_lr'], eps=1e-5),  # vv['global_lr']
        use_gae=vv['use_gae'], epoch=10, ppo_batch_size=200)

    # Baseline for the explorer.
    baseline_ex = ZeroBaseline()
    # Policy opt for the explorer.
    policy_ex_algo = PPO(
        env, env_name, policy_ex, baseline=baseline_ex,
        obs_dim=obs_dim, action_dim=action_dim,
        max_path_length=path_len, center_adv=True,
        optimizer=optim.Adam(policy_ex.get_params(), vv['policy_lr'], eps=1e-5),  # vv['global_lr']
        use_gae=vv['use_gae'], epoch=10, ppo_batch_size=200,
        entropy_bonus=vv['entropy_bonus'])

    # For loading the models from a saved state.
    if vv['load_models_dir'] is not None:
        dir = getcwd() + "/research/lang/traj2vecv3_jd/" + vv['load_models_dir']
        itr = vv['load_models_idx']
        encoder.load_state_dict(torch.load(dir + '/encoder_%d.pkl' % itr))
        decoder.load_state_dict(torch.load(dir + '/decoder_%d.pkl' % itr))
        policy.load_state_dict(torch.load(dir + '/policy_%d.pkl' % itr))
        policy_ex.load_state_dict(torch.load(dir + '/policy_ex_%d.pkl' % itr))
        vae.optimizer.load_state_dict(
            torch.load(dir + '/vae_optimizer_%d.pkl' % itr))
        policy_algo.optimizer.load_state_dict(
            torch.load(dir + '/policy_optimizer_%d.pkl' % itr))

    # Block goals.
    goals = 2 * np.array(vv['block_config'][1])
    # Reward function for MPC.
    rf = lambda obs, rstate: reward_fn(obs, rstate, goals)

    # Main algorithm launcher, includes the MPC controller and exploration.
    vaepd = VAEPDEntropy(
        env, env_name, policy, policy_ex, encoder, decoder,
        path_len, obs_dim, action_dim, step_dim,
        policy_algo, policy_ex_algo, train_dataset, latent_dim, vae,
        batch_size=400,
        block_config=vv['block_config'],
        plan_horizon=vv['mpc_plan'],
        max_horizon=vv['mpc_max'],
        mpc_batch=vv['mpc_batch'],
        rand_per_mpc_step=vv['mpc_explore_step'],
        mpc_explore=2048,
        mpc_explore_batch=6,
        reset_ent=vv['reset_ent'],
        vae_train_steps=vv['vae_train_steps'],
        mpc_explore_len=vv['mpc_explore_len'],
        consis_finetuning=vv['consis_finetuning'],
        true_reward_scale=vv['true_reward_scale'],
        discount_factor=vv['discount_factor'],
        reward_fn=(rf, init_rstate),
    )

    vaepd.train(train_dataset, test_dataset=test_dataset,
                dummy_dataset=dummy_dataset, plot_step=10,
                max_itr=vv['max_itr'], record_stats=True, print_step=1000,
                save_step=20, start_itr=0,
                train_vae_after_add=vv['train_vae_after_add'],
                joint_training=vv['joint_training'])
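
# Hypothetical invocation sketch for the block/Playpen launcher above. As with
# the Swimmer sketch, every value is an illustrative placeholder. In particular,
# the three-element layout assumed for 'block_config' (initial state, block
# goals, border) is inferred only from how indices [1] and [2] are used in
# run_task, and 'env_name' must be an id that make_env can resolve.
example_vv = dict(
    gpu=False,
    env_name='PlayPen-v0',                  # placeholder environment id
    block_config=[None,                          # [0]: assumed initial state (unused here)
                  [[1.0, 1.0], [-1.0, -1.0]],    # [1]: block goals (doubled inside run_task)
                  2.0],                          # [2]: border passed to the env constructor
    path_len=40,
    batch_size=360,
    buffer_size=9000,
    latent_dim=8,
    decoder_rnn_hidden_dim=256,
    encoder_type='lstm',
    encoder_hidden_sizes=(400, 300),
    decoder_type='grnn',
    decoder_var_type='param',
    decoder_hidden_sizes=(400, 300),
    kl_weight=1.0,
    vae_loss_type='ll',                     # placeholder loss identifier
    vae_lr=1e-3,
    bc_weight=1.0,
    policy_lr=3e-4,
    use_gae=True,
    entropy_bonus=0.0,
    load_models_dir=None,
    load_models_idx=0,
    mpc_plan=5,
    mpc_max=10,
    mpc_batch=10,
    mpc_explore_step=10,
    mpc_explore_len=5,
    reset_ent=0,
    vae_train_steps=100,
    consis_finetuning=False,
    true_reward_scale=0.0,
    discount_factor=0.99,
    max_itr=500,
    train_vae_after_add=1,
    joint_training=False,
)
# run_task(example_vv)  # uncomment to launch; training starts immediately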