Example #1
def run_task(vv):
    set_gpu_mode(vv['gpu'])
    env_name = vv['env_name']
    env = make_env(env_name, 1, 0, '/tmp/gym', kwargs=dict(border=vv['block_config'][2]))
    obs_dim = int(env().observation_space.shape[0])
    action_dim = int(env().action_space.n)

    path_len = vv['path_len']
    data_path = None
    # True so that behavioral cloning has access to actions
    use_actions = True

    # create a dummy dataset since we initialize with no data
    dummy = np.zeros((1, path_len+1, obs_dim + action_dim))
    train_data, test_data = dummy, dummy

    train_dataset = PlayPenContDataset(data_path=data_path, raw_data=train_data, obs_dim=obs_dim,
                                       action_dim=action_dim, path_len=path_len, env_id='Playpen',
                                       normalize=False, use_actions=use_actions,
                                       batch_size=vv['batch_size'], buffer_size=vv['buffer_size'])
    # validation set for VAE training
    test_dataset = PlayPenContDataset(data_path=data_path, raw_data=train_data, obs_dim=obs_dim,
                                      action_dim=action_dim, path_len=path_len, env_id='Playpen',
                                      normalize=False, use_actions=use_actions,
                                      batch_size=vv['batch_size'] // 9, buffer_size=vv['buffer_size'] // 9)

    # this holds the data from the latest iteration for joint training
    dummy_dataset = PlayPenContDataset(data_path=data_path, raw_data=train_data, obs_dim=obs_dim,
                                       action_dim=action_dim, path_len=path_len, env_id='Playpen',
                                       normalize=False, use_actions=use_actions,
                                       batch_size=vv['batch_size'], buffer_size=vv['buffer_size'])

    train_dataset.clear()
    test_dataset.clear()
    dummy_dataset.clear()

    latent_dim = vv['latent_dim']
    rnn_hidden_dim = vv['decoder_rnn_hidden_dim']

    step_dim = obs_dim

    # build encoder
    if vv['encoder_type'] == 'mlp':
        encoder = GaussianNetwork(
            mean_network=MLP((path_len+1)*step_dim, latent_dim, hidden_sizes=vv['encoder_hidden_sizes'], hidden_act=nn.ReLU),
            log_var_network=MLP((path_len+1)*step_dim, latent_dim)
        )
    elif vv['encoder_type'] == 'lstm':
        encoder = GaussianBidirectionalNetwork(
            input_dim=step_dim,
            hidden_dim=rnn_hidden_dim,
            num_layers=2,
            mean_network=MLP(2 * rnn_hidden_dim, latent_dim),
            log_var_network=MLP(2 * rnn_hidden_dim, latent_dim)
        )

    # build state decoder
    if vv['decoder_var_type'] == 'param':
        decoder_log_var_network = Parameter(latent_dim, step_dim, init=np.log(0.1))
    else:
        decoder_log_var_network = MLP(rnn_hidden_dim, step_dim)
    if vv['decoder_type'] == 'grnn':
        decoder = GaussianRecurrentNetwork(
            recurrent_network=RNN(nn.LSTM(step_dim + latent_dim, rnn_hidden_dim), rnn_hidden_dim),
            mean_network=MLP(rnn_hidden_dim, step_dim, hidden_sizes=vv['decoder_hidden_sizes'], hidden_act=nn.ReLU),
            #log_var_network=Parameter(latent_dim, step_dim, init=np.log(0.1)),
            log_var_network=decoder_log_var_network,
            path_len=path_len,
            output_dim=step_dim,
            min_var=1e-4,
        )
    elif vv['decoder_type'] == 'gmlp':
        decoder = GaussianNetwork(
            mean_network=MLP(latent_dim, path_len*step_dim, hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=Parameter(latent_dim, path_len*step_dim, init=np.log(0.1)),
            min_var=1e-4
        )
    elif vv['decoder_type'] == 'mixedrnn':
        gauss_output_dim = 10
        cat_output_dim = 5
        decoder = MixedRecurrentNetwork(
            recurrent_network=RNN(nn.LSTM(step_dim + latent_dim, rnn_hidden_dim), rnn_hidden_dim),
            mean_network=MLP(rnn_hidden_dim, gauss_output_dim, hidden_sizes=vv['decoder_hidden_sizes'], hidden_act=nn.ReLU),
            prob_network=MLP(rnn_hidden_dim, cat_output_dim, final_act=nn.Softmax),
            log_var_network=Parameter(latent_dim, gauss_output_dim, init=np.log(0.1)),
            path_len=path_len,
            output_dim=step_dim,
            min_var=1e-4,
            gaussian_output_dim=gauss_output_dim,
            cat_output_dim=cat_output_dim
        )

    # policy decoder
    policy = CategoricalNetwork(
        prob_network=MLP(obs_dim + latent_dim, action_dim, hidden_sizes=(400, 300, 200),
                         hidden_act=nn.ReLU, final_act=nn.Softmax),
        output_dim=action_dim
    )

    # explorer policy
    policy_ex = CategoricalNetwork(
        prob_network=MLP(obs_dim, action_dim, hidden_sizes=(400, 300, 200),
                         hidden_act=nn.ReLU, final_act=nn.Softmax),
        output_dim=action_dim
    )

    # vae with behavioral cloning
    vae = TrajVAEBC(encoder=encoder, decoder=decoder, latent_dim=latent_dim, step_dim=step_dim,
                  feature_dim=train_dataset.obs_dim, env=env, path_len=train_dataset.path_len,
                  init_kl_weight=vv['kl_weight'], max_kl_weight=vv['kl_weight'], kl_mul=1.03,
                  loss_type=vv['vae_loss_type'], lr=vv['vae_lr'], obs_dim=obs_dim,
                  act_dim=action_dim, policy=policy, bc_weight=vv['bc_weight'])

    # 0 baseline due to constantly changing rewards
    baseline = ZeroBaseline()
    # policy opt for policy decoder
    policy_algo = PPO(env, env_name, policy, baseline=baseline, obs_dim=obs_dim,
                      action_dim=action_dim, max_path_length=path_len, center_adv=True,
                      optimizer=optim.Adam(policy.get_params(), vv['policy_lr'], eps=1e-5),  # vv['global_lr']
                      use_gae=vv['use_gae'], epoch=10, ppo_batch_size=200)

    # baseline for the explorer
    baseline_ex = ZeroBaseline()
    # policy opt for the explorer
    policy_ex_algo = PPO(env, env_name, policy_ex, baseline=baseline_ex, obs_dim=obs_dim,
                         action_dim=action_dim, max_path_length=path_len, center_adv=True,
                         optimizer=optim.Adam(policy_ex.get_params(), vv['policy_lr'], eps=1e-5),  # vv['global_lr']
                         use_gae=vv['use_gae'], epoch=10, ppo_batch_size=200,
                         entropy_bonus=vv['entropy_bonus'])

    # for loading the model from a saved state
    if vv['load_models_dir'] is not None:
        dir = getcwd() + "/research/lang/traj2vecv3_jd/" + vv['load_models_dir']
        itr = vv['load_models_idx']
        encoder.load_state_dict(torch.load(dir + '/encoder_%d.pkl' % itr))
        decoder.load_state_dict(torch.load(dir + '/decoder_%d.pkl' % itr))
        policy.load_state_dict(torch.load(dir + '/policy_%d.pkl' % itr))
        policy_ex.load_state_dict(torch.load(dir + '/policy_ex_%d.pkl' % itr))
        vae.optimizer.load_state_dict(torch.load(dir + '/vae_optimizer_%d.pkl' % itr))
        policy_algo.optimizer.load_state_dict(torch.load(dir + '/policy_optimizer_%d.pkl' % itr))

    # block goals
    goals = np.array(vv['block_config'][1])
    # reward function for MPC
    rf = lambda obs, rstate: reward_fn(obs, rstate, goals)

    # main algorithm launcher, includes mpc controller and exploration
    vaepd = VAEPDEntropy(env, env_name, policy, policy_ex, encoder, decoder,
                         path_len, obs_dim, action_dim, step_dim, policy_algo,
                         policy_ex_algo, train_dataset, latent_dim, vae,
                         batch_size=400,
                         block_config=vv['block_config'],
                         plan_horizon=vv['mpc_plan'],
                         max_horizon=vv['mpc_max'],
                         mpc_batch=vv['mpc_batch'],
                         rand_per_mpc_step=vv['mpc_explore_step'],
                         mpc_explore=2048,
                         mpc_explore_batch=6,
                         reset_ent=vv['reset_ent'],
                         vae_train_steps=vv['vae_train_steps'],
                         mpc_explore_len=vv['mpc_explore_len'],
                         consis_finetuning=vv['consis_finetuning'],
                         true_reward_scale=vv['true_reward_scale'],
                         discount_factor=vv['discount_factor'],
                         reward_fn=(rf, init_rstate))


    vaepd.train(train_dataset, test_dataset=test_dataset, dummy_dataset=dummy_dataset,
                plot_step=10, max_itr=vv['max_itr'], record_stats=True, print_step=1000,
                save_step=20, start_itr=0, train_vae_after_add=vv['train_vae_after_add'],
                joint_training=vv['joint_training'])
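
For reference, the launcher above reads every hyperparameter from a single vv dict (the experiment "variant"). Below is a minimal, hypothetical sketch of such a dict: the key names are exactly the ones consumed by run_task above, but all values are illustrative placeholders rather than the authors' settings.

# Hypothetical variant dict for run_task; values are placeholders, not tuned settings.
vv = dict(
    gpu=True,
    env_name='PlayPen-v0',                      # assumed environment id
    block_config=[[0., 0.], [[1., 1.]], 0.5],   # indices 1 (block goals) and 2 (border) are used above
    path_len=50,
    batch_size=360, buffer_size=9000,
    latent_dim=8, decoder_rnn_hidden_dim=256,
    encoder_type='lstm',                        # 'mlp' or 'lstm'
    encoder_hidden_sizes=(256, 256),
    decoder_type='grnn',                        # 'grnn', 'gmlp', or 'mixedrnn'
    decoder_var_type='param',
    decoder_hidden_sizes=(256, 256),
    kl_weight=0.1, vae_loss_type='mse', vae_lr=1e-3, bc_weight=1.0,
    policy_lr=3e-4, use_gae=True, entropy_bonus=0.01,
    load_models_dir=None, load_models_idx=0,
    mpc_plan=5, mpc_max=50, mpc_batch=32,
    mpc_explore_step=200, mpc_explore_len=10,
    reset_ent=0.0, vae_train_steps=1000,
    consis_finetuning=False,
    true_reward_scale=0.0, discount_factor=0.99,
    max_itr=500, train_vae_after_add=5, joint_training=False,
)
run_task(vv)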
Example #2
def run_task(vv):
    set_gpu_mode(vv['gpu'])
    env_name = None
    env = lambda: WheeledEnv()

    obs_dim = int(env().observation_space.shape[0])
    action_dim = int(env().action_space.shape[0])
    vv['block_config'] = [env().reset().tolist(), vv['goals']]
    print(vv['block_config'])

    path_len = vv['path_len']
    data_path = vv['initial_data_path']

    use_actions = vv['use_actions']

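    # dummy zero array used as the initial raw_data for the datasets below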
    dummy = np.zeros((1, path_len + 1, obs_dim + action_dim))
    train_data, test_data = dummy, dummy

    train_dataset = WheeledContDataset(data_path=data_path,
                                       raw_data=train_data,
                                       obs_dim=obs_dim,
                                       action_dim=action_dim,
                                       path_len=path_len,
                                       env_id='Playpen',
                                       normalize=False,
                                       use_actions=use_actions,
                                       batch_size=vv['batch_size'],
                                       buffer_size=vv['buffer_size'])
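    # validation set for VAE training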
    test_dataset = WheeledContDataset(data_path=data_path,
                                      raw_data=train_data,
                                      obs_dim=obs_dim,
                                      action_dim=action_dim,
                                      path_len=path_len,
                                      env_id='Playpen',
                                      normalize=False,
                                      use_actions=use_actions,
                                      batch_size=vv['batch_size'] // 9,
                                      buffer_size=vv['buffer_size'] // 9)
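    # holds the data from the latest iteration for joint training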
    dummy_dataset = WheeledContDataset(data_path=data_path,
                                       raw_data=train_data,
                                       obs_dim=obs_dim,
                                       action_dim=action_dim,
                                       path_len=path_len,
                                       env_id='Playpen',
                                       normalize=False,
                                       use_actions=use_actions,
                                       batch_size=vv['batch_size'],
                                       buffer_size=vv['buffer_size'])

    train_dataset.clear()
    test_dataset.clear()
    dummy_dataset.clear()

    latent_dim = vv['latent_dim']
    policy_rnn_hidden_dim = vv['policy_rnn_hidden_dim']
    rnn_hidden_dim = vv['decoder_rnn_hidden_dim']

    step_dim = obs_dim

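    # NOTE: this hard-coded value overrides vv['decoder_rnn_hidden_dim'] read above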
    rnn_hidden_dim = 256
    if vv['encoder_type'] == 'mlp':
        encoder = GaussianNetwork(mean_network=MLP(
            (path_len + 1) * step_dim,
            latent_dim,
            hidden_sizes=vv['encoder_hidden_sizes'],
            hidden_act=nn.ReLU),
                                  log_var_network=MLP(
                                      (path_len + 1) * step_dim, latent_dim))
    elif vv['encoder_type'] == 'lstm':
        encoder = GaussianBidirectionalNetwork(
            input_dim=step_dim,
            hidden_dim=rnn_hidden_dim,
            num_layers=2,
            mean_network=MLP(2 * rnn_hidden_dim, latent_dim),
            log_var_network=MLP(2 * rnn_hidden_dim, latent_dim))

    if vv['decoder_var_type'] == 'param':
        decoder_log_var_network = Parameter(latent_dim,
                                            step_dim,
                                            init=np.log(0.1))
    else:
        decoder_log_var_network = MLP(rnn_hidden_dim, step_dim)
    if vv['decoder_type'] == 'grnn':
        decoder = GaussianRecurrentNetwork(
            recurrent_network=RNN(
                nn.LSTM(step_dim + latent_dim, rnn_hidden_dim),
                rnn_hidden_dim),
            mean_network=MLP(rnn_hidden_dim,
                             step_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=decoder_log_var_network,
            path_len=path_len,
            output_dim=step_dim,
            min_var=1e-4,
        )
    elif vv['decoder_type'] == 'gmlp':
        decoder = GaussianNetwork(
            mean_network=MLP(latent_dim,
                             path_len * step_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=Parameter(latent_dim,
                                      path_len * step_dim,
                                      init=np.log(0.1)),
            min_var=1e-4)
    elif vv['decoder_type'] == 'mixedrnn':
        gauss_output_dim = 10
        cat_output_dim = 5
        decoder = MixedRecurrentNetwork(
            recurrent_network=RNN(
                nn.LSTM(step_dim + latent_dim, rnn_hidden_dim),
                rnn_hidden_dim),
            mean_network=MLP(rnn_hidden_dim,
                             gauss_output_dim,
                             hidden_sizes=vv['decoder_hidden_sizes'],
                             hidden_act=nn.ReLU),
            prob_network=MLP(rnn_hidden_dim,
                             cat_output_dim,
                             final_act=nn.Softmax),
            log_var_network=Parameter(latent_dim,
                                      gauss_output_dim,
                                      init=np.log(0.1)),
            path_len=path_len,
            output_dim=step_dim,
            min_var=1e-4,
            gaussian_output_dim=gauss_output_dim,
            cat_output_dim=cat_output_dim)

    # Policy
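    # NOTE: only the 'gmlp' and 'cmlp' branches below also build the explorer policy_ex;
    # the other policy types leave it undefined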
    if vv['policy_type'] == 'grnn':
        policy = GaussianRecurrentPolicy(
            recurrent_network=RNN(
                nn.LSTM(obs_dim + latent_dim, policy_rnn_hidden_dim),
                policy_rnn_hidden_dim),
            mean_network=MLP(policy_rnn_hidden_dim,
                             action_dim,
                             hidden_act=nn.ReLU),
            log_var_network=Parameter(obs_dim + latent_dim,
                                      action_dim,
                                      init=np.log(1)),
            path_len=path_len,
            output_dim=action_dim)

    elif vv['policy_type'] == 'gmlp':
        policy = GaussianNetwork(
            mean_network=MLP(obs_dim + latent_dim,
                             action_dim,
                             hidden_sizes=vv['policy_hidden_sizes'],
                             hidden_act=nn.ReLU),
            log_var_network=Parameter(obs_dim + latent_dim,
                                      action_dim,
                                      init=np.log(1)))
        policy_ex = GaussianNetwork(mean_network=MLP(
            obs_dim,
            action_dim,
            hidden_sizes=vv['policy_hidden_sizes'],
            hidden_act=nn.ReLU),
                                    log_var_network=Parameter(obs_dim,
                                                              action_dim,
                                                              init=np.log(20)))
    elif vv['policy_type'] == 'crnn':
        policy = RecurrentCategoricalPolicy(
            recurrent_network=RNN(
                nn.LSTM(obs_dim + latent_dim, policy_rnn_hidden_dim),
                policy_rnn_hidden_dim),
            prob_network=MLP(policy_rnn_hidden_dim,
                             action_dim,
                             hidden_sizes=vv['policy_hidden_sizes'],
                             final_act=nn.Softmax),
            path_len=path_len,
            output_dim=action_dim)
    elif vv['policy_type'] == 'cmlp':
        policy = CategoricalNetwork(prob_network=MLP(obs_dim + latent_dim,
                                                     action_dim,
                                                     hidden_sizes=(400, 300,
                                                                   200),
                                                     hidden_act=nn.ReLU,
                                                     final_act=nn.Softmax),
                                    output_dim=action_dim)
        policy_ex = CategoricalNetwork(prob_network=MLP(obs_dim,
                                                        action_dim,
                                                        hidden_sizes=(400, 300,
                                                                      200),
                                                        hidden_act=nn.ReLU,
                                                        final_act=nn.Softmax),
                                       output_dim=action_dim)
    elif vv['policy_type'] == 'lstm':
        policy = LSTMPolicy(input_dim=obs_dim + latent_dim,
                            hidden_dim=rnn_hidden_dim,
                            num_layers=2,
                            output_dim=action_dim)

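    # VAE with behavioral cloning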
    vae = TrajVAEBC(encoder=encoder,
                    decoder=decoder,
                    latent_dim=latent_dim,
                    step_dim=step_dim,
                    feature_dim=train_dataset.obs_dim,
                    env=env,
                    path_len=train_dataset.path_len,
                    init_kl_weight=vv['kl_weight'],
                    max_kl_weight=vv['kl_weight'],
                    kl_mul=1.03,
                    loss_type=vv['vae_loss_type'],
                    lr=vv['vae_lr'],
                    obs_dim=obs_dim,
                    act_dim=action_dim,
                    policy=policy,
                    bc_weight=vv['bc_weight'])

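    # zero baseline (the reward changes every iteration) and PPO for the policy decoder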
    baseline = ZeroBaseline()
    policy_algo = PPO(
        env,
        env_name,
        policy,
        baseline=baseline,
        obs_dim=obs_dim,
        action_dim=action_dim,
        max_path_length=path_len,
        center_adv=True,
        optimizer=optim.Adam(policy.get_params(), vv['policy_lr'],
                             eps=1e-5),  #vv['global_lr']),
        use_gae=vv['use_gae'],
        epoch=10,
        ppo_batch_size=200)

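    # zero baseline and PPO for the explorer policy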
    baseline_ex = ZeroBaseline()
    policy_ex_algo = PPO(
        env,
        env_name,
        policy_ex,
        baseline=baseline_ex,
        obs_dim=obs_dim,
        action_dim=action_dim,
        max_path_length=path_len,
        center_adv=True,
        optimizer=optim.Adam(policy_ex.get_params(), vv['policy_lr'],
                             eps=1e-5),  #vv['global_lr']),
        use_gae=vv['use_gae'],
        epoch=10,
        ppo_batch_size=200,
        entropy_bonus=vv['entropy_bonus'])

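    # optionally restore models and optimizers from a saved run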
    if vv['load_models_dir'] is not None:
        dir = getcwd() + "/research/lang/traj2vecv3_jd/" + vv['load_models_dir']
        itr = vv['load_models_idx']
        encoder.load_state_dict(torch.load(dir + '/encoder_%d.pkl' % itr))
        decoder.load_state_dict(torch.load(dir + '/decoder_%d.pkl' % itr))
        policy.load_state_dict(torch.load(dir + '/policy_%d.pkl' % itr))
        policy_ex.load_state_dict(torch.load(dir + '/policy_ex_%d.pkl' % itr))
        vae.optimizer.load_state_dict(
            torch.load(dir + '/vae_optimizer_%d.pkl' % itr))
        policy_algo.optimizer.load_state_dict(
            torch.load(dir + '/policy_optimizer_%d.pkl' % itr))

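    # block goals and the reward function used by MPC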
    goals = np.array(vv['block_config'][1])
    rf = lambda obs, rstate: reward_fn(obs, rstate, goals)

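    # main algorithm launcher, includes the MPC controller and exploration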
    vaepd = VAEPDEntropy(env,
                         env_name,
                         policy,
                         policy_ex,
                         encoder,
                         decoder,
                         path_len,
                         obs_dim,
                         action_dim,
                         step_dim,
                         policy_algo,
                         policy_ex_algo,
                         train_dataset,
                         latent_dim,
                         vae,
                         batch_size=400,
                         block_config=vv['block_config'],
                         plan_horizon=vv['mpc_plan'],
                         max_horizon=vv['mpc_max'],
                         mpc_batch=vv['mpc_batch'],
                         rand_per_mpc_step=vv['mpc_explore_step'],
                         mpc_explore=2048,
                         mpc_explore_batch=3,
                         reset_ent=vv['reset_ent'],
                         vae_train_steps=vv['vae_train_steps'],
                         mpc_explore_len=vv['mpc_explore_len'],
                         true_reward_scale=vv['true_reward_scale'],
                         discount_factor=vv['discount_factor'],
                         reward_fn=(rf, init_rstate))

    vaepd.train(train_dataset,
                test_dataset=test_dataset,
                dummy_dataset=dummy_dataset,
                plot_step=10,
                max_itr=vv['max_itr'],
                record_stats=True,
                print_step=1000,
                save_step=20,
                start_itr=0,
                train_vae_after_add=vv['train_vae_after_add'],
                joint_training=vv['joint_training'])
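
Example #2 rebuilds block_config from the environment's reset state and its 'goals' entry, and it reads a handful of keys that Example #1 does not. A hypothetical delta on top of the dict sketched after Example #1 (values are again placeholders):

# Extra keys consumed by the wheeled-robot launcher; values are illustrative only.
vv.update(
    goals=[[2.0, 2.0]],                   # becomes block_config[1]; block_config itself is derived from env().reset()
    initial_data_path=None,               # optional path to pre-collected trajectory data
    use_actions=True,
    policy_type='gmlp',                   # 'grnn', 'gmlp', 'crnn', 'cmlp', or 'lstm'
    policy_hidden_sizes=(400, 300, 200),
    policy_rnn_hidden_dim=256,
)
run_task(vv)                              # env_name and block_config from the first sketch are ignored here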