Example no. 1
    parser = argparse.ArgumentParser()
    parser.add_argument('--saved_model', type=str, default='./logs/ddpg_test_SawyerPushAndReachArenaEnv-v0/ddpg/ddpg_s0/')
    parser.add_argument('--len', '-l', type=int, default=50)
    parser.add_argument('--episodes', '-n', type=int, default=100)
    parser.add_argument('--render', '-nr', action='store_false')
    parser.add_argument('--itr', '-i', type=int, default=-1, help='Choose iter want to run in saved folder')
    parser.add_argument('--deterministic', '-d', action='store_true')
    parser.add_argument('--use_tensorboard', action='store_true')
    parser.add_argument('--logdir', type=str, default='./logs/ddpg_test')
    parser.add_argument('--exp_name', type=str, default='evaluate')
    parser.add_argument('--env', type=str, default='SawyerReachXYEnv-v1')
    args = parser.parse_args()

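    # Load the saved policy; use the latest checkpoint unless --itr selects a specific iteration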
    _, get_action = load_policy(args.saved_model,
                                  args.itr if args.itr >= 0 else 'last',
                                  args.deterministic)
    tensor_board = None
    env = SawyerReachXYZEnv(
            action_mode='position',
            position_action_scale=0.1,
            config_name='austri_config',
            reset_free=False,
            max_speed=0.05,
            fix_goal=False,
            fixed_goal=(0.53,0.0,0.15)
        )

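    # FlatGoalEnv wraps the goal-conditioned env so the goal is appended to (flattened into) the observation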
    env = FlatGoalEnv(env, append_goal_to_obs=True)
    env.reset()
    logdir_ext = os.path.join(args.logdir + '_' + args.env + '_evaluate')
Example no. 2
def PGIRL(demonstrations=None,
          model=None,
          grad_path=None,
          features_idx=None,
          normalize_f=False,
          save_grad=True,
          opt_iters=10,
          compute_jacobian=False,
          estimate_weights=None,
          num_episodes=-1,
          pickled=False,
          continuous=False,
          num_hidden=8,
          num_layers=0,
          agent_name=None):
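    """Estimate reward-feature weights with PGIRL (or its rank-approximation
    variant), reading policy gradients from disk or recomputing them from the
    given demonstrations."""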
    if features_idx is None:
        features_idx = [0, 1, 2]

    logger = {}

    # Read or Calculate Gradient
    if args.read_grads:
        if grad_path != '':
            print("Reading gradients from:", grad_path)
            estimated_gradients = np.load(grad_path, allow_pickle=True)
        else:
            estimated_gradients = np.load(gradient_path +
                                          "estimated_gradients.npy",
                                          allow_pickle=True)
        estimated_gradients = estimated_gradients[:, :, features_idx]
        if num_episodes > 0:
            estimated_gradients = estimated_gradients[:num_episodes, :, :]
        if args.filter_gradients:
            estimated_gradients = filter_grads(estimated_gradients,
                                               verbose=args.verbose)
    else:
        if pickled:
            states_data = np.load(demonstrations + 'real_states.pkl',
                                  allow_pickle=True)
            actions_data = np.load(demonstrations + 'actions.pkl',
                                   allow_pickle=True)
            reward_data = np.load(demonstrations + 'rewards.pkl',
                                  allow_pickle=True)
            X_dataset = states_data[agent_name]
            y_dataset = actions_data[agent_name]
            r_dataset = reward_data[agent_name]
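            # Debug check: count how many demonstrated actions equal 1, then pause for inspection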
            print(np.sum(np.array(y_dataset) == 1))
            input()

            dones_dataset = None
        else:
            # read trajectories
            X_dataset, y_dataset, _, _, r_dataset, dones_dataset = \
                read_trajectories(demonstrations, all_columns=True,
                                  fill_size=EPISODE_LENGTH,
                                  fix_goal=True,
                                  cont_actions=args.continuous or args.lqg)
        if num_episodes > 0:
            X_dataset = X_dataset[:EPISODE_LENGTH * num_episodes]
            y_dataset = y_dataset[:EPISODE_LENGTH * num_episodes]
            r_dataset = r_dataset[:EPISODE_LENGTH * num_episodes]

            if dones_dataset is not None:
                dones_dataset = dones_dataset[:EPISODE_LENGTH * num_episodes]

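        # Infer state/action dimensionality from the data (the discrete case assumes 2 actions)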
        X_dim = len(X_dataset[0])
        if continuous:
            y_dim = len(y_dataset[0])
        else:
            y_dim = 2
        # Create Policy
        linear = 'gpomdp' in model

        policy_train = load_policy(X_dim=X_dim,
                                   model=model,
                                   continuous=continuous,
                                   num_actions=y_dim,
                                   n_bases=X_dim,
                                   trainable_variance=args.trainable_variance,
                                   init_logstd=args.init_logstd,
                                   linear=linear,
                                   num_hidden=num_hidden,
                                   num_layers=num_layers)
        print('Loading dataset... done')
        # compute gradient estimation

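        # estimated_gradients holds one policy-gradient estimate per episode and per reward feature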
        estimated_gradients, _ = compute_gradient(
            policy_train,
            X_dataset,
            y_dataset,
            r_dataset,
            dones_dataset,
            EPISODE_LENGTH,
            GAMMA,
            features_idx,
            verbose=args.verbose,
            use_baseline=args.baseline,
            use_mask=args.mask,
            scale_features=args.scale_features,
            filter_gradients=args.filter_gradients,
            normalize_f=normalize_f)
    # ==================================================================================================================

    if save_grad:
        print("Saving gradients in ", gradient_path)
        np.save(gradient_path + 'estimated_gradients.npy', estimated_gradients)

    # solve PGIRL or Rank Approx PGIRL
    if args.girl:
        weights_girl, loss_girl = solve_PGIRL(estimated_gradients,
                                              verbose=args.verbose)
        estimate_weights = weights_girl
    if args.rank_approx:
        weights, loss, jacobian = solve_ra_PGIRL(
            estimated_gradients,
            verbose=args.verbose,
            cov_estimation=args.cov_estimation,
            diag=args.diag,
            identity=args.identity,
            num_iters=opt_iters,
            compute_jacobian=compute_jacobian,
            other_options=[False, False, args.masked_cov])
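        # If weights were supplied (or computed by GIRL), evaluate their loss under the estimated gradient distribution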
        if estimate_weights is not None or args.girl:
            mu, sigma = estimate_distribution_params(
                estimated_gradients=estimated_gradients,
                diag=args.diag,
                identity=args.identity,
                cov_estimation=args.cov_estimation,
                girl=False,
                other_options=[False, False, args.masked_cov])

            id_matrix = np.identity(estimated_gradients.shape[1])
            lf = make_loss_function(mu, sigma, id_matrix)
            estimated_loss = lf(estimate_weights)

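        # Report the rank and singular values of the Jacobian returned by the rank-approximation solver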
        if compute_jacobian:
            print("Jacobian Rank:")
            print(np.linalg.matrix_rank(jacobian))
            print("Jacobian s:")
            _, s, _ = np.linalg.svd(jacobian)
            print(s)

    else:
        weights, loss = solve_PGIRL(estimated_gradients, verbose=args.verbose)

    print("Weights:", weights)
    print("Loss:", loss)
    if args.girl:
        print("Weights Girl:", weights_girl)
        print("Loss Girl:", loss_girl)
    if estimate_weights is not None or args.girl:
        print("Loss in weights given:", estimated_loss)
    return logger, weights
 for agent_name in agent_to_data:
     X_dataset = states_data[agent_name]
     y_dataset = actions_data[agent_name]
     r_dataset = reward_data[agent_name]
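     # Slice out this agent's demonstrations from the loaded state/action/reward dictionaries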
     X_dim = len(X_dataset[0])
     y_dim = 2
     # Create Policy
     model = 'bc/models/' + agent_name + '/2000_22/best'
     linear = 'gpomdp' in model
     print('load policy..')
     policy_train = load_policy(
         X_dim=X_dim,
         model=model,
         continuous=False,
         num_actions=y_dim,
         n_bases=X_dim,
         trainable_variance=args.trainable_variance,
         init_logstd=args.init_logstd,
         linear=linear,
         num_hidden=args.num_hidden,
         num_layers=args.num_layers)
     print('Loading dataset... done')
     # compute gradient estimation
     estimated_gradients, _ = compute_gradient(
         policy_train,
         X_dataset,
         y_dataset,
         r_dataset,
         None,
         args.ep_len,
         GAMMA,
    grasp.go_to_place_position()

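    # Open the gripper (msg_open), presumably releasing the object at the place position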
    grasp.request_grasp(grasp.msg_open)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--saved_model',
        type=str,
        default=
        '/home/tung/workspace/ee331_project/thanh_code/sawyer/ddpg/logs/ddpg_test_SawyerReachXYZEnv_multi_real_2/ddpg/ddpg_s0/'
    )
    parser.add_argument('--len', '-l', type=int, default=60)
    parser.add_argument('--deterministic', '-d', action='store_true')
    parser.add_argument(
        '--logdir',
        type=str,
        default=
        '/home/tung/workspace/ee331_project/thanh_code/sawyer/ddpg/logs/ddpg_test'
    )
    parser.add_argument('--exp_name', type=str, default='evaluate')
    parser.add_argument('--env', type=str, default='SawyerReachXYZenv_multi')
    args = parser.parse_args()

    # global env
    global get_action
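    # Load the most recent checkpoint; get_action is made global so main() can query the policy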
    _, get_action = load_policy(args.saved_model, 'last', args.deterministic)

    main()
Example no. 5
 for i, agent in enumerate(agent_to_data.keys()):
     read_path = agent_to_data[agent][0]
     if not args.read_grads:
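         # Gather every per-run trajectory CSV stored under this agent's data directory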
         paths = glob.glob(read_path + "/*/*trajectories.csv")
         for p in paths:
             states, actions, _, _, features, dones = \
                 read_trajectories(p, all_columns=True,
                                   fill_size=EPISODE_LENGTH,
                                   cont_actions=True)
             X_dim = len(states[0])
             model_path = read_path + "/best"
             linear = 'gpomdp' in model_path
             pi = load_policy(X_dim=X_dim,
                              model=model_path,
                              continuous=True,
                              num_actions=2,
                              trainable_variance=args.trainable_variance,
                              init_logstd=args.init_logstd,
                              linear=linear)
             if args.num_episodes > 0:
                 states = states[:EPISODE_LENGTH * args.num_episodes]
                 actions = actions[:EPISODE_LENGTH * args.num_episodes]
                 features = features[:EPISODE_LENGTH * args.num_episodes]
                 dones = dones[:EPISODE_LENGTH * args.num_episodes]
             estimated_gradients, _ = compute_gradient(
                 pi,
                 states,
                 actions,
                 features,
                 dones,
                 EPISODE_LENGTH,
            dx, dy = [0, 0]
        return dx, dy


    policy = input_policy
    if args.run_policy or args.debug_model:

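        # Rebuild the TensorFlow graph and load the model associated with the requested direction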
        tf.reset_default_graph()
        network = mlp(num_hidden=32, num_layers=0)

        model = direction_to_model[args.direction]
        linear = 'gpomdp' in model
        X_dim = W * H
        y_dim = 2
        pi = load_policy(X_dim=X_dim, model=model, continuous=True, num_actions=y_dim, n_bases=X_dim,
                         trainable_variance=args.trainable_variance, init_logstd=args.init_logstd,
                         linear=False)
        pi.load(model)

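        # Wrap the loaded network in a policy function: sample an action stochastically and log the logits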
        def linear_policy(s):
            # s = env.get_state(rbf=True)
            logits, a, state, neglogp = pi.step(s, stochastic=True, logits=True)
            log("Logits: " + str(logits))
            return a[0]


        policy_label = 'trpo'
        policy = linear_policy
    elif not args.run_policy:
        policy = input_policy
        policy_label = 'input_policy'