import argparse

# `load_policy`, `SawyerReachXYZEnv`, and `FlatGoalEnv` are project imports
# not shown in this excerpt.

parser = argparse.ArgumentParser()
parser.add_argument('--saved_model', type=str,
                    default='./logs/ddpg_test_SawyerPushAndReachArenaEnv-v0/ddpg/ddpg_s0/')
parser.add_argument('--len', '-l', type=int, default=50)
parser.add_argument('--episodes', '-n', type=int, default=100)
# store_false: rendering is on by default; passing --render/-nr turns it off.
parser.add_argument('--render', '-nr', action='store_false')
parser.add_argument('--itr', '-i', type=int, default=-1,
                    help='Iteration to load from the saved folder (-1 loads the last one)')
parser.add_argument('--deterministic', '-d', action='store_true')
parser.add_argument('--use_tensorboard', action='store_true')
parser.add_argument('--logdir', type=str, default='./logs/ddpg_test')
parser.add_argument('--exp_name', type=str, default='evaluate')
parser.add_argument('--env', type=str, default='SawyerReachXYEnv-v1')
args = parser.parse_args()

_, get_action = load_policy(args.saved_model,
                            args.itr if args.itr >= 0 else 'last',
                            args.deterministic)
tensor_board = None
env = SawyerReachXYZEnv(action_mode='position',
                        position_action_scale=0.1,
                        config_name='austri_config',
                        reset_free=False,
                        max_speed=0.05,
                        fix_goal=False,
                        fixed_goal=(0.53, 0.0, 0.15))
env = FlatGoalEnv(env, append_goal_to_obs=True)
env.reset()
logdir_ext = args.logdir + '_' + args.env + '_evaluate'
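# A minimal sketch of the evaluation loop that presumably follows this setup;
# it only illustrates how `get_action` and the parsed arguments fit together,
# and assumes the usual gym step()/reset() interface on `env`.
for ep in range(args.episodes):
    obs, ep_ret = env.reset(), 0.0
    for t in range(args.len):
        if args.render:
            env.render()
        obs, reward, done, _ = env.step(get_action(obs))
        ep_ret += reward
        if done:
            break
    print('Episode %d: return %.3f' % (ep, ep_ret))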
def PGIRL(demonstrations=None, model=None, grad_path=None, features_idx=None,
          normalize_f=False, save_grad=True, opt_iters=10, compute_jacobian=False,
          estimate_weights=None, num_episodes=-1, pickled=False, continuous=False,
          num_hidden=8, num_layers=0, agent_name=None):
    if features_idx is None:
        features_idx = [0, 1, 2]
    logger = {}

    # Read or calculate the gradients.
    if args.read_grads:
        if grad_path:  # covers both None (the default) and ''
            print("Reading gradients from:", grad_path)
            estimated_gradients = np.load(grad_path, allow_pickle=True)
        else:
            estimated_gradients = np.load(gradient_path + "estimated_gradients.npy",
                                          allow_pickle=True)
        estimated_gradients = estimated_gradients[:, :, features_idx]
        if num_episodes > 0:
            estimated_gradients = estimated_gradients[:num_episodes, :, :]
        if args.filter_gradients:
            estimated_gradients = filter_grads(estimated_gradients, verbose=args.verbose)
    else:
        if pickled:
            states_data = np.load(demonstrations + 'real_states.pkl', allow_pickle=True)
            actions_data = np.load(demonstrations + 'actions.pkl', allow_pickle=True)
            reward_data = np.load(demonstrations + 'rewards.pkl', allow_pickle=True)
            X_dataset = states_data[agent_name]
            y_dataset = actions_data[agent_name]
            r_dataset = reward_data[agent_name]
            # Debug check left in the original (the input() call blocked execution):
            # print(np.sum(np.array(y_dataset) == 1)); input()
            dones_dataset = None
        else:
            # Read trajectories from disk.
            X_dataset, y_dataset, _, _, r_dataset, dones_dataset = \
                read_trajectories(demonstrations,
                                  all_columns=True,
                                  fill_size=EPISODE_LENGTH,
                                  fix_goal=True,
                                  cont_actions=args.continuous or args.lqg)
        if num_episodes > 0:
            X_dataset = X_dataset[:EPISODE_LENGTH * num_episodes]
            y_dataset = y_dataset[:EPISODE_LENGTH * num_episodes]
            r_dataset = r_dataset[:EPISODE_LENGTH * num_episodes]
            if dones_dataset is not None:
                dones_dataset = dones_dataset[:EPISODE_LENGTH * num_episodes]
        X_dim = len(X_dataset[0])
        if continuous:
            y_dim = len(y_dataset[0])
        else:
            y_dim = 2

        # Create the policy used for gradient estimation.
        linear = 'gpomdp' in model
        policy_train = load_policy(X_dim=X_dim, model=model, continuous=continuous,
                                   num_actions=y_dim, n_bases=X_dim,
                                   trainable_variance=args.trainable_variance,
                                   init_logstd=args.init_logstd, linear=linear,
                                   num_hidden=num_hidden, num_layers=num_layers)
        print('Loading dataset... done')

        # Compute the gradient estimate.
        estimated_gradients, _ = compute_gradient(policy_train, X_dataset, y_dataset,
                                                  r_dataset, dones_dataset,
                                                  EPISODE_LENGTH, GAMMA, features_idx,
                                                  verbose=args.verbose,
                                                  use_baseline=args.baseline,
                                                  use_mask=args.mask,
                                                  scale_features=args.scale_features,
                                                  filter_gradients=args.filter_gradients,
                                                  normalize_f=normalize_f)
        if save_grad:
            print("Saving gradients in", gradient_path)
            np.save(gradient_path + 'estimated_gradients.npy', estimated_gradients)

    # Solve PGIRL or rank-approximation PGIRL.
    if args.girl:
        weights_girl, loss_girl = solve_PGIRL(estimated_gradients, verbose=args.verbose)
        estimate_weights = weights_girl
    if args.rank_approx:
        weights, loss, jacobian = solve_ra_PGIRL(estimated_gradients,
                                                 verbose=args.verbose,
                                                 cov_estimation=args.cov_estimation,
                                                 diag=args.diag,
                                                 identity=args.identity,
                                                 num_iters=opt_iters,
                                                 compute_jacobian=compute_jacobian,
                                                 other_options=[False, False, args.masked_cov])
        if estimate_weights is not None or args.girl:
            mu, sigma = estimate_distribution_params(
                estimated_gradients=estimated_gradients,
                diag=args.diag, identity=args.identity,
                cov_estimation=args.cov_estimation, girl=False,
                other_options=[False, False, args.masked_cov])
            id_matrix = np.identity(estimated_gradients.shape[1])
            lf = make_loss_function(mu, sigma, id_matrix)
            estimated_loss = lf(estimate_weights)
        if compute_jacobian:
            print("Jacobian rank:")
            print(np.linalg.matrix_rank(jacobian))
            print("Jacobian singular values:")
            _, s, _ = np.linalg.svd(jacobian)
            print(s)
    else:
        weights, loss = solve_PGIRL(estimated_gradients, verbose=args.verbose)

    print("Weights:", weights)
    print("Loss:", loss)
    if args.girl:
        print("Weights GIRL:", weights_girl)
        print("Loss GIRL:", loss_girl)
    if estimate_weights is not None or args.girl:
        # Note: estimated_loss is only computed in the rank_approx branch above.
        print("Loss at the given weights:", estimated_loss)
    return logger, weights
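# A hypothetical call to PGIRL, included only to illustrate the interface;
# the demonstration directory and model path below are placeholders, not
# files from the original project.
logger, weights = PGIRL(demonstrations='demonstrations/',
                        model='bc/models/agent/gpomdp/best',
                        features_idx=[0, 1, 2],
                        num_episodes=100,
                        continuous=args.continuous)
print('Recovered reward weights:', weights)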
for agent_name in agent_to_data:
    X_dataset = states_data[agent_name]
    y_dataset = actions_data[agent_name]
    r_dataset = reward_data[agent_name]
    X_dim = len(X_dataset[0])
    y_dim = 2

    # Create the policy for this agent.
    model = 'bc/models/' + agent_name + '/2000_22/best'
    linear = 'gpomdp' in model
    print('load policy..')
    policy_train = load_policy(X_dim=X_dim, model=model, continuous=False,
                               num_actions=y_dim, n_bases=X_dim,
                               trainable_variance=args.trainable_variance,
                               init_logstd=args.init_logstd, linear=linear,
                               num_hidden=args.num_hidden, num_layers=args.num_layers)
    print('Loading dataset... done')

    # Compute the gradient estimate. The original snippet is truncated after
    # GAMMA; the closing argument below is assumed to mirror the call inside
    # PGIRL above.
    estimated_gradients, _ = compute_gradient(policy_train, X_dataset, y_dataset,
                                              r_dataset, None, args.ep_len, GAMMA,
                                              features_idx)
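    # Presumably each per-agent gradient estimate is then passed to the PGIRL
    # solver, as in the PGIRL function above; a sketch:
    weights, loss = solve_PGIRL(estimated_gradients, verbose=args.verbose)
    print('Agent %s -> weights: %s, loss: %s' % (agent_name, weights, loss))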
    # Tail of the grasping routine whose definition precedes this excerpt.
    grasp.go_to_place_position()
    grasp.request_grasp(grasp.msg_open)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--saved_model', type=str,
                        default='/home/tung/workspace/ee331_project/thanh_code/sawyer/ddpg/logs/ddpg_test_SawyerReachXYZEnv_multi_real_2/ddpg/ddpg_s0/')
    parser.add_argument('--len', '-l', type=int, default=60)
    parser.add_argument('--deterministic', '-d', action='store_true')
    parser.add_argument('--logdir', type=str,
                        default='/home/tung/workspace/ee331_project/thanh_code/sawyer/ddpg/logs/ddpg_test')
    parser.add_argument('--exp_name', type=str, default='evaluate')
    parser.add_argument('--env', type=str, default='SawyerReachXYZenv_multi')
    args = parser.parse_args()

    # global env
    global get_action  # no-op at module level; kept from the original
    _, get_action = load_policy(args.saved_model, 'last', args.deterministic)
    main()
for i, agent in enumerate(agent_to_data.keys()):
    read_path = agent_to_data[agent][0]
    if not args.read_grads:
        paths = glob.glob(read_path + "/*/*trajectories.csv")
        for p in paths:
            states, actions, _, _, features, dones = \
                read_trajectories(p, all_columns=True, fill_size=EPISODE_LENGTH,
                                  cont_actions=True)
            X_dim = len(states[0])
            model_path = read_path + "/best"
            linear = 'gpomdp' in model_path
            pi = load_policy(X_dim=X_dim, model=model_path, continuous=True,
                             num_actions=2,
                             trainable_variance=args.trainable_variance,
                             init_logstd=args.init_logstd, linear=linear)
            if args.num_episodes > 0:
                states = states[:EPISODE_LENGTH * args.num_episodes]
                actions = actions[:EPISODE_LENGTH * args.num_episodes]
                features = features[:EPISODE_LENGTH * args.num_episodes]
                dones = dones[:EPISODE_LENGTH * args.num_episodes]
            # The original snippet is truncated after EPISODE_LENGTH; the
            # trailing arguments are assumed to follow the same order as the
            # compute_gradient call in PGIRL (GAMMA, then the feature indices).
            estimated_gradients, _ = compute_gradient(pi, states, actions, features,
                                                      dones, EPISODE_LENGTH, GAMMA,
                                                      features_idx)
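            # Sketch of persisting the per-file estimate, mirroring the save
            # path used in PGIRL above; the per-agent filename is an assumed
            # naming scheme, not the original one.
            np.save(gradient_path + agent + '_estimated_gradients.npy',
                    estimated_gradients)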
        dx, dy = 0, 0
    return dx, dy


policy = input_policy
if args.run_policy or args.debug_model:
    tf.reset_default_graph()
    network = mlp(num_hidden=32, num_layers=0)
    model = direction_to_model[args.direction]
    linear = 'gpomdp' in model
    X_dim = W * H
    y_dim = 2
    # Note: the computed `linear` flag is not forwarded; the original passes
    # linear=False explicitly.
    pi = load_policy(X_dim=X_dim, model=model, continuous=True,
                     num_actions=y_dim, n_bases=X_dim,
                     trainable_variance=args.trainable_variance,
                     init_logstd=args.init_logstd, linear=False)
    pi.load(model)

    def linear_policy(s):
        # s = env.get_state(rbf=True)
        logits, a, state, neglogp = pi.step(s, stochastic=True, logits=True)
        log("Logits: " + str(logits))
        return a[0]

    policy_label = 'trpo'
    policy = linear_policy
elif not args.run_policy:
    policy = input_policy
    policy_label = 'input_policy'
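# A minimal sketch of driving whichever policy was selected above, assuming a
# gridworld `env` with a gym-style interface defined elsewhere in this script;
# it only illustrates how `policy` and `policy_label` are consumed.
s = env.reset()
done = False
while not done:
    a = policy(s)
    s, r, done, _ = env.step(a)
    log(policy_label + " action: " + str(a) + " reward: " + str(r))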