def main(args): """ start training the model :param args: (ArgumentParser) the training argument """ with tf_util.make_session(num_cpu=1): set_global_seeds(args.seed) env = gym.make(args.env_id) def policy_fn(name, ob_space, ac_space, reuse=False, sess=None): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "monitor.json")) env.seed(args.seed) gym.logger.setLevel(logging.WARN) task_name = get_task_name(args) args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name) args.log_dir = os.path.join(args.log_dir, task_name) dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) savedir_fname = learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, ckpt_dir=args.checkpoint_dir, task_name=task_name, verbose=True) runner(env, policy_fn, savedir_fname, timesteps_per_batch=1024, number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample, reuse=True)
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
    savedir_fname = learn(env, policy_fn, dataset, max_iters=args.BC_max_iter,
                          ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir,
                          task_name=task_name, verbose=True)
    avg_len, avg_ret = runner(env, policy_fn, savedir_fname, timesteps_per_batch=1024,
                              number_trajs=10, stochastic_policy=args.stochastic_policy,
                              save=args.save_sample, reuse=True)
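# main() above unpacks runner(...) as (avg_len, avg_ret). A minimal sketch of that
# contract, assuming a baselines-style policy whose act(stochastic, ob) returns
# (action, value); checkpoint loading, the save flag, and graph reuse are elided here.
def runner_sketch(env, policy, number_trajs=10, stochastic_policy=False, horizon=1024):
    lens, rets = [], []
    for _ in range(number_trajs):
        ob = env.reset()
        done, ep_len, ep_ret = False, 0, 0.0
        while not done and ep_len < horizon:
            ac, _ = policy.act(stochastic_policy, ob)  # assumed act() signature
            ob, rew, done, _ = env.step(ac)
            ep_len += 1
            ep_ret += rew
        lens.append(ep_len)
        rets.append(ep_ret)
    return sum(lens) / len(lens), sum(rets) / len(rets)  # (avg_len, avg_ret)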
def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix):
    def get_checkpoint_dir(checkpoint_list, limit, prefix):
        for checkpoint in checkpoint_list:
            # debug output: show each candidate and whether it matches the limit
            print(checkpoint, limit, ('limitation_' + str(limit) in checkpoint))
            if ('limitation_' + str(limit) in checkpoint) and (prefix in checkpoint):
                return checkpoint
        return None

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2)

    data_path = os.path.join('data', 'deterministic.trpo.' + env_name + '.0.00.npz')
    dataset = load_dataset(data_path)
    checkpoint_list = glob.glob(os.path.join('checkpoint', '*' + env_name + ".*"))
    log = {
        'traj_limitation': [],
        'upper_bound': [],
        'avg_ret': [],
        'avg_len': [],
        'normalized_ret': []
    }
    for i, limit in enumerate(CONFIG['traj_limitation']):
        # Do one evaluation
        upper_bound = sum(dataset.rets[:limit]) / limit
        checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix)
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
        env = gym.make(env_name + '-v1')
        env.seed(seed)
        print('Trajectory limitation: {}, Load checkpoint: {}'.format(limit, checkpoint_path))
        avg_len, avg_ret = run_mujoco.runner(env, policy_fn, checkpoint_path,
                                             timesteps_per_batch=1024, number_trajs=10,
                                             stochastic_policy=stochastic,
                                             reuse=((i != 0) or reuse))
        normalized_ret = avg_ret / upper_bound
        print('Upper bound: {}, evaluation returns: {}, normalized scores: {}'.format(
            upper_bound, avg_ret, normalized_ret))
        log['traj_limitation'].append(limit)
        log['upper_bound'].append(upper_bound)
        log['avg_ret'].append(avg_ret)
        log['avg_len'].append(avg_len)
        log['normalized_ret'].append(normalized_ret)
        env.close()
    return log
def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix):
    """
    Evaluate an environment

    :param env_name: (str) the environment name
    :param seed: (int) the initial random seed
    :param policy_hidden_size: (int) the number of neurons in each of the two hidden layers of the MLP policy
    :param stochastic: (bool) use a stochastic policy
    :param reuse: (bool) allow reuse of the graph
    :param prefix: (str) the checkpoint prefix for the type ('BC' or 'gail')
    :return: (dict) the logging information of the evaluation
    """
    def _get_checkpoint_dir(checkpoint_list, limit, prefix):
        for checkpoint in checkpoint_list:
            if ('limitation_' + str(limit) in checkpoint) and (prefix in checkpoint):
                return checkpoint
        return None

    def _policy_fn(name, ob_space, ac_space, reuse=False, sess=None):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess,
                                    reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2)

    data_path = os.path.join('data', 'deterministic.trpo.' + env_name + '.0.00.npz')
    dataset = load_dataset(data_path)
    checkpoint_list = glob.glob(os.path.join('checkpoint', '*' + env_name + ".*"))
    log = {
        'traj_limitation': [],
        'upper_bound': [],
        'avg_ret': [],
        'avg_len': [],
        'normalized_ret': []
    }
    for i, limit in enumerate(CONFIG['traj_limitation']):
        # Do one evaluation
        upper_bound = sum(dataset.rets[:limit]) / limit
        checkpoint_dir = _get_checkpoint_dir(checkpoint_list, limit, prefix=prefix)
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
        env = gym.make(env_name + '-v1')
        env.seed(seed)
        print('Trajectory limitation: {}, Load checkpoint: {}'.format(limit, checkpoint_path))
        avg_len, avg_ret = run_mujoco.runner(env, _policy_fn, checkpoint_path,
                                             timesteps_per_batch=1024, number_trajs=10,
                                             stochastic_policy=stochastic,
                                             reuse=((i != 0) or reuse))
        normalized_ret = avg_ret / upper_bound
        print('Upper bound: {}, evaluation returns: {}, normalized scores: {}'.format(
            upper_bound, avg_ret, normalized_ret))
        log['traj_limitation'].append(limit)
        log['upper_bound'].append(upper_bound)
        log['avg_ret'].append(avg_ret)
        log['avg_len'].append(avg_len)
        log['normalized_ret'].append(normalized_ret)
        env.close()
    return log
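# Hedged usage sketch for evaluate_env above: compare behavior-cloning and GAIL
# checkpoints on one environment. The 'BC'/'gail' prefixes follow the docstring;
# the seed and hidden size are arbitrary. The second call passes reuse=True because
# the first already built the "pi" variable scope in the TF graph.
bc_log = evaluate_env('Hopper', seed=0, policy_hidden_size=100,
                      stochastic=False, reuse=False, prefix='BC')
gail_log = evaluate_env('Hopper', seed=0, policy_hidden_size=100,
                        stochastic=False, reuse=True, prefix='gail')
for limit, bc_ret, gail_ret in zip(bc_log['traj_limitation'],
                                   bc_log['normalized_ret'],
                                   gail_log['normalized_ret']):
    print('{} expert trajs -> BC: {:.3f}, GAIL: {:.3f}'.format(limit, bc_ret, gail_ret))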
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, task_name=None,
          stochastic_policy=False, save_sample=False, verbose=False):
    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # construct network for new policy
    # placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    # loss = tf.reduce_mean(tf.square(ac - pi.ac))
    loss = tf.reduce_mean(pi.pd.neglogp(ac))  # negative log-likelihood of expert actions
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    if hasattr(pi, "obs_rms"):
        pi.obs_rms.update(dataset.obs)  # update running mean/std for policy
        print("Update obs normalization.")
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, False)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))
            # evaluate the current policy; this variant expects runner to return a dict
            # of statistics rather than the (avg_len, avg_ret) pair used elsewhere
            eval_infos = runner(env, policy_func, None, timesteps_per_batch=1024,
                                number_trajs=10, stochastic_policy=stochastic_policy,
                                save=save_sample, reuse=True)
            logger.record_tabular("iter_so_far", iter_so_far + 1)
            for (key, value) in eval_infos.items():
                logger.record_tabular(key, value)
            logger.dump_tabular()

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
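# learn() above only touches the dataset through get_next_batch(batch_size, split) and
# the obs attribute. A minimal in-memory sketch of that contract, assuming expert data
# already loaded as arrays; the real Mujoco_Dset also handles .npz loading, trajectory
# limits, and shuffling.
import numpy as np

class DsetSketch:
    def __init__(self, obs, acs, train_fraction=0.7):
        n_train = int(len(obs) * train_fraction)
        self.obs = obs  # consumed by pi.obs_rms.update(dataset.obs)
        self._splits = {'train': (obs[:n_train], acs[:n_train]),
                        'val': (obs[n_train:], acs[n_train:])}

    def get_next_batch(self, batch_size, split):
        obs, acs = self._splits[split]
        if batch_size < 0:  # batch_size == -1 returns the whole split
            return obs, acs
        idx = np.random.randint(0, len(obs), size=batch_size)
        return obs[idx], acs[idx]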
def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix):
    def get_checkpoint_dir(checkpoint_list, limit, prefix):
        for checkpoint in checkpoint_list:
            if ('limitation_' + str(limit) in checkpoint) and (prefix in checkpoint):
                return checkpoint
        return None

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2)

    data_path = os.path.join('data', 'deterministic.trpo.' + env_name + '.0.00.npz')
    dataset = load_dataset(data_path)
    checkpoint_list = glob.glob(os.path.join('checkpoint', '*' + env_name + ".*"))
    log = {
        'traj_limitation': [],
        'upper_bound': [],
        'avg_ret': [],
        'avg_len': [],
        'normalized_ret': []
    }
    for i, limit in enumerate(CONFIG['traj_limitation']):
        # Do one evaluation
        upper_bound = sum(dataset.rets[:limit]) / limit
        checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix)
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
        env = gym.make(env_name + '-v1')
        env.seed(seed)
        print('Trajectory limitation: {}, Load checkpoint: {}'.format(limit, checkpoint_path))
        avg_len, avg_ret = run_mujoco.runner(env, policy_fn, checkpoint_path,
                                             timesteps_per_batch=1024, number_trajs=10,
                                             stochastic_policy=stochastic,
                                             reuse=((i != 0) or reuse))
        normalized_ret = avg_ret / upper_bound
        print('Upper bound: {}, evaluation returns: {}, normalized scores: {}'.format(
            upper_bound, avg_ret, normalized_ret))
        log['traj_limitation'].append(limit)
        log['upper_bound'].append(upper_bound)
        log['avg_ret'].append(avg_ret)
        log['avg_len'].append(avg_len)
        log['normalized_ret'].append(normalized_ret)
        env.close()
    return log
def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix):
    def get_checkpoint_dir(checkpoint_list, limit, prefix):
        for checkpoint in checkpoint_list:
            if ('limitation_' + str(limit) in checkpoint) and (prefix in checkpoint):
                return checkpoint
        return None

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2)

    data_path = os.path.join('data', 'deterministic.trpo.' + env_name + '.0.00.npz')
    dataset = load_dataset(data_path)
    checkpoint_list = glob.glob(os.path.join('checkpoint', '*' + env_name + ".*"))
    log = {
        'traj_limitation': [],
        'upper_bound': [],
        'avg_ret': [],
        'avg_len': [],
        'normalized_ret': []
    }
    for i, limit in enumerate(CONFIG['traj_limitation']):
        # Do one evaluation
        upper_bound = sum(dataset.rets[:limit]) / limit
        checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix)
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
        env = gym.make(env_name + '-v1')  # FIX: with MuJoCo 1.50, MuJoCo envs are -v2
        # ENHANCEMENT: generate and save videos to checkpoint_dir.
        # Fails with "ERROR: GLEW initalization error: Missing GL version" on MuJoCo 1.50;
        # set LD_PRELOAD as a workaround, see https://github.com/openai/mujoco-py/issues/44
        env = wrappers.Monitor(env, checkpoint_dir, force=True)
        env.seed(seed)
        print('Trajectory limitation: {}, Load checkpoint: {}'.format(limit, checkpoint_path))
        avg_len, avg_ret = run_mujoco.runner(env, policy_fn, checkpoint_path,
                                             timesteps_per_batch=1024, number_trajs=10,
                                             stochastic_policy=stochastic,
                                             reuse=((i != 0) or reuse))
        normalized_ret = avg_ret / upper_bound
        print('Upper bound: {}, evaluation returns: {}, normalized scores: {}'.format(
            upper_bound, avg_ret, normalized_ret))
        log['traj_limitation'].append(limit)
        log['upper_bound'].append(upper_bound)
        log['avg_ret'].append(avg_ret)
        log['avg_len'].append(avg_len)
        log['normalized_ret'].append(normalized_ret)
        env.close()
    return log
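# All evaluate_env variants above read a module-level CONFIG for the expert-trajectory
# limits. A sketch of that global, assuming the limits used by the baselines GAIL
# evaluation script; each entry must match a 'limitation_<n>' checkpoint directory on
# disk, and a TF session must already be open before the first call builds the policy
# graph (e.g. U.make_session(num_cpu=1).__enter__() as in main() above).
CONFIG = {'traj_limitation': [1, 5, 10, 50]}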