def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=3, gmm_comp=1)

def policy_fn(name, ob_space, ac_space, noisy_nets=False):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=hid_size, num_hid_layers=num_hid_layers,
                                noisy_nets=noisy_nets)

def policy_fn(name, ob_space, ac_space):
    # return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

def policy_fn(name, ob_space, ac_space):
    # TODO Ensure that multiple-layers implementation is really solid
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=128, num_hid_layers=2)

def policy_fn(name, ob_space, ac_space):
    from baselines.ppo1 import mlp_policy
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

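# Illustrative usage sketch (an assumption, not part of the snippets above): policy
# factories like policy_fn are normally handed to the PPO1 trainer in OpenAI Baselines.
# The hyperparameter values below are placeholders, not values from any of the sources.
def train_sketch(env_id, seed, num_timesteps=1_000_000):
    import gym
    import baselines.common.tf_util as U
    from baselines.common import set_global_seeds
    from baselines.ppo1 import pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    env.seed(seed)
    # train the MLP policy with clipped-surrogate PPO
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
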
def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=hid_size, activation=activation,
                                interpolate=interpolate)

def test(env_id, num_episodes, model_path, seed):
    from baselines.ppo1 import mlp_policy
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    # build the policy network and list its variables
    pi = mlp_policy.MlpPolicy(name='pi',
                              ob_space=env.observation_space,
                              ac_space=env.action_space,
                              hid_size=64, num_hid_layers=2)
    pi_vars = pi.get_variables()
    for v in pi_vars:
        print(v.name)
    saveFromFlat(pi.get_variables(), model_path)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    # roll out num_episodes episodes with deterministic actions and report the mean return
    ep_rews = []
    ob = env.reset()
    for _ in tqdm(range(num_episodes)):
        ep_rew = 0
        new = False
        while not new:
            env.render()
            ac, vpred = pi.act(stochastic=False, ob=ob)
            ob, rew, new, _ = env.step(ac)
            ep_rew += rew
        ob = env.reset()
        ep_rews.append(ep_rew)
    print("----------- Summary ------------")
    print("episode mean %.3f" % np.mean(ep_rews))
    env.close()

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(
        name=name, ob_space=ob_space, ac_space=ac_space,
        hid_size=pi_hid_size, num_hid_layers=pi_num_hid_layers)

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                policy_hid_size=policy_hid_size, vf_hid_size=vf_hid_size,
                                activation_policy=activation_policy,
                                activation_vf=activation_vf)

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                layers_val=layers_val, layers_pol=layers_pol,
                                gaussian_fixed_var=False, dist=distribution)

def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
    return mlp_policy.MlpPolicy(
        name=name, ob_space=ob_space, ac_space=ac_space,
        hid_size=64, num_hid_layers=3)

def policy_fn(name, ob_space, ac_space):
    # mlp: Multi-Layer Perceptron
    # state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value)
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

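# Illustrative sketch (an assumption, not from the snippet above) of the mapping
# described in the comment: an observation goes in, an action and a predicted state
# value come out. 'Pendulum-v0' is only a placeholder environment.
def act_once_sketch():
    import gym
    import baselines.common.tf_util as U
    U.make_session(num_cpu=1).__enter__()
    env = gym.make('Pendulum-v0')
    pi = policy_fn('pi', env.observation_space, env.action_space)
    U.initialize()
    ob = env.reset()
    ac, vpred = pi.act(stochastic=True, ob=ob)  # sampled action, value estimate
    return ac, vpred
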
def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2,
                                sess=sess, placeholders=placeholders)

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2,
                                num_options=num_options, dc=dc)

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                ## MAIN CHANGES
                                hid_size_V=vf_hid_size, hid_size_actor=64,
                                num_hid_layers=2,
                                V_keep_prob=V_keep_prob, mc_samples=mc_samples,
                                layer_norm=False, activation_critic=activation_vf,
                                activation_actor=tf.nn.relu,
                                dropout_on_V=dropout_on_V,
                                sample_dropout=sample_dropout)

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2,
                                num_options=args.num_options, dc=args.dc,
                                head=args.model_type)

def test_conversion(config_dict):
    expert_policy_config = config_dict.model.expert_policy
    name = '{0}__{1}'.format(config_dict.env.name, expert_policy_config.name)
    model_file_tf = os.path.join(expert_policy_config.save_dir, '{0}.ckpt'.format(name))
    model_file_th = os.path.join(expert_policy_config.save_dir, '{0}.th.pt'.format(name))

    env = make_env(config_dict.env.name, config_dict.general.seed)
    pi_tf = mlp_policy.MlpPolicy(
        name='pi',
        ob_space=env.observation_space,
        ac_space=env.action_space,
        hid_size=expert_policy_config.hidden_size,
        num_hid_layers=expert_policy_config.num_layers)
    observations_tf = []

    with U.make_session(num_cpu=expert_policy_config.num_cpu) as sess:
        # Load TF model
        saver = tf.train.Saver(pi_tf.get_variables())
        saver.restore(tf.get_default_session(), model_file_tf)

        # Sample trajectory
        # env.seed(config_dict.general.seed)
        observation, done = env.reset(), False
        observations_tf.append(observation)
        while not done:
            action = pi_tf.act(stochastic=False, ob=observation)[0]
            observation, _, done, _ = env.step(action)
            observations_tf.append(observation)

    pi_th = NormalMLPPolicy(int(np.prod(env.observation_space.shape)),
                            int(np.prod(env.action_space.shape)),
                            expert_policy_config.hidden_size,
                            expert_policy_config.num_layers,
                            nonlinearity=nn.Tanh)
    observations_th = []

    # Load Pytorch model
    with open(model_file_th, 'rb') as f:
        state_dict = torch.load(f)
        pi_th.load_state_dict(state_dict)

    # Sample trajectory
    env.seed(config_dict.general.seed)
    observation, done = env.reset(), False
    observations_th.append(observation)
    while not done:
        observation_tensor = torch.from_numpy(observation).unsqueeze(0).float()
        action_tensor = pi_th(observation_tensor).mean[0]
        action = action_tensor.detach().cpu().numpy()
        observation, _, done, _ = env.step(action)
        observations_th.append(observation)

    # Compare the trajectories
    linf_norm = np.max(
        np.abs(np.asarray(observations_tf) - np.asarray(observations_th)))
    print('Maximum absolute difference between observations: {0}'.format(linf_norm))

def policy_fn(name, ob_space, ac_space):
    print("Policy with name: ", name)
    policy = mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                  hidden_dimension_list=hidden_dimensions)
    saver = tf.train.Saver()
    if initial_params_path is not None:
        saver.restore(sess, initial_params_path)
    return policy

def __init__(self, param_path, obs_space, action_space, hid_size, num_hid_layers):
    self.action_space = action_space
    self.actor = mlp_policy.MlpPolicy("pi", obs_space, action_space,
                                      hid_size=hid_size,
                                      num_hid_layers=num_hid_layers)
    U.initialize()
    saver = tf.train.Saver()
    saver.restore(tf.get_default_session(), param_path)

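# Illustrative companion method (an assumption, not present in the snippet above):
# how the restored actor would typically be queried for a deterministic action,
# mirroring the pi.act(stochastic, ob) calls used elsewhere in these snippets.
def act(self, obs):
    action, _vpred = self.actor.act(stochastic=False, ob=obs)
    return action
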
def __init__(self, env, sess, restore, batch=TRAINING_BATCH_SIZE):
    self.pi = mlp_policy.MlpPolicy(name='pi',
                                   ob_space=env.observation_space,
                                   ac_space=env.action_space,
                                   hid_size=64, num_hid_layers=2,
                                   training_batch_size=batch)
    self.saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='pi'))
    if restore:
        self.saver.restore(sess, "{0}/teacher.ckpt".format(base_path))

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(
        name=name, ob_space=ob_space, ac_space=ac_space,
        hid_sizes=config['hidden_layers'],
        num_hid_layers=len(config['hidden_layers']),
        gaussian_fixed_var=True,
        init_pol_weight_stddev=config['init_pol_weight_stddev'],
        init_val_weight_stddev=config['init_val_weight_stddev'],
        init_logstd=config['init_logstd'])

def policy_fn(name, ob_space, ac_space):
    if state_self_standardize:
        return mlp_norms_policy.MlpNormsPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                               hid_size=hsize, num_hid_layers=layers,
                                               gmm_comp=1)
    else:
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=hsize, num_hid_layers=layers,
                                    gmm_comp=1)

def __init__(self, env, sess, restore, klts):
    self.pi = mlp_policy.MlpPolicy(
        name="s_pi_{0}".format("klts" if klts else "klst"),
        ob_space=env.observation_space,
        ac_space=env.action_space,
        hid_size=64, num_hid_layers=2,
        training_batch_size=TRAINING_BATCH_SIZE,
        gaussian_fixed_var=False)
    self.saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES,
        scope="s_pi_{0}".format("klts" if klts else "klst")))
    if restore:
        self.saver.restore(
            sess, "{0}/student_{1}.ckpt".format(base_path, "klts" if klts else "klst"))

def load_episodes(env_id, seed, model_files):
    with tf.device('/cpu'):
        sess = U.make_session(num_cpu=1)
        sess.__enter__()
        env = gym.make(env_id)
        env.seed(seed)

        # TODO set max episode length
        env._max_episode_steps = EPISODE_MAX_LENGTH
        gym.logger.setLevel(logging.WARN)

        policy_fn = lambda name, ob_space, ac_space: mlp_policy.MlpPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
        pi = policy_fn('pi', env.observation_space, env.action_space)

        render = RENDER

        from time import ctime
        start_time = ctime()
        for model_file in tqdm(model_files):
            # TODO adjust velocity
            env.unwrapped.metadata['target_v'] = 0.1
            time_step = int(model_file[-9:])
            observations, cum_reward, distance, cum_rew_p = run_environment_episode(
                env, pi, seed, model_file, env._max_episode_steps,
                render=render, stochastic=False)
            save_full_episodes(observations, time_step, distance, cum_reward, cum_rew_p)

        print(start_time)
        print(ctime())

def policy_fn(name, ob_space, ac_space):
    if policy == "sigmoid":
        return sigmoid_policy.SigmoidPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                            hid_size=32, num_hid_layers=3)
    elif policy == "mlp":
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=32, num_hid_layers=3)
    elif policy == "beta":
        return beta_policy.BetaPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                      hid_size=32, num_hid_layers=3)

all_episodes_rew_pc = []
for item in paths_with_var_scops:
    print(list(item.values()))
    var_scope, path = list(item.values())[0]
    with tf.variable_scope(str(var_scope)):
        sess = U.make_session(num_cpu=1)
        sess.__enter__()
        policy_fn = lambda name, ob_space, ac_space: mlp_policy.MlpPolicy(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
        pi = policy_fn('pi', env.observation_space, env.action_space)
        gym.logger.setLevel(logging.WARN)
        model_file = get_latest_model_file(path)
        distance_rew = 0
        rew_p = 0
        for s in range(configs["runs_per_model"]):
            single_episode_distance, single_episode_rew_p = run_environment_episode(

def policy_fn(name, ob_space, ac_space, params=params):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=int(params[1]), num_hid_layers=int(params[2]))

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2, gaussian_fixed_var=True)

def policy_fn(name, ob_space, ac_space, reuse=False):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                reuse=reuse, hid_size=64, num_hid_layers=2)

def main(logdir, checkpoint, human_render, num_rollouts, max_episode_length,
         save_videos, save_rollouts, save_separate_rollouts):
    if not osp.exists(osp.join(logdir, 'run.json')):
        raise FileNotFoundError("Could not find run.json.")
    configuration = json.load(open(osp.join(logdir, 'run.json'), 'r'))
    if configuration["settings"]["method"] not in ["trpo", "ppo"]:
        raise NotImplementedError(
            "Playback for %s has not been implemented yet."
            % configuration["settings"]["method"])
    env = utils.create_environment(configuration["settings"]["environment"])

    # build policy network
    # TODO this needs to be more general
    from baselines.ppo1 import mlp_policy
    tf.Session().__enter__()
    pi = mlp_policy.MlpPolicy(
        name="pi",
        ob_space=env.observation_space,
        ac_space=env.action_space,
        hid_size=configuration["settings"].get('pi_hid_size', 150),
        num_hid_layers=configuration["settings"].get('pi_num_hid_layers', 3))

    # find latest policy checkpoint
    saver = tf.train.Saver()
    if checkpoint is None:
        files = glob.glob(osp.join(logdir, 'checkpoints') + '/*.index')
        files = [(int(re.findall(r".*?_(\d+)\.", f)[0]), f) for f in files]
        files = sorted(files, key=operator.itemgetter(0))
        checkpoint = files[-1][1]
    elif not osp.isabs(checkpoint):
        if not osp.exists(osp.join(logdir, 'checkpoints')):
            raise FileNotFoundError("Could not find checkpoints folder")
        else:
            checkpoint = osp.join(logdir, 'checkpoints', checkpoint)
    if checkpoint.endswith(".index"):
        checkpoint = checkpoint[:-len(".index")]
    print("Loading checkpoint %s." % checkpoint)
    saver.restore(tf.get_default_session(), checkpoint)

    # generate rollouts
    rollouts = []
    for i_rollout in tqdm(range(num_rollouts), "Computing rollouts"):
        observation = env.reset()
        rollout = {"observation": [], "reward": [], "action": []}
        video = []
        for i_episode in range(max_episode_length):
            action, _ = pi.act(stochastic=False, ob=observation)
            observation, reward, done, _ = env.step(action)
            if human_render:
                env.render(mode='human')
            if save_videos is not None:
                video.append(env.render(mode='rgb_array'))
            if save_rollouts is not None:
                rollout["observation"].append(observation)
                rollout["reward"].append(reward)
                rollout["action"].append(action)
            if done:
                break
        if save_videos is not None:
            imageio.mimsave(osp.join(save_videos, 'rollout_%i.mp4' % i_rollout),
                            video, fps=env.metadata.get('video.frames_per_second', 50))
        if save_rollouts is not None and save_separate_rollouts:
            pkl.dump(rollout,
                     open(osp.join(save_rollouts, 'rollout_%i.pkl' % i_rollout), "wb"))
        else:
            rollouts.append(rollout)
    if save_rollouts is not None and not save_separate_rollouts:
        pkl.dump(rollouts, open(osp.join(save_rollouts, 'rollouts.pkl'), "wb"))

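# Illustrative CLI wrapper (an assumption: the original entry point is not shown in
# this excerpt; the flag names simply mirror main()'s parameters).
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='Replay a trained TRPO/PPO policy from a log directory.')
    parser.add_argument('logdir')
    parser.add_argument('--checkpoint', default=None)
    parser.add_argument('--human-render', action='store_true')
    parser.add_argument('--num-rollouts', type=int, default=10)
    parser.add_argument('--max-episode-length', type=int, default=1000)
    parser.add_argument('--save-videos', default=None)
    parser.add_argument('--save-rollouts', default=None)
    parser.add_argument('--save-separate-rollouts', action='store_true')
    args = parser.parse_args()
    main(args.logdir, args.checkpoint, args.human_render, args.num_rollouts,
         args.max_episode_length, args.save_videos, args.save_rollouts,
         args.save_separate_rollouts)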