def test_maml_sampling(self):
    """Smoke test: the MAML model-based sampler runs end-to-end on PointEnv."""
    # Collect random rollouts from the raw env to fit the dynamics ensemble.
    raw_env = PointEnv()
    rollouts = sample_random_trajectories_point_env(raw_env, num_paths=100, horizon=100)
    dynamics_model = MLPDynamicsEnsemble("dyn_model3", raw_env,
                                         hidden_sizes=(16, 16), num_models=4)

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    env = TfEnv(normalize(PointEnv()))
    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy3",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        grad_step_size=0.1,
        hidden_nonlinearity=tf.nn.tanh,
        trainable_step_size=False,
        bias_transform=False,
    )

    from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Fit the dynamics model before sampling imaginary rollouts from it.
        dynamics_model.fit(obs, act, obs_next, epochs=1)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=20000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
        )
        # The MAML model sampler reads these attributes off the algorithm object.
        algo.meta_batch_size = dynamics_model.num_models
        algo.batch_size_dynamics_samples = algo.batch_size
        algo.dynamics_model = dynamics_model

        itr = 1
        model_sampler = MAMLModelVectorizedSampler(algo)
        model_sampler.start_worker()
        paths = model_sampler.obtain_samples(itr, return_dict=True)
        samples_data = model_sampler.process_samples(itr, paths[0])
        print(samples_data.keys())
def test_policy_sampling(self):
    """Smoke test: ModelVectorizedSampler obtains and processes samples."""
    raw_env = PointEnv()
    rollouts = sample_random_trajectories_point_env(raw_env, num_paths=100, horizon=100)
    dynamics_model = MLPDynamicsEnsemble("dyn_model1", raw_env, hidden_sizes=(16, 16))

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    env = TfEnv(normalize(PointEnv()))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Fit the dynamics model before sampling imaginary rollouts from it.
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        algo = ModelMAMLTRPO(
            env=env,
            dynamics_model=dynamics_model,
            policy=policy,
            baseline=baseline,
            batch_size=20000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
        )
        algo.dynamics_model = dynamics_model

        itr = 1
        model_sampler = ModelVectorizedSampler(algo)
        model_sampler.start_worker()
        paths = model_sampler.obtain_samples(itr)
        samples_data = model_sampler.process_samples(itr, paths)
        print(samples_data.keys())
def test_training(self):
    """Smoke test: a full ModelTRPO training run on PointEnv completes."""
    env = TfEnv(normalize(PointEnv()))
    # Fix both TF and NumPy seeds so the run is reproducible.
    tf.set_random_seed(22)
    np.random.seed(22)

    policy = GaussianMLPPolicy(name="policy", env_spec=env.spec,
                               hidden_sizes=(16, 16),
                               hidden_nonlinearity=tf.nn.tanh)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    dynamics_model = MLPDynamicsModel("dyn_model", env, hidden_sizes=(16, 16))

    algo = ModelTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        batch_size_env_samples=5000,
        initial_random_samples=10000,
        batch_size_dynamics_samples=40000,
        max_path_length=100,
        dynamic_model_epochs=(30, 10),
        num_gradient_steps_per_iter=2,
        n_itr=20,
        discount=0.99,
        step_size=0.001,
    )
    algo.train()
def test_train_prediction2(self):
    """Train with input normalization; check prediction shape and accuracy."""
    env = PointEnv()
    rollouts = sample_random_trajectories_point_env(env, num_paths=500, horizon=100)
    dynamics_model = MLPDynamicsModel("dyn_model_2b", env, hidden_sizes=(32, 32),
                                      normalize_input=True)

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    # Synthetic test set built with the target relation obs_next = obs + act,
    # which the model is expected to have learned from the rollouts.
    obs_test = np.random.uniform(-2, 2, size=(20000, 2))
    act_test = np.random.uniform(-0.1, 0.1, size=(20000, 2))
    obs_next_test = obs_test + act_test

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=20, verbose=True)

        obs_next_pred = dynamics_model.predict(obs_test, act_test)
        mean_diff = np.mean(np.abs(obs_next_test - obs_next_pred))
        print("Mean Diff:", mean_diff)

        self.assertEqual(obs_next_pred.shape, obs_test.shape)
        self.assertLessEqual(mean_diff, 0.01)
def test_train_prediction(self):
    """Check ensemble predict() shape and accuracy for each pred_type."""
    env = PointEnv()
    np.random.seed(22)
    rollouts = sample_random_trajectories_point_env(env, num_paths=200, horizon=100)
    dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_2", env,
                                         hidden_sizes=(16, 16), num_models=5)

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    # Held-out rollouts used as the evaluation set.
    test_rollouts = sample_random_trajectories_point_env(env, num_paths=10, horizon=100)
    obs_test = np.concatenate([p['observations'] for p in test_rollouts], axis=0)
    obs_next_test = np.concatenate([p['next_observations'] for p in test_rollouts], axis=0)
    act_test = np.concatenate([p['actions'] for p in test_rollouts], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=10)

        # 'mean': aggregated prediction -> same shape as the observations.
        pred_mean = dynamics_model.predict(obs_test, act_test, pred_type='mean')
        self.assertEqual(pred_mean.shape, obs_test.shape)
        self.assertLess(np.mean(np.abs(pred_mean - obs_next_test) ** 2), 0.01)

        # 'rand': single-model prediction -> same shape as the observations.
        pred_rand = dynamics_model.predict(obs_test, act_test, pred_type='rand')
        self.assertEqual(pred_rand.shape, obs_test.shape)
        self.assertLess(np.mean(np.abs(pred_rand - obs_next_test) ** 2), 0.01)

        # 'all': one prediction per ensemble member -> trailing axis of size 5.
        pred_all = dynamics_model.predict(obs_test, act_test, pred_type='all')
        self.assertEqual(pred_all.shape, obs_test.shape + (5,))
def test_predict_model_batches3(self):
    """Batched prediction with noisy models must differ from a second pass."""
    np.random.seed(22)
    env = PointEnv()
    rollouts = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_6", env,
                                         hidden_sizes=(16, 16), num_models=2,
                                         output_bias_range=0.01,
                                         gaussian_noise_output_std=0.01)

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    # Batch 0 gets shifted actions, batch 1 gets shifted observations.
    obs_stacked = np.concatenate([obs, obs + 0.2], axis=0)
    act_stacked = np.concatenate([act + 0.1, act], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        pred_obs = dynamics_model.predict_model_batches(obs_stacked, act_stacked)
        pred_obs_batches = np.split(pred_obs, 2, axis=0)

        # Per-batch inputs in exactly the order they were stacked above.
        batch_inputs = [(obs, act + 0.1), (obs + 0.2, act)]
        for i, (obs_i, act_i) in enumerate(batch_inputs):
            pred_single = dynamics_model.predict(obs_i, act_i, pred_type='all')[:, :, i]
            diff = np.sum(np.abs(pred_obs_batches[i] - pred_single))
            print(diff)
            # Gaussian output noise is enabled, so the two passes must not match.
            self.assertGreaterEqual(diff, 10.0)
def test_serialization(self):
    """Pickling and unpickling the ensemble must preserve its predictions.

    Fits the model, records a reference prediction, round-trips the model
    through pickle into a fresh TF graph/session, and checks the reloaded
    model reproduces the reference prediction.
    """
    env = PointEnv()
    rollouts = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_1", env, hidden_sizes=(16, 16))

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)
        obs_pred = dynamics_model.predict(obs, act, pred_type='mean')
        dump_string = pickle.dumps(dynamics_model)

    # Rebuild the model from the pickle payload in a brand-new graph/session.
    tf.reset_default_graph()
    with tf.Session() as sess:
        dynamics_model_loaded = pickle.loads(dump_string)
        obs_pred_loaded = dynamics_model_loaded.predict(obs, act, pred_type='mean')

        diff = np.sum(np.abs(obs_pred_loaded - obs_pred))
        # BUG FIX: assertAlmostEquals is a deprecated alias and was removed in
        # Python 3.12 -- use assertAlmostEqual.
        self.assertAlmostEqual(diff, 0, places=2)
def test_train_prediction_std(self):
    """predict_std must return an array with the same shape as the observations."""
    env = PointEnv()
    rollouts = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_3", env,
                                         hidden_sizes=(16, 16), num_models=5)

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        std = dynamics_model.predict_std(obs, act)
        self.assertEqual(std.shape, obs.shape)
def main():
    """Parse CLI arguments, seed the RNGs, build the env, and launch training."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='PointEnv')
    # Experiment meta-params
    parser.add_argument('--exp_name', type=str, default='mb_mpc')
    parser.add_argument('--seed', type=int, default=3)
    parser.add_argument('--render', action='store_true')
    # Training args
    parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3)
    parser.add_argument('--onpol_iters', '-n', type=int, default=15)
    parser.add_argument('--dyn_iters', '-nd', type=int, default=60)
    parser.add_argument('--batch_size', '-b', type=int, default=512)
    # Data collection
    parser.add_argument('--random_paths', '-r', type=int, default=1000)  # TODO change back to 10000
    parser.add_argument('--onpol_paths', '-d', type=int, default=10)
    parser.add_argument('--simulated_paths', '-sp', type=int, default=10)  # TODO change back to 1000
    parser.add_argument('--ep_len', '-ep', type=int, default=1000)
    # Neural network architecture args
    parser.add_argument('--n_layers', '-l', type=int, default=2)
    parser.add_argument('--size', '-s', type=int, default=500)
    # MPC Controller
    parser.add_argument('--mpc_horizon', '-m', type=int, default=15)
    args = parser.parse_args()

    # Set seed
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Make env.
    # BUG FIX: the original used `args.env_name is "PointEnv"`, i.e. identity
    # comparison against a string literal. That only works when CPython happens
    # to intern both strings and fails for values parsed from the command line;
    # string equality must use `==`.
    if args.env_name == "PointEnv":
        env = PointEnv()
        reward_fn = reward_fn_point_env
    else:
        # Fail loudly instead of hitting a NameError on an undefined `env` below.
        raise ValueError("unknown env_name: {}".format(args.env_name))

    train(env=env,
          reward_fn=reward_fn,
          render=args.render,
          learning_rate=args.learning_rate,
          onpol_iters=args.onpol_iters,
          dynamics_iters=args.dyn_iters,
          batch_size=args.batch_size,
          num_paths_random=args.random_paths,
          num_paths_onpol=args.onpol_paths,
          num_simulated_paths=args.simulated_paths,
          env_horizon=args.ep_len,
          mpc_horizon=args.mpc_horizon,
          )
def test_predict_model_batches(self):
    """With a single model, batched prediction equals the per-model prediction."""
    env = PointEnv()
    rollouts = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_3", env,
                                         hidden_sizes=(16, 16), num_models=1)

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        pred_batched = dynamics_model.predict_model_batches(obs, act)
        # Model 0's prediction, taken from the 'all' output's trailing axis.
        pred_single = dynamics_model.predict(obs, act, pred_type='all')[:, :, 0]

        diff = np.sum(np.abs(pred_batched - pred_single))
        print(diff)
        self.assertAlmostEqual(diff, 0)
def test_train_prediction1(self):
    """Fit on synthetic additive dynamics and verify near-exact prediction."""
    env = PointEnv()
    # Synthetic training set with known dynamics: next_obs = obs + act.
    obs = np.random.uniform(-2, 2, size=(20000, 2))
    act = np.random.uniform(-0.1, 0.1, size=(20000, 2))
    next_obs = obs + act

    dynamics_model = MLPDynamicsModel("dyn_model_2a", env, hidden_sizes=(32, 32),
                                      normalize_input=False)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, next_obs, epochs=10, verbose=True)

        # Fresh test set drawn from the same distribution.
        obs_test = np.random.uniform(-2, 2, size=(20000, 2))
        act_test = np.random.uniform(-0.1, 0.1, size=(20000, 2))
        obs_next_test = obs_test + act_test

        obs_next_pred = dynamics_model.predict(obs_test, act_test)
        mean_diff = np.mean(np.abs(obs_next_test - obs_next_pred))
        print("Mean Diff:", mean_diff)

        self.assertEqual(obs_next_pred.shape, obs_test.shape)
        self.assertLessEqual(mean_diff, 0.01)
def test_predict_model_batches2(self):
    """With noise disabled, batched prediction must match per-model prediction."""
    np.random.seed(22)
    env = PointEnv()
    rollouts = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_5", env,
                                         hidden_sizes=(16, 16), num_models=2,
                                         output_bias_range=0.0,
                                         gaussian_noise_output_std=0.0)

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        pred_obs = dynamics_model.predict_model_batches(obs, act)
        pred_obs_batches = np.split(pred_obs, 2, axis=0)

        # BUG FIX: the original sliced obs[i*5000:(i+1)*5000], but this dataset
        # only has num_paths * horizon = 100 rows, so batch 1's slice was empty
        # and the comparison could not line up with np.split's halves. Derive
        # the per-model batch length from the data instead.
        batch_len = obs.shape[0] // 2
        for i in range(2):
            lo, hi = i * batch_len, (i + 1) * batch_len
            pred_single = dynamics_model.predict(obs[lo:hi], act[lo:hi],
                                                 pred_type='all')[:, :, i]
            diff = np.sum(np.abs(pred_obs_batches[i] - pred_single))
            print(diff)
            # assertAlmostEquals is a deprecated alias (removed in Python 3.12).
            self.assertAlmostEqual(diff, 0)
def test_train_prediction_performance(self):
    """Train on many long rollouts and require low MSE on held-out rollouts."""
    env = PointEnv()
    rollouts = sample_random_trajectories_point_env(env, num_paths=500, horizon=500)
    dynamics_model = MLPDynamicsModel("dyn_model_3", env, hidden_sizes=(16, 16))

    obs = np.concatenate([p['observations'] for p in rollouts], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in rollouts], axis=0)
    act = np.concatenate([p['actions'] for p in rollouts], axis=0)

    # Held-out evaluation rollouts.
    test_rollouts = sample_random_trajectories_point_env(env, num_paths=10, horizon=100)
    obs_test = np.concatenate([p['observations'] for p in test_rollouts], axis=0)
    obs_next_test = np.concatenate([p['next_observations'] for p in test_rollouts], axis=0)
    act_test = np.concatenate([p['actions'] for p in test_rollouts], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=20)

        next_obs_pred = dynamics_model.predict(obs_test, act_test)
        diff = np.mean(np.abs(next_obs_pred - obs_next_test) ** 2)
        print("DIFF:", diff)
        self.assertLess(diff, 0.05)