def test_serialization(self):
    """Pickling and unpickling a fitted ensemble must reproduce its mean predictions.

    Fits the model, predicts, round-trips it through pickle in a fresh TF graph,
    and checks the predictions agree to ~2 decimal places (summed abs diff).
    """
    env = PointEnv()
    paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_1", env, hidden_sizes=(16, 16))

    obs = np.concatenate([path['observations'] for path in paths], axis=0)
    obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
    act = np.concatenate([path['actions'] for path in paths], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)
        obs_pred = dynamics_model.predict(obs, act, pred_type='mean')
        dump_string = pickle.dumps(dynamics_model)

    # reload into a clean graph so restored variables cannot alias the old ones
    tf.reset_default_graph()
    with tf.Session() as sess:
        dynamics_model_loaded = pickle.loads(dump_string)
        obs_pred_loaded = dynamics_model_loaded.predict(obs, act, pred_type='mean')
        diff = np.sum(np.abs(obs_pred_loaded - obs_pred))
        # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
        self.assertAlmostEqual(diff, 0, places=2)
def test_maml_sampling(self):
    """Smoke test: MAMLModelVectorizedSampler yields samples that process_samples accepts."""
    # collect random rollouts for fitting the dynamics model
    env = PointEnv()
    paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
    dynamics_model = MLPDynamicsEnsemble("dyn_model3", env, hidden_sizes=(16, 16), num_models=4)

    obs = np.concatenate([p['observations'] for p in paths], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in paths], axis=0)
    act = np.concatenate([p['actions'] for p in paths], axis=0)

    env = TfEnv(normalize(PointEnv()))

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy3",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        grad_step_size=0.1,
        hidden_nonlinearity=tf.nn.tanh,
        trainable_step_size=False,
        bias_transform=False,
    )

    from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    with tf.Session() as sess:
        # fit dynamics model on the random rollouts
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=1)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=20000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
        )
        # wire the sampler's expected attributes onto the algo object
        algo.meta_batch_size = dynamics_model.num_models
        algo.batch_size_dynamics_samples = algo.batch_size
        algo.dynamics_model = dynamics_model

        itr = 1
        model_sampler = MAMLModelVectorizedSampler(algo)
        model_sampler.start_worker()
        paths = model_sampler.obtain_samples(itr, return_dict=True)
        samples_data = model_sampler.process_samples(itr, paths[0])

        print(samples_data.keys())
def test_policy_sampling(self):
    """Smoke test: ModelVectorizedSampler collects and processes rollouts without error."""
    # random rollouts used to fit the dynamics model
    env = PointEnv()
    paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
    dynamics_model = MLPDynamicsEnsemble("dyn_model1", env, hidden_sizes=(16, 16))

    obs = np.concatenate([p['observations'] for p in paths], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in paths], axis=0)
    act = np.concatenate([p['actions'] for p in paths], axis=0)

    env = TfEnv(normalize(PointEnv()))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    with tf.Session() as sess:
        # fit dynamics model before sampling from it
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        algo = ModelMAMLTRPO(
            env=env,
            dynamics_model=dynamics_model,
            policy=policy,
            baseline=baseline,
            batch_size=20000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
        )
        algo.dynamics_model = dynamics_model

        itr = 1
        model_sampler = ModelVectorizedSampler(algo)
        model_sampler.start_worker()
        paths = model_sampler.obtain_samples(itr)
        samples_data = model_sampler.process_samples(itr, paths)

        print(samples_data.keys())
def run_train_task(vv):
    """Train an MPC controller backed by a dynamics ensemble, configured by variant dict `vv`."""
    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    dynamics_model = MLPDynamicsEnsemble(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        valid_split_ratio=vv['valid_split_ratio'],
        # NOTE: "persitency" spelling matches the model's keyword argument
        rolling_average_persitency=vv['rolling_average_persitency'],
    )

    policy = MPCController(
        name="policy",
        env=env,
        dynamics_model=dynamics_model,
        discount=vv['discount'],
        n_candidates=vv['n_candidates'],
        horizon=vv['horizon'],
    )

    algo = ModelMPCBatchPolopt(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        batch_size_env_samples=vv['batch_size_env_samples'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_max_epochs=vv['dynamic_model_epochs'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        reinit_model_cycle=vv['reinit_model_cycle'],
    )
    algo.train()
def test_train_prediction_std(self):
    """Shape check: predict_std must return an array shaped like the observations."""
    env = PointEnv()
    paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_3", env, hidden_sizes=(16, 16), num_models=5)

    obs = np.concatenate([p['observations'] for p in paths], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in paths], axis=0)
    act = np.concatenate([p['actions'] for p in paths], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        std = dynamics_model.predict_std(obs, act)
        self.assertEqual(std.shape, obs.shape)
def test_train_prediction(self):
    """Train the ensemble and verify prediction shapes and held-out MSE for all pred_types."""
    env = PointEnv()
    np.random.seed(22)  # deterministic rollouts so the MSE thresholds are stable
    paths = sample_random_trajectories_point_env(env, num_paths=200, horizon=100)
    dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_2", env, hidden_sizes=(16, 16), num_models=5)

    obs = np.concatenate([p['observations'] for p in paths], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in paths], axis=0)
    act = np.concatenate([p['actions'] for p in paths], axis=0)

    # separate held-out rollouts for evaluating generalization
    paths_test = sample_random_trajectories_point_env(env, num_paths=10, horizon=100)
    obs_test = np.concatenate([p['observations'] for p in paths_test], axis=0)
    obs_next_test = np.concatenate([p['next_observations'] for p in paths_test], axis=0)
    act_test = np.concatenate([p['actions'] for p in paths_test], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=10)

        # ensemble mean prediction
        obs_pred1 = dynamics_model.predict(obs_test, act_test, pred_type='mean')
        diff1 = np.mean(np.abs(obs_pred1 - obs_next_test) ** 2)
        self.assertEqual(obs_pred1.shape, obs.shape)
        self.assertLess(diff1, 0.01)

        # a randomly picked ensemble member per sample
        obs_pred2 = dynamics_model.predict(obs_test, act_test, pred_type='rand')
        diff2 = np.mean(np.abs(obs_pred2 - obs_next_test) ** 2)
        self.assertEqual(obs_pred2.shape, obs.shape)
        self.assertLess(diff2, 0.01)

        # all members stacked along a trailing axis of size num_models
        obs_pred3 = dynamics_model.predict(obs_test, act_test, pred_type='all')
        self.assertEqual(obs_pred3.shape, obs.shape + (5,))
def test_predict_model_batches3(self):
    """predict_model_batches must agree with per-model predict() on per-model input batches.

    Builds a stacked batch where model 0 receives (obs, act + 0.1) and model 1
    receives (obs + 0.2, act), then checks each half against pred_type='all'.
    """
    np.random.seed(22)
    env = PointEnv()
    paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_6", env, hidden_sizes=(16, 16), num_models=2)

    obs = np.concatenate([path['observations'] for path in paths], axis=0)
    obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
    act = np.concatenate([path['actions'] for path in paths], axis=0)

    # per-model inputs, stacked so the first half routes to model 0, second half to model 1
    obs_stacked = np.concatenate([obs, obs + 0.2], axis=0)
    act_stacked = np.concatenate([act + 0.1, act], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        pred_obs = dynamics_model.predict_model_batches(obs_stacked, act_stacked)
        pred_obs_batches = np.split(pred_obs, 2, axis=0)

        # explicit per-model inputs instead of mutating obs/act across iterations
        per_model_inputs = [(obs, act + 0.1), (obs + 0.2, act)]
        for i, (obs_i, act_i) in enumerate(per_model_inputs):
            pred_obs_single_batch = dynamics_model.predict(obs_i, act_i, pred_type='all')[:, :, i]
            diff = np.sum(np.abs(pred_obs_batches[i] - pred_obs_single_batch))
            print(diff)
            # assertAlmostEquals is a deprecated alias (removed in Python 3.12)
            self.assertAlmostEqual(diff, 0)
def run_train_task(vv):
    """Train ModelTRPO configured by variant dict `vv`, redirecting stdout to a log file.

    BUG FIX: the original replaced sys.stdout with the log file and never restored
    it (nor closed the file on error), so any later print in the process would hit
    a closed/stale stream. stdout is now restored and the file closed in a finally.
    """
    import sys
    print(vv['exp_prefix'])
    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'], vv['exp_name'], 'stdout.log')
    sysout_log_file = open(sysout_log_path, 'w')
    original_stdout = sys.stdout
    sys.stdout = sysout_log_file
    try:
        env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

        dynamics_model = MLPDynamicsEnsemble(
            name="dyn_model",
            env_spec=env.spec,
            hidden_sizes=vv['hidden_sizes_model'],
            weight_normalization=vv['weight_normalization_model'],
            num_models=vv['num_models'],
            valid_split_ratio=vv['valid_split_ratio'],
            # NOTE: "persitency" spelling matches the model's keyword argument
            rolling_average_persitency=vv['rolling_average_persitency'],
        )

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=vv['hidden_sizes_policy'],
            hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = ModelTRPO(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            baseline=baseline,
            batch_size_env_samples=vv['batch_size_env_samples'],
            batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
            initial_random_samples=vv['initial_random_samples'],
            num_gradient_steps_per_iter=vv['num_gradient_steps_per_iter'],
            max_path_length=vv['path_length'],
            n_itr=vv['n_itr'],
            retrain_model_when_reward_decreases=vv['retrain_model_when_reward_decreases'],
            discount=vv['discount'],
            step_size=vv["step_size"],
            reset_policy_std=vv['reset_policy_std'],
            reinit_model_cycle=vv['reinit_model_cycle'],
        )
        algo.train()
    finally:
        # always restore the real stdout and release the file handle
        sys.stdout = original_stdout
        sysout_log_file.close()
def test_predict_model_batches(self):
    """With a single-member ensemble, predict_model_batches must equal predict(pred_type='all')."""
    env = PointEnv()
    paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
    dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_3", env, hidden_sizes=(16, 16), num_models=1)

    obs = np.concatenate([p['observations'] for p in paths], axis=0)
    obs_next = np.concatenate([p['next_observations'] for p in paths], axis=0)
    act = np.concatenate([p['actions'] for p in paths], axis=0)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        batched = dynamics_model.predict_model_batches(obs, act)
        single = dynamics_model.predict(obs, act, pred_type='all')[:, :, 0]

        diff = np.sum(np.abs(batched - single))
        print(diff)
        self.assertAlmostEqual(diff, 0)