Code example #1
0
    def test_serialization(self):
        """Check that a fitted dynamics model survives a pickle round-trip.

        Fits the model, records its predictions, pickles it, resets the TF
        graph, unpickles it in a fresh session, and asserts the reloaded
        model predicts (almost) identically.
        """
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_1", env, hidden_sizes=(16, 16))

        # flatten the per-path arrays into single (N, dim) training arrays
        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=5)
            obs_pred = dynamics_model.predict(obs, act, pred_type='mean')

            dump_string = pickle.dumps(dynamics_model)

        # wipe the graph so the loaded model must rebuild its own variables
        tf.reset_default_graph()
        with tf.Session() as sess:
            dynamics_model_loaded = pickle.loads(dump_string)
            obs_pred_loaded = dynamics_model_loaded.predict(obs, act, pred_type='mean')

        diff = np.sum(np.abs(obs_pred_loaded - obs_pred))

        # assertAlmostEquals is a deprecated alias of assertAlmostEqual
        self.assertAlmostEqual(diff, 0, places=2)
Code example #2
0
    def test_maml_sampling(self):
        """Smoke test: fit the ensemble, then draw and process samples with
        the MAML model-vectorized sampler (one meta-batch per ensemble model)."""
        # collect random real-environment trajectories to fit the dynamics model
        env = PointEnv()
        random_paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
        dynamics_model = MLPDynamicsEnsemble("dyn_model3", env, hidden_sizes=(16, 16), num_models=4)

        observations = np.concatenate([p['observations'] for p in random_paths], axis=0)
        next_observations = np.concatenate([p['next_observations'] for p in random_paths], axis=0)
        actions = np.concatenate([p['actions'] for p in random_paths], axis=0)

        env = TfEnv(normalize(PointEnv()))

        policy = MAMLImprovedGaussianMLPPolicy(
            name="policy3",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            grad_step_size=0.1,
            hidden_nonlinearity=tf.nn.tanh,
            trainable_step_size=False,
            bias_transform=False
        )

        from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            # fit the dynamics model on the random trajectories
            dynamics_model.fit(observations, actions, next_observations, epochs=1)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
            )
            # attributes the model-based sampler reads off the algo object
            algo.meta_batch_size = dynamics_model.num_models
            algo.batch_size_dynamics_samples = algo.batch_size
            algo.dynamics_model = dynamics_model

            itr = 1
            model_sampler = MAMLModelVectorizedSampler(algo)
            model_sampler.start_worker()
            sampled_paths = model_sampler.obtain_samples(itr, return_dict=True)
            samples_data = model_sampler.process_samples(itr, sampled_paths[0])

            print(samples_data.keys())
Code example #3
0
    def test_policy_sampling(self):
        """Smoke test: fit the dynamics model, then sample and process
        trajectories through the (non-MAML) model-vectorized sampler."""
        # collect random real-environment trajectories for model fitting
        env = PointEnv()
        random_paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
        dynamics_model = MLPDynamicsEnsemble("dyn_model1", env, hidden_sizes=(16, 16))

        observations = np.concatenate([p['observations'] for p in random_paths], axis=0)
        next_observations = np.concatenate([p['next_observations'] for p in random_paths], axis=0)
        actions = np.concatenate([p['actions'] for p in random_paths], axis=0)

        env = TfEnv(normalize(PointEnv()))

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(16, 16),
            hidden_nonlinearity=tf.nn.tanh
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            # fit the dynamics model on the random trajectories
            dynamics_model.fit(observations, actions, next_observations, epochs=5)

            algo = ModelMAMLTRPO(
                env=env,
                dynamics_model=dynamics_model,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
            )
            algo.dynamics_model = dynamics_model

            itr = 1
            model_sampler = ModelVectorizedSampler(algo)
            model_sampler.start_worker()
            sampled_paths = model_sampler.obtain_samples(itr)
            samples_data = model_sampler.process_samples(itr, sampled_paths)

            print(samples_data.keys())
Code example #4
0
def run_train_task(vv):
    """Build the environment, dynamics-model ensemble and MPC policy from a
    variant dict *vv*, then run model-based batch policy optimization.

    Parameters
    ----------
    vv : dict
        Variant dictionary holding all experiment hyper-parameters.
    """
    env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

    # ensemble of MLP forward-dynamics models
    model_kwargs = dict(
        name="dyn_model",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes_model'],
        weight_normalization=vv['weight_normalization_model'],
        num_models=vv['num_models'],
        valid_split_ratio=vv['valid_split_ratio'],
        rolling_average_persitency=vv['rolling_average_persitency'],
    )
    dynamics_model = MLPDynamicsEnsemble(**model_kwargs)

    # MPC controller that plans through the learned dynamics
    policy = MPCController(
        name="policy",
        env=env,
        dynamics_model=dynamics_model,
        discount=vv['discount'],
        n_candidates=vv['n_candidates'],
        horizon=vv['horizon'],
    )

    algo_kwargs = dict(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        batch_size_env_samples=vv['batch_size_env_samples'],
        initial_random_samples=vv['initial_random_samples'],
        dynamic_model_max_epochs=vv['dynamic_model_epochs'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        reinit_model_cycle=vv['reinit_model_cycle'],
    )
    ModelMPCBatchPolopt(**algo_kwargs).train()
Code example #5
0
    def test_train_prediction_std(self):
        """Check that predict_std returns an array shaped like the observations."""
        env = PointEnv()
        trajectories = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_3", env, hidden_sizes=(16, 16), num_models=5)

        observations = np.concatenate([p['observations'] for p in trajectories], axis=0)
        next_observations = np.concatenate([p['next_observations'] for p in trajectories], axis=0)
        actions = np.concatenate([p['actions'] for p in trajectories], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            dynamics_model.fit(observations, actions, next_observations, epochs=5)

            # only the shape is checked here, not the std values themselves
            std = dynamics_model.predict_std(observations, actions)
            self.assertEqual(std.shape, observations.shape)
Code example #6
0
    def test_train_prediction(self):
        """Fit the ensemble on random point-env data and check that every
        pred_type ('mean', 'rand', 'all') runs, returns the right shape,
        and achieves a small held-out prediction error."""

        def _stack(trajectories, key):
            # concatenate one field across all trajectories into an (N, dim) array
            return np.concatenate([p[key] for p in trajectories], axis=0)

        env = PointEnv()
        np.random.seed(22)
        train_paths = sample_random_trajectories_point_env(env, num_paths=200, horizon=100)
        dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_2", env, hidden_sizes=(16, 16), num_models=5)

        obs = _stack(train_paths, 'observations')
        obs_next = _stack(train_paths, 'next_observations')
        act = _stack(train_paths, 'actions')

        # held-out trajectories for measuring prediction error
        test_paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=100)
        obs_test = _stack(test_paths, 'observations')
        obs_next_test = _stack(test_paths, 'next_observations')
        act_test = _stack(test_paths, 'actions')

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=10)

            # ensemble mean prediction
            obs_pred1 = dynamics_model.predict(obs_test, act_test, pred_type='mean')
            diff1 = np.mean(np.abs(obs_pred1 - obs_next_test) ** 2)
            self.assertEqual(obs_pred1.shape, obs.shape)
            self.assertLess(diff1, 0.01)

            # prediction from a randomly chosen ensemble member
            obs_pred2 = dynamics_model.predict(obs_test, act_test, pred_type='rand')
            diff2 = np.mean(np.abs(obs_pred2 - obs_next_test) ** 2)
            self.assertEqual(obs_pred2.shape, obs.shape)
            self.assertLess(diff2, 0.01)

            # all member predictions stacked along a trailing model axis
            obs_pred3 = dynamics_model.predict(obs_test, act_test, pred_type='all')
            self.assertEqual(obs_pred3.shape, obs.shape + (5,))
Code example #7
0
    def test_predict_model_batches3(self):
        """predict_model_batches must agree with per-model predict() even when
        each model's batch contains different observations/actions.

        Batch i of the stacked input is routed to ensemble model i:
        model 0 receives (obs, act + 0.1), model 1 receives (obs + 0.2, act).
        """
        np.random.seed(22)
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_6", env, hidden_sizes=(16, 16), num_models=2)

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        # explicit per-model inputs instead of mutating obs/act inside the loop
        obs_batches = [obs, obs + 0.2]
        act_batches = [act + 0.1, act]
        obs_stacked = np.concatenate(obs_batches, axis=0)
        act_stacked = np.concatenate(act_batches, axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            dynamics_model.fit(obs, act, obs_next, epochs=5)

            pred_obs = dynamics_model.predict_model_batches(obs_stacked, act_stacked)
            pred_obs_batches = np.split(pred_obs, 2, axis=0)
            for i, (obs_i, act_i) in enumerate(zip(obs_batches, act_batches)):
                # pred_type='all' stacks per-model outputs on the last axis
                pred_obs_single_batch = dynamics_model.predict(obs_i, act_i, pred_type='all')[:, :, i]
                diff = np.sum(np.abs(pred_obs_batches[i] - pred_obs_single_batch))
                print(diff)
                # assertAlmostEquals is a deprecated alias of assertAlmostEqual
                self.assertAlmostEqual(diff, 0)
Code example #8
0
def run_train_task(vv):
    """Run a model-based TRPO experiment described by variant dict *vv*,
    redirecting stdout to a per-experiment log file for the duration.

    Parameters
    ----------
    vv : dict
        Variant dictionary holding all experiment hyper-parameters.
    """
    import sys
    print(vv['exp_prefix'])
    sysout_log_path = os.path.join(config.LOG_DIR, 'local', vv['exp_prefix'],
                                   vv['exp_name'], 'stdout.log')
    sysout_log_file = open(sysout_log_path, 'w')
    original_stdout = sys.stdout
    sys.stdout = sysout_log_file
    try:
        env = TfEnv(normalize(vv['env'](log_scale_limit=vv['log_scale_limit'])))

        dynamics_model = MLPDynamicsEnsemble(
            name="dyn_model",
            env_spec=env.spec,
            hidden_sizes=vv['hidden_sizes_model'],
            weight_normalization=vv['weight_normalization_model'],
            num_models=vv['num_models'],
            valid_split_ratio=vv['valid_split_ratio'],
            rolling_average_persitency=vv['rolling_average_persitency'])

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=vv['hidden_sizes_policy'],
            hidden_nonlinearity=vv['hidden_nonlinearity_policy'],
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = ModelTRPO(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            baseline=baseline,
            batch_size_env_samples=vv['batch_size_env_samples'],
            batch_size_dynamics_samples=vv['batch_size_dynamics_samples'],
            initial_random_samples=vv['initial_random_samples'],
            num_gradient_steps_per_iter=vv['num_gradient_steps_per_iter'],
            max_path_length=vv['path_length'],
            n_itr=vv['n_itr'],
            retrain_model_when_reward_decreases=vv[
                'retrain_model_when_reward_decreases'],
            discount=vv['discount'],
            step_size=vv["step_size"],
            reset_policy_std=vv['reset_policy_std'],
            reinit_model_cycle=vv['reinit_model_cycle'])
        algo.train()
    finally:
        # restore stdout BEFORE closing the file, so later prints never hit a
        # closed stream; the original code left sys.stdout pointing at the
        # closed log file and never restored it on exceptions
        sys.stdout = original_stdout
        sysout_log_file.close()
Code example #9
0
    def test_predict_model_batches(self):
        """With a single-model ensemble, predict_model_batches must reproduce
        model 0's output from predict(pred_type='all')."""
        env = PointEnv()
        trajectories = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = MLPDynamicsEnsemble("dyn_ensemble_3", env, hidden_sizes=(16, 16), num_models=1)

        observations = np.concatenate([p['observations'] for p in trajectories], axis=0)
        next_observations = np.concatenate([p['next_observations'] for p in trajectories], axis=0)
        actions = np.concatenate([p['actions'] for p in trajectories], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            dynamics_model.fit(observations, actions, next_observations, epochs=5)

            batched_pred = dynamics_model.predict_model_batches(observations, actions)
            # pred_type='all' stacks per-model outputs on the last axis; take model 0
            single_pred = dynamics_model.predict(observations, actions, pred_type='all')[:, :, 0]
            diff = np.sum(np.abs(batched_pred - single_pred))
            print(diff)
            self.assertAlmostEqual(diff, 0)