Example #1
from trlib.environments.dam import Dam
from trlib.policies.valuebased import EpsilonGreedy
from trlib.policies.qfunction import ZeroQ
from sklearn.ensemble import ExtraTreesRegressor
from trlib.algorithms.callbacks import get_callback_list_entry
import numpy as np
from trlib.experiments.experiment import RepeatExperiment
from trlib.utilities.data import load_object
from trlib.algorithms.transfer.wfqi import WFQI, estimate_weights_mean
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
from trlib.environments.acrobot_multitask import AcrobotMultitask
""" --- ENVIRONMENTS --- """
target_mdp = AcrobotMultitask(m1=1.0, m2=1.0, l1=1.0, l2=1.0, task="swing-up")

actions = [0, 1]
source_data = [load_object("source_data_" + str(i)) for i in [1, 2]]
""" --- PARAMS --- """

regressor_params = {
    'n_estimators': 50,
    'criterion': 'mse',
    'min_samples_split': 5,
    'min_samples_leaf': 2
}

initial_states = [
    np.array([-2.0, 0., 0., 0.]),
    np.array([-1.5, 0., 0., 0.]),
    np.array([-1.0, 0., 0., 0.]),
    np.array([-0.5, 0., 0., 0.]),
    np.array([0.0, 0., 0., 0.]),
]
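
The fragment above stops after the environment, source data, and regressor setup. A minimal sketch of one way it could continue is given below: the scikit-learn kernel construction is standard, but every EpsilonGreedy and WFQI argument name is an assumption about the trlib interface, not something taken from this example.

# Sketch only: constructor argument names for EpsilonGreedy and WFQI are assumed.
kernel_rw = ConstantKernel(1.0) * RBF(length_scale=1.0)        # reward GP kernel
kernel_st = [ConstantKernel(1.0) * RBF(length_scale=1.0)       # one transition GP kernel
             for _ in range(target_mdp.state_dim)]             # per state dimension

pi = EpsilonGreedy(actions, ZeroQ(), 0.1)                      # assumed signature

algorithm = WFQI(target_mdp, pi, actions,                      # assumed constructor layout
                 batch_size=10,
                 max_iterations=60,
                 regressor_type=ExtraTreesRegressor,
                 source_datasets=source_data,
                 kernel_rw=kernel_rw,
                 kernel_st=kernel_st,
                 weight_estimator=estimate_weights_mean,
                 **regressor_params)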
Example #2
# Imports this helper needs but the fragment omits: train_test_split comes from
# scikit-learn; load_object is imported from trlib.utilities.data elsewhere on
# this page and save_object is assumed to live in the same module; the
# generate_episodes and _fit_gp helpers are assumed to be available from the
# surrounding trlib utilities.
from sklearn.model_selection import train_test_split
from trlib.utilities.data import load_object, save_object

def generate_source(mdp,
                    n_episodes,
                    test_fraction,
                    file_name,
                    policy=None,
                    policy_file_name=None,
                    kernel_rw=None,
                    kernel_st=None,
                    load_data=False,
                    fit_rw=True,
                    fit_st=True,
                    subtract_noise_rw=False,
                    subtract_noise_st=False):
    """
    Generates source data for WFQI and fits the GPs
    
    Parameters
    ----------
    mdp: the MDP to use
    n_episodes: the number of episodes to collect (if load_data is False)
    test_fraction: fraction of the data used for testing the GPs
    file_name: the file where to load/save
    policy: the policy to use
    policy_file_name: the file where to load the policy (ignored if policy is not None)
    kernel_rw: the kernel for fitting the reward GP
    kernel_st: the kernels for fitting the transition GPs (one per state dimension)
    load_data: whether data should be loaded or generated
    fit_rw: whether the reward should be fitted
    fit_st: whether the state transitions should be fitted
    subtract_noise_rw: whether the noise fitted by the reward GP should be subtracted
    subtract_noise_st: whether the noise fitted by the transition GP should be subtracted
    """
    if load_data:
        print("Loading data")
        data = load_object(file_name)
        source_samples = data[0]
        rw_pred = data[1]
        st_pred = data[2]
    else:
        print("Collecting episodes")
        source_policy = policy if policy is not None else load_object(
            policy_file_name)
        source_samples = generate_episodes(mdp, source_policy, n_episodes)
        rw_pred = None
        st_pred = None

    a_idx = 1 + mdp.state_dim
    r_idx = a_idx + mdp.action_dim
    s_idx = r_idx + 1

    X = source_samples[:, 1:r_idx]

    if fit_rw:
        print("Fitting reward GP")
        y = source_samples[:, r_idx]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_fraction)
        rw_pred = _fit_gp(X, X_train, X_test, y_train, y_test, kernel_rw,
                          subtract_noise_rw)

    if fit_st:
        st_pred = []
        for d in range(mdp.state_dim):
            print("Fitting transition GP " + str(d))
            y = source_samples[:, (s_idx + d):(s_idx + d + 1)]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_fraction)
            st_pred.append(
                _fit_gp(X, X_train, X_test, y_train, y_test, kernel_st[d],
                        subtract_noise_st))

    data = [source_samples, rw_pred, st_pred]
    save_object(data, file_name)
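
The object saved at the end of generate_source is exactly the three-element list built above, which is why the other scripts on this page sometimes index into it when only raw samples are needed. The file name below just follows the naming pattern used elsewhere on this page.

# data[0] -> source_samples (raw transitions)
# data[1] -> reward GP predictions, or None when fit_rw is False
# data[2] -> list of transition GP predictions, one per state dimension
full_data = load_object("source_data_1")        # WFQI scripts keep the GP predictions
samples_only = load_object("source_data_1")[0]  # FQI-style scripts use only the samples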
from trlib.policies.qfunction import ZeroQ
from sklearn.ensemble import ExtraTreesRegressor
from trlib.algorithms.callbacks import get_callback_list_entry
import numpy as np
from trlib.experiments.experiment import RepeatExperiment
from trlib.algorithms.transfer.lazaric2008 import Lazaric2008
from trlib.algorithms.reinforcement.fqi import FQI
from trlib.algorithms.transfer.laroche2017 import Laroche2017
from trlib.utilities.data import load_object
from trlib.environments.dam import Dam
""" --- ENVIRONMENTS --- """
target_mdp = Dam(inflow_profile=1, alpha=0.3, beta=0.7)

actions = [0, 3, 5, 7, 10, 15, 20, 30]
source_data = [
    load_object("source_data_" + str(i))[0] for i in [1, 2, 3, 4, 5, 6]
]
""" --- PARAMS --- """

regressor_params = {
    'n_estimators': 100,
    'criterion': 'mse',
    'min_samples_split': 10
}

initial_states = [np.array([200.0, 1]) for _ in range(10)]

callback_list = []
callback_list.append(
    get_callback_list_entry("eval_greedy_policy_callback",
                            field_name="perf_disc_greedy",
import numpy as np
# PuddleWorld import path assumed; both numpy and PuddleWorld are used below
# but were missing from this fragment.
from trlib.environments.puddleworld import PuddleWorld
from trlib.utilities.data import load_object
from trlib.policies.policy import Uniform
""" --- ENVIRONMENTS --- """
target_mdp = PuddleWorld(goal_x=5,
                         goal_y=10,
                         puddle_means=[(1.0, 4.0), (1.0, 10.0), (1.0, 8.0),
                                       (6.0, 6.0), (6.0, 4.0)],
                         puddle_var=[(.7, 1.e-5, 1.e-5, .7),
                                     (.8, 1.e-5, 1.e-5, .8),
                                     (.8, 1.e-5, 1.e-5, .8),
                                     (.8, 1.e-5, 1.e-5, .8),
                                     (.8, 1.e-5, 1.e-5, .8)],
                         puddle_slow=False)

actions = [0, 1, 2, 3]
source_data = [load_object("source_data_" + str(i))[0] for i in [1, 2, 3]]
""" --- PARAMS --- """

uniform_policy = Uniform(actions)

regressor_params = {
    'n_estimators': 50,
    'criterion': 'mse',
    'min_samples_split': 2,
    'min_samples_leaf': 1
}

initial_states = [np.array([0., 0.]) for _ in range(5)]

callback_list = []
#callback_list.append(get_callback_list_entry("eval_policy_callback", field_name = "perf_disc", criterion = 'discounted', initial_states = [np.array([0.,0.]) for _ in range(5)]))
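
The uniform policy defined above, together with the commented-out evaluation callback, suggests this script sits on the data-collection side of the puddle-world transfer experiment. A hedged sketch of producing the source files it loads, using the generate_source helper shown earlier on this page, is given below; the kernel hyper-parameters, episode counts, and the source_mdps list are illustrative placeholders.

# Illustrative only: source_mdps would be the source-task variants of PuddleWorld.
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

kernel_rw = ConstantKernel(1.0) * RBF(length_scale=1.0)
kernel_st = [ConstantKernel(1.0) * RBF(length_scale=1.0) for _ in range(2)]  # 2D puddle-world state

for i, source_mdp in enumerate(source_mdps, start=1):
    generate_source(source_mdp,
                    n_episodes=50,
                    test_fraction=0.2,
                    file_name="source_data_" + str(i),
                    policy=uniform_policy,
                    kernel_rw=kernel_rw,
                    kernel_st=kernel_st)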