from trlib.environments.dam import Dam
from trlib.policies.valuebased import EpsilonGreedy
from trlib.policies.qfunction import ZeroQ
from sklearn.ensemble import ExtraTreesRegressor
from trlib.algorithms.callbacks import get_callback_list_entry
import numpy as np
from trlib.experiments.experiment import RepeatExperiment
from trlib.utilities.data import load_object
from trlib.algorithms.transfer.wfqi import WFQI, estimate_weights_mean
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
from trlib.environments.acrobot_multitask import AcrobotMultitask

""" --- ENVIRONMENTS --- """

target_mdp = AcrobotMultitask(m1=1.0, m2=1.0, l1=1.0, l2=1.0, task="swing-up")

actions = [0, 1]
source_data = [load_object("source_data_" + str(i)) for i in [1, 2]]

""" --- PARAMS --- """

regressor_params = {'n_estimators': 50,
                    'criterion': 'mse',
                    'min_samples_split': 5,
                    'min_samples_leaf': 2}

initial_states = [np.array([-2.0, 0., 0., 0.]),
                  np.array([-1.5, 0., 0., 0.]),
                  np.array([-1.0, 0., 0., 0.]),
                  np.array([-0.5, 0., 0., 0.]),
                  np.array([0.0, 0., 0., 0.])]
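# --- Illustrative sketch, not part of the original script ---
# The imports above suggest the next step is building the exploration policy
# passed to WFQI. The construction below assumes trlib's
# EpsilonGreedy(actions, Q, epsilon) signature and uses a placeholder epsilon;
# treat both as assumptions rather than the experiment's actual settings.
pi = EpsilonGreedy(actions, ZeroQ(), 0.1)  # epsilon = 0.1 is an illustrative value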
from sklearn.model_selection import train_test_split
from trlib.utilities.data import load_object, save_object
from trlib.utilities.interaction import generate_episodes


def generate_source(mdp, n_episodes, test_fraction, file_name, policy=None,
                    policy_file_name=None, kernel_rw=None, kernel_st=None,
                    load_data=False, fit_rw=True, fit_st=True,
                    subtract_noise_rw=False, subtract_noise_st=False):
    """
    Generates source data for WFQI and fits the GPs.

    Parameters
    ----------
    mdp: the MDP to use
    n_episodes: the number of episodes to collect (if load_data is False)
    test_fraction: fraction of the data used for testing the GPs
    file_name: the file where the data is loaded from / saved to
    policy: the policy to use
    policy_file_name: the file from which the policy is loaded (ignored if policy is not None)
    kernel_rw: the kernel for fitting the reward GP
    kernel_st: the kernels for fitting the transition GPs (one per state dimension)
    load_data: whether data should be loaded or generated
    fit_rw: whether the reward GP should be fitted
    fit_st: whether the transition GPs should be fitted
    subtract_noise_rw: whether the noise fitted by the reward GP should be subtracted
    subtract_noise_st: whether the noise fitted by the transition GPs should be subtracted
    """

    if load_data:
        print("Loading data")
        data = load_object(file_name)
        source_samples = data[0]
        rw_pred = data[1]
        st_pred = data[2]
    else:
        print("Collecting episodes")
        source_policy = policy if policy is not None else load_object(policy_file_name)
        source_samples = generate_episodes(mdp, source_policy, n_episodes)
        rw_pred = None
        st_pred = None

    # Column indices into the sample matrix: the action columns start at a_idx,
    # the reward is at r_idx and the next state starts at s_idx
    a_idx = 1 + mdp.state_dim
    r_idx = a_idx + mdp.action_dim
    s_idx = r_idx + 1

    X = source_samples[:, 1:r_idx]

    if fit_rw:
        print("Fitting reward GP")
        y = source_samples[:, r_idx]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)
        rw_pred = _fit_gp(X, X_train, X_test, y_train, y_test, kernel_rw, subtract_noise_rw)

    if fit_st:
        st_pred = []
        for d in range(mdp.state_dim):
            print("Fitting transition GP " + str(d))
            y = source_samples[:, (s_idx + d):(s_idx + d + 1)]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)
            # _fit_gp is a module-level helper defined alongside this function
            st_pred.append(_fit_gp(X, X_train, X_test, y_train, y_test, kernel_st[d], subtract_noise_st))

    data = [source_samples, rw_pred, st_pred]
    save_object(data, file_name)
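# --- Hypothetical usage sketch ---
# How generate_source might be called for one source task. The MDP parameters,
# file names, episode count and kernel hyper-parameters below are illustrative
# placeholders, not the values used in the original experiments.
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
from trlib.environments.acrobot_multitask import AcrobotMultitask

source_mdp = AcrobotMultitask(m1=0.9, m2=0.9, l1=1.0, l2=1.0, task="swing-up")  # hypothetical source task

kernel_rw = ConstantKernel(1.0) * RBF(length_scale=1.0)
kernel_st = [ConstantKernel(1.0) * RBF(length_scale=1.0) for _ in range(source_mdp.state_dim)]

generate_source(source_mdp, n_episodes=20, test_fraction=0.2,
                file_name="source_data_1",
                policy_file_name="source_policy_1",  # hypothetical saved policy file
                kernel_rw=kernel_rw, kernel_st=kernel_st,
                load_data=False, fit_rw=True, fit_st=True)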
from trlib.policies.qfunction import ZeroQ
from sklearn.ensemble import ExtraTreesRegressor
from trlib.algorithms.callbacks import get_callback_list_entry
import numpy as np
from trlib.experiments.experiment import RepeatExperiment
from trlib.algorithms.transfer.lazaric2008 import Lazaric2008
from trlib.algorithms.reinforcement.fqi import FQI
from trlib.algorithms.transfer.laroche2017 import Laroche2017
from trlib.utilities.data import load_object
from trlib.environments.dam import Dam

""" --- ENVIRONMENTS --- """

target_mdp = Dam(inflow_profile=1, alpha=0.3, beta=0.7)

actions = [0, 3, 5, 7, 10, 15, 20, 30]
source_data = [load_object("source_data_" + str(i))[0] for i in [1, 2, 3, 4, 5, 6]]

""" --- PARAMS --- """

regressor_params = {'n_estimators': 100,
                    'criterion': 'mse',
                    'min_samples_split': 10}

initial_states = [np.array([200.0, 1]) for _ in range(10)]

callback_list = []
callback_list.append(get_callback_list_entry("eval_greedy_policy_callback",
                                              field_name="perf_disc_greedy",
                                              criterion='discounted',
                                              initial_states=initial_states))
from trlib.utilities.data import load_object
from trlib.policies.policy import Uniform
from trlib.environments.puddleworld import PuddleWorld
import numpy as np

""" --- ENVIRONMENTS --- """

target_mdp = PuddleWorld(goal_x=5, goal_y=10,
                         puddle_means=[(1.0, 4.0), (1.0, 10.0), (1.0, 8.0),
                                       (6.0, 6.0), (6.0, 4.0)],
                         puddle_var=[(.7, 1.e-5, 1.e-5, .7), (.8, 1.e-5, 1.e-5, .8),
                                     (.8, 1.e-5, 1.e-5, .8), (.8, 1.e-5, 1.e-5, .8),
                                     (.8, 1.e-5, 1.e-5, .8)],
                         puddle_slow=False)

actions = [0, 1, 2, 3]
source_data = [load_object("source_data_" + str(i))[0] for i in [1, 2, 3]]

""" --- PARAMS --- """

uniform_policy = Uniform(actions)

regressor_params = {'n_estimators': 50,
                    'criterion': 'mse',
                    'min_samples_split': 2,
                    'min_samples_leaf': 1}

initial_states = [np.array([0., 0.]) for _ in range(5)]

callback_list = []
#callback_list.append(get_callback_list_entry("eval_policy_callback", field_name = "perf_disc", criterion = 'discounted', initial_states = [np.array([0.,0.]) for _ in range(5)]))