def main():
    # General env properties
    env_tgt = gym.make('minigolf-v0')
    env_src = gym.make('minigolf-v0')
    param_space_size = 4
    state_space_size = 1
    env_param_space_size = 4
    episode_length = 20
    gaussian_transitions = False

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length,
                            gaussian_transitions)

    mean_initial_param = np.random.normal(np.ones(param_space_size) * 0.2, 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    feats = polynomial

    simulation_param = sc.SimulationParam(
        mean_initial_param, variance_initial_param, variance_action,
        arguments.batch_size, arguments.iterations, arguments.gamma, None,
        arguments.learning_rate, arguments.ess_min,
        "Yes" if arguments.adaptive else "No", arguments.n_min,
        use_adam=arguments.use_adam)

    # Source tasks
    pis = [[0.20097172, 0.20182519, 0.19957835, 0.20096946],
           [0.34099334, 0.21422279, 0.20053974, 0.20105477],
           [0.46923638, 0.22986188, 0.20266549, 0.20137892],
           [0.64977232, 0.26575410, 0.21014003, 0.20300604],
           [0.89955698, 0.32707635, 0.23490234, 0.21518798],
           [1.09006747, 0.35577241, 0.24517702, 0.22017502],
           [1.22329955, 0.40621784, 0.28787368, 0.24836521],
           [1.34824502, 0.43750823, 0.29981691, 0.25448715],
           [1.24846429, 0.42882867, 0.27008977, 0.22433061],
           [1.41946655, 0.53908188, 0.33195278, 0.25586648]]

    putter_length = np.random.uniform(0.7, 1.0, arguments.n_source_models)
    friction = np.random.uniform(0.1, 0.15, arguments.n_source_models)
    hole_size = np.random.uniform(0.10, 0.15, arguments.n_source_models)
    envs = [[putter_length[i], friction[i], hole_size[i], 0.09]
            for i in range(arguments.n_source_models)]

    policy_params = []
    env_params = []
    num_policy = len(pis)

    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('minigolf-v0'))
        source_envs[-1].setParams(param)

    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples * len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src, episode_length,
                                      arguments.n_source_samples,
                                      arguments.gamma, variance_action,
                                      policy_params, env_params,
                                      param_space_size, state_space_size,
                                      env_param_space_size, features=feats,
                                      env_target=env_tgt)

    # Envs for discrete model estimation
    possible_env_params = envs  # possible envs are the source envs

    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('minigolf-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)

        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None
        env_src_models = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(
                    env_src, episode_length, 1, arguments.gamma,
                    variance_action, np.array([[0, 0, 0, 0]]),
                    np.array([[1.0, 0.131, 0.1, 0.09]]), param_space_size,
                    state_space_size, env_param_space_size, features=feats,
                    env_target=env_tgt)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif (estimator.endswith("GP") or estimator.endswith("ES")
                  or estimator.endswith("MI") or estimator.endswith("NS")):
                model_estimation = 1
                model = ModelEstimatorRKHS(
                    kernel_rho=10, kernel_lambda=[100, 10],
                    sigma_env=env_tgt.sigma_noise,
                    sigma_pi=np.sqrt(variance_action),
                    T=arguments.rkhs_horizon, R=arguments.rkhs_samples,
                    lambda_=0.0, source_envs=source_envs, n_source=n_source,
                    max_gp=arguments.max_gp_samples, state_dim=1,
                    linear_kernel=False,
                    balance_coeff=arguments.balance_coeff, alpha_gp=1,
                    print_mse=arguments.print_mse, features=polynomial,
                    param_dim=param_space_size, target_env=env_tgt,
                    heteroscedastic=True)

                if estimator.endswith("GP"):  # or estimator.endswith("NS"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True

                if estimator.endswith("NS"):
                    n_models = int(source_dataset.episodes_per_config.shape[0]
                                   / source_dataset.policy_per_model)
                    transition_models = []
                    for i in range(n_models):
                        model_estimator = ModelEstimatorRKHS(
                            kernel_rho=10, kernel_lambda=[100, 10],
                            sigma_env=env_tgt.sigma_noise,
                            sigma_pi=np.sqrt(variance_action),
                            T=arguments.rkhs_horizon,
                            R=arguments.rkhs_samples, lambda_=0.0,
                            source_envs=source_envs, n_source=n_source,
                            max_gp=arguments.max_gp_samples, state_dim=1,
                            linear_kernel=False,
                            balance_coeff=arguments.balance_coeff, alpha_gp=1,
                            print_mse=arguments.print_mse, features=polynomial,
                            param_dim=param_space_size, target_env=env_tgt,
                            heteroscedastic=True,
                            max_gp_src=arguments.max_gp_samples_src)
                        transition_models.append(model_estimator)
                    env_src_models = SourceEstimator(source_dataset,
                                                     transition_models)

        result = la.learnPolicy(
            env_param, simulation_param, source_dataset, name,
            off_policy=off_policy, model_estimation=model_estimation,
            dicrete_estimation=discrete_estimation, model_estimator=model,
            verbose=not arguments.quiet, features=polynomial,
            source_estimator=env_src_models if estimator.endswith("NS") else None)

        stats[estimator].append(result)

    return stats
def main(transition_model):
    """Cartpole sample reuse on a planning environment built from an estimated transition model."""
    env_tgt = gym.make('cartpolec-v0')
    variance_env = 0
    env_planning = PlanningEnv(transition_model, env_tgt, np.sqrt(variance_env))

    param_space_size = 4
    state_space_size = 4
    env_param_space_size = 3
    episode_length = 200

    env_param = sc.EnvParam(env_planning, param_space_size, state_space_size,
                            env_param_space_size, episode_length)

    mean_initial_param = np.random.normal(np.zeros(param_space_size), 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    batch_size = 10
    discount_factor = 0.99
    ess_min = 25
    adaptive = "No"
    n_min = 5

    simulation_param = sc.SimulationParam(mean_initial_param,
                                          variance_initial_param,
                                          variance_action, batch_size,
                                          num_batch, discount_factor, None,
                                          None, ess_min, adaptive, n_min)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)

        source_dataset_batch_size = 1
        policy_params = np.array([[0, 0, 0, 0]])
        env_params = np.array([[1.0, 0.5, 0.09]])
        name = estimator[:-3]

        [source_task, source_param, episodes_per_configuration,
         next_states_unclipped, actions_clipped,
         next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(
            env_tgt, episode_length, source_dataset_batch_size,
            discount_factor, variance_action, policy_params, env_params,
            param_space_size, state_space_size, env_param_space_size)

        source_dataset = sc.SourceDataset(source_task, source_param,
                                          episodes_per_configuration,
                                          next_states_unclipped,
                                          actions_clipped,
                                          next_states_unclipped_denoised, 1)

        off_policy = 0
        name = estimator
        simulation_param.batch_size = 10
        simulation_param.learning_rate = learning_rate

        result = la.learnPolicy(env_param, simulation_param, source_dataset,
                                name, off_policy=off_policy)

        stats[estimator].append(result)

    return stats
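# Unlike the other scripts, this main() expects an already-fitted transition
# model and also reads the module-level `estimators`, `num_batch`, and
# `learning_rate` globals. A hypothetical invocation is sketched below; the
# pickle file name and the serialization step are assumptions for illustration,
# not part of this codebase.
import pickle

if __name__ == "__main__":
    with open("estimated_transition_model.pkl", "rb") as f:
        transition_model = pickle.load(f)  # hypothetical: a previously fitted forward model
    stats = main(transition_model)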
def computeExpectedValueCurrentProposal(self, env, env_param, param_policy,
                                        simulation_param, source_parameters,
                                        episodes_per_config, initial_size,
                                        features=identity):
    """
    Compute the expected value of the current proposal.

    :param env: Current proposal target env
    :param env_param: Object that contains all the information of the target environment
    :param param_policy: Policy parameter of the current iteration
    :param simulation_param: Parameters of the simulation
    :param source_parameters: Environment and policy parameters used to generate the source tasks
    :param episodes_per_config: Number of episodes for every policy-model configuration
    :param initial_size: Initial size of the source dataset at the beginning of the learning procedure
    :param features: The function to apply to the state; it represents the features used for learning the optimal policy
    :return: A value representing the estimated expected value
    """
    batch_size = 20

    [source_task, source_param, episodes_per_configuration,
     next_states_unclipped, actions_clipped,
     next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(
        env, env_param.episode_length, batch_size,
        simulation_param.discount_factor, simulation_param.variance_action,
        param_policy[np.newaxis, :], env.getEnvParam().T,
        env_param.param_space_size, env_param.state_space_size,
        env_param.env_param_space_size, features)

    dataset_current_env = sc.SourceDataset(source_task, source_param,
                                           episodes_per_configuration,
                                           next_states_unclipped,
                                           actions_clipped,
                                           next_states_unclipped_denoised, 1)

    [state_t, state_t1, unclipped_action_t, clipped_actions,
     trajectories_length] = self.getEpisodesInfoFromSource(
        dataset_current_env, env_param)[0:5]

    source_parameters[initial_size:,
                      1 + env_param.param_space_size:
                      1 + env_param.param_space_size + env_param.env_param_space_size] = env.getEnvParam().T

    param_indices = np.concatenate(
        ([0], np.cumsum(np.delete(episodes_per_config, -1))))

    env_param_estimation = source_param[:, 1 + env_param.param_space_size:
                                        1 + env_param.param_space_size + env_param.env_param_space_size]
    param_policy_estimation = source_param[:, 1:1 + env_param.param_space_size]

    env_param_src = source_parameters[:, 1 + env_param.param_space_size:
                                      1 + env_param.param_space_size + env_param.env_param_space_size]
    param_policy_src = source_parameters[:, 1:1 + env_param.param_space_size]

    # Policy and environment parameters of the source configurations, not repeated
    combination_src_parameters = param_policy_src[param_indices, :]
    combination_src_parameters_env = env_param_src[param_indices, :]

    state_t = np.repeat(state_t[:, :, :, np.newaxis],
                        combination_src_parameters.shape[0], axis=3)  # state t
    state_t1 = np.repeat(state_t1[:, :, :, np.newaxis],
                         combination_src_parameters.shape[0], axis=3)  # state t+1
    unclipped_action_t = np.repeat(unclipped_action_t[:, :, np.newaxis],
                                   combination_src_parameters.shape[0], axis=2)  # action t
    clipped_actions = np.repeat(clipped_actions[:, :, np.newaxis],
                                combination_src_parameters_env.shape[0], axis=2)  # action t

    variance_env = env_param_estimation[:, -1]  # variance of the model transition

    name = env.unwrapped.spec.id
    if name == "minigolf-v0":
        density_current = env.densityCurrent(state_t[:, :, :, 0],
                                             clipped_actions[:, :, 0],
                                             state_t1[:, :, :, 0])
        density = env.density(combination_src_parameters_env, state_t,
                              clipped_actions, state_t1)
        mis_distributions_model = density
        model_transition = density_current
    else:
        state_t1_denoised_current = env.stepDenoisedCurrent(
            state_t[:, :, :, 0], clipped_actions[:, :, 0])
        state_t1_denoised = env.stepDenoised(combination_src_parameters_env,
                                             state_t, clipped_actions)
        # Transition densities under all source environments
        mis_distributions_model = \
            1 / np.sqrt((2 * m.pi * variance_env[:, np.newaxis, np.newaxis]) ** env_param.state_space_size) * \
            np.exp(-np.sum(np.power(state_t1 - state_t1_denoised, 2), axis=2) /
                   (2 * variance_env[:, np.newaxis, np.newaxis]))
        # Transition density under the current proposal
        model_transition = \
            1 / np.sqrt((2 * m.pi * variance_env[:, np.newaxis]) ** env_param.state_space_size) * \
            np.exp(-np.sum((state_t1[:, :, :, 0] - state_t1_denoised_current) ** 2, axis=2) /
                   (2 * variance_env[:, np.newaxis]))

    mask = trajectories_length[:, np.newaxis] < np.repeat(
        np.arange(0, state_t.shape[1])[np.newaxis, :],
        repeats=state_t.shape[0], axis=0)

    feats = features(state_t[:, :, :, 0], mask)
    feats = np.repeat(feats[:, :, :, np.newaxis],
                      combination_src_parameters.shape[0], axis=3)

    mis_distributions_policy = \
        1 / m.sqrt(2 * m.pi * simulation_param.variance_action) * \
        np.exp(-((unclipped_action_t -
                  np.sum(np.multiply((combination_src_parameters.T)[np.newaxis, np.newaxis, :, :], feats),
                         axis=2)) ** 2) / (2 * simulation_param.variance_action))

    mis_distributions_policy[mask] = 1
    src_distributions_policy = np.prod(mis_distributions_policy, axis=1)
    src_distributions_model = np.prod(mis_distributions_model, axis=1)
    q_j = src_distributions_model * src_distributions_policy

    policy_transition = \
        1 / m.sqrt(2 * m.pi * simulation_param.variance_action) * \
        np.exp(-((unclipped_action_t[:, :, 0] -
                  np.sum(np.multiply(param_policy[np.newaxis, np.newaxis, :], feats[:, :, :, 0]),
                         axis=2)) ** 2) / (2 * simulation_param.variance_action))

    policy_transition[mask] = 1
    model_transition[mask] = 1

    policy_transition = np.prod(policy_transition, axis=1)
    model_transition = np.prod(model_transition, axis=1)

    n = dataset_current_env.initial_size
    mis_denominator = np.squeeze(np.asarray(np.sum(
        np.multiply(dataset_current_env.episodes_per_config[np.newaxis, :] / n, q_j),
        axis=1)))

    return np.mean(policy_transition * model_transition / mis_denominator)
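# The return value above is a multiple importance sampling estimate: every
# trajectory is weighted by its density under the current proposal
# (policy_transition * model_transition) divided by the balance-heuristic
# mixture sum_j (n_j / n) * q_j of the source configurations. The snippet below
# is a minimal, self-contained toy sketch of that denominator on a 1-D Gaussian
# example; the proposals, sample counts, and target density are all illustrative
# and not part of this codebase.
import numpy as np

rng = np.random.default_rng(0)

# Three source "configurations", i.e. three Gaussian proposals q_j.
means = np.array([-1.0, 0.0, 1.0])
n_j = np.array([30, 50, 20])          # episodes per configuration
n = n_j.sum()

def gaussian_pdf(x, mu, sigma=1.0):
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

# Pooled dataset drawn from the mixture of source proposals.
samples = np.concatenate([rng.normal(mu, 1.0, size=k) for mu, k in zip(means, n_j)])

# Target density p (the "current proposal" in the method above).
p = gaussian_pdf(samples, 0.5)

# q_j evaluated on every sample -> shape (n, n_configs), analogous to q_j above.
q = gaussian_pdf(samples[:, np.newaxis], means[np.newaxis, :])

# Balance-heuristic denominator: sum_j (n_j / n) * q_j(x).
mis_denominator = np.sum((n_j / n)[np.newaxis, :] * q, axis=1)

# MIS estimate of E_p[1]; it should be close to 1.
print(np.mean(p / mis_denominator))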
def main():
    # General env properties
    env_tgt = gym.make('LQG1D-v0')
    env_src = gym.make('LQG1D-v0')
    param_space_size = 1
    state_space_size = 1
    env_param_space_size = 3
    episode_length = 20
    gaussian_transition = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length,
                            gaussian_transition)

    mean_initial_param = -0.1 * np.ones(param_space_size)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(
        mean_initial_param, variance_initial_param, variance_action,
        arguments.batch_size, arguments.iterations, arguments.gamma, None,
        arguments.learning_rate, arguments.ess_min,
        "Yes" if arguments.adaptive else "No", arguments.n_min,
        use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.1], [-0.15], [-0.2], [-0.25], [-0.3], [-0.35], [-0.4], [-0.45]]
    if arguments.random_src:
        A = np.random.uniform(0.6, 1.4, arguments.n_source_models)
        B = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    else:
        A = np.array(arguments.src_A)
        B = np.array(arguments.src_B)
    envs = [[A[i], B[i], 0.09] for i in range(A.shape[0])]
    print(envs)

    policy_params = []
    env_params = []

    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    n_config_cv = policy_params.shape[0]

    data = stc.sourceTaskCreationSpec(env_src, episode_length,
                                      arguments.n_source_samples,
                                      arguments.gamma, variance_action,
                                      policy_params, env_params,
                                      param_space_size, state_space_size,
                                      env_param_space_size)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)

        off_policy = 0 if estimator in ["GPOMDP", "REINFORCE",
                                        "REINFORCE-BASELINE"] else 1
        name = estimator

        if estimator.endswith("SR"):
            # Create a fake dataset for the sample-reuse algorithm
            data_sr = stc.sourceTaskCreationSpec(
                env_src, episode_length, 1, arguments.gamma, variance_action,
                np.array([[-0.1]]), np.array([[1.0, 1.0, 0.09]]),
                param_space_size, state_space_size, env_param_space_size)
            source_dataset = sc.SourceDataset(*data_sr, 1)
            name = estimator[:-3]

        result = la.learnPolicy(env_param, simulation_param, source_dataset,
                                name, off_policy=off_policy,
                                model_estimation=0, dicrete_estimation=0,
                                model_estimator=None,
                                verbose=not arguments.quiet)

        stats[estimator].append(result)

    return stats
def main():
    # General env properties
    env_tgt = gym.make('minigolf-v0')
    env_src = gym.make('minigolf-v0')
    param_space_size = 4
    state_space_size = 1
    env_param_space_size = 4
    episode_length = 20
    gaussian_transitions = False

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length,
                            gaussian_transitions)

    mean_initial_param = np.random.normal(np.ones(param_space_size) * 0.2, 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    feats = polynomial

    simulation_param = sc.SimulationParam(
        mean_initial_param, variance_initial_param, variance_action,
        arguments.batch_size, arguments.iterations, arguments.gamma, None,
        arguments.learning_rate, arguments.ess_min,
        "Yes" if arguments.adaptive else "No", arguments.n_min,
        use_adam=arguments.use_adam)

    # Source tasks
    pis = [[0.20097172, 0.20182519, 0.19957835, 0.20096946],
           [0.34099334, 0.21422279, 0.20053974, 0.20105477],
           [0.46923638, 0.22986188, 0.20266549, 0.20137892],
           [0.64977232, 0.26575410, 0.21014003, 0.20300604],
           [0.89955698, 0.32707635, 0.23490234, 0.21518798],
           [1.09006747, 0.35577241, 0.24517702, 0.22017502],
           [1.22329955, 0.40621784, 0.28787368, 0.24836521],
           [1.34824502, 0.43750823, 0.29981691, 0.25448715],
           [1.24846429, 0.42882867, 0.27008977, 0.22433061],
           [1.41946655, 0.53908188, 0.33195278, 0.25586648]]

    putter_length = np.random.uniform(0.7, 1.0, arguments.n_source_models)
    friction = np.random.uniform(0.065, 0.196, arguments.n_source_models)
    hole_size = np.random.uniform(0.10, 0.15, arguments.n_source_models)
    envs = [[putter_length[i], friction[i], hole_size[i], 0.09]
            for i in range(arguments.n_source_models)]

    policy_params = []
    env_params = []
    num_policy = len(pis)

    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    n_config_cv = policy_params.shape[0]

    data = stc.sourceTaskCreationSpec(env_src, episode_length,
                                      arguments.n_source_samples,
                                      arguments.gamma, variance_action,
                                      policy_params, env_params,
                                      param_space_size, state_space_size,
                                      env_param_space_size, features=feats,
                                      env_target=env_tgt)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy

        off_policy = 0 if estimator in ["GPOMDP", "REINFORCE",
                                        "REINFORCE-BASELINE"] else 1
        name = estimator

        if estimator.endswith("SR"):
            # Create a fake dataset for the sample-reuse algorithm
            data_sr = stc.sourceTaskCreationSpec(
                env_src, episode_length, 1, arguments.gamma, variance_action,
                np.array([[0, 0, 0, 0]]), np.array([[1.0, 0.131, 0.1, 0.09]]),
                param_space_size, state_space_size, env_param_space_size,
                features=feats, env_target=env_tgt)
            source_dataset = sc.SourceDataset(*data_sr, 1)
            source_dataset.policy_per_model = num_policy
            name = estimator[:-3]

        result = la.learnPolicy(env_param, simulation_param, source_dataset,
                                name, off_policy=off_policy,
                                model_estimation=0, dicrete_estimation=0,
                                model_estimator=None,
                                verbose=not arguments.quiet, features=feats)

        stats[estimator].append(result)

    return stats
def main():
    # General env properties
    env_tgt = gym.make('LQG1D-v0')
    env_src = gym.make('LQG1D-v0')
    param_space_size = 1
    state_space_size = 1
    env_param_space_size = 3
    episode_length = 20
    gaussian_transitions = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length,
                            gaussian_transitions)

    mean_initial_param = -0.1 * np.ones(param_space_size)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(
        mean_initial_param, variance_initial_param, variance_action,
        arguments.batch_size, arguments.iterations, arguments.gamma, None,
        arguments.learning_rate, arguments.ess_min, "No", arguments.n_min,
        use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.1], [-0.2], [-0.3], [-0.4], [-0.5], [-0.6], [-0.7], [-0.8]]
    A = np.random.uniform(0.6, 1.4, arguments.n_source_models)
    B = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    envs = [[A[i], B[i], 0.09] for i in range(A.shape[0])]

    policy_params = []
    env_params = []

    for p in pis:
        for e in envs:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('LQG1D-v0'))
        source_envs[-1].setParams(param)

    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples * len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src, episode_length,
                                      arguments.n_source_samples,
                                      arguments.gamma, variance_action,
                                      policy_params, env_params,
                                      param_space_size, state_space_size,
                                      env_param_space_size)

    # Envs for discrete model estimation
    possible_env_params = [[1.0, 1.0, 0.09], [1.5, 1.0, 0.09],
                           [0.5, 1.0, 0.09], [1.2, 0.8, 0.09],
                           [0.8, 1.2, 0.09], [1.1, 0.9, 0.09],
                           [0.9, 1.1, 0.09], [1.5, 0.5, 0.09]]

    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('LQG1D-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)

        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(
                    env_src, episode_length, 1, arguments.gamma,
                    variance_action, np.array([[-0.1]]),
                    np.array([[1.0, 1.0, 0.09]]), param_space_size,
                    state_space_size, env_param_space_size)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif (estimator.endswith("GP") or estimator.endswith("ES")
                  or estimator.endswith("MI")):
                model_estimation = 1
                model = ModelEstimatorRKHS(
                    kernel_rho=1, kernel_lambda=[1, 1],
                    sigma_env=env_tgt.sigma_noise,
                    sigma_pi=np.sqrt(variance_action), T=episode_length,
                    R=arguments.rkhs_samples, lambda_=0.0,
                    source_envs=source_envs, n_source=n_source,
                    max_gp=arguments.max_gp_samples, state_dim=1,
                    linear_kernel=True,
                    balance_coeff=arguments.balance_coeff,
                    target_env=env_tgt if arguments.print_mse else None)

                if estimator.endswith("GP"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True

        result = la.learnPolicy(env_param, simulation_param, source_dataset,
                                name, off_policy=off_policy,
                                model_estimation=model_estimation,
                                dicrete_estimation=discrete_estimation,
                                model_estimator=model,
                                verbose=not arguments.quiet)

        stats[estimator].append(result)

    return stats
def main(id):
    # General env properties
    env_tgt = gym.make('cartpolec-v0')
    env_src = gym.make('cartpolec-v0')
    param_space_size = 4
    state_space_size = 4
    env_param_space_size = 3
    episode_length = 200
    gaussian_transition = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length,
                            gaussian_transition)

    mean_initial_param = np.random.normal(np.zeros(param_space_size), 0.01)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(
        mean_initial_param, variance_initial_param, variance_action,
        arguments.batch_size, arguments.iterations, arguments.gamma, None,
        arguments.learning_rate, arguments.ess_min,
        "Yes" if arguments.adaptive else "No", arguments.n_min,
        use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.04058811, 0.06820783, 0.09962419, -0.01481458],
           [-0.04327763, 0.01926409, 0.10651812, 0.07304843],
           [-0.04660533, -0.08301117, 0.14598312, 0.31524803],
           [-0.04488895, -0.04959011, 0.20856307, 0.52564195],
           [-0.02085553, 0.11530108, 0.24525215, 0.58338479],
           [-0.03072567, 0.15546779, 0.27241488, 0.65833969],
           [-0.05493752, 0.11100809, 0.30213226, 0.73134919],
           [-0.02389198, 0.18004238, 0.30697023, 0.72447482],
           [-0.0702051, 0.17653729, 0.32254312, 0.72004621],
           [-0.09675066, 0.16063462, 0.32343255, 0.73801456]]

    m = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    l = np.random.uniform(0.4, 0.6, arguments.n_source_models)
    envs = [[m[i], l[i], 0.09] for i in range(m.shape[0])]

    policy_params = []
    env_params = []
    num_policy = len(pis)

    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('cartpolec-v0'))
        source_envs[-1].setParams(param)

    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples * len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src, episode_length,
                                      arguments.n_source_samples,
                                      arguments.gamma, variance_action,
                                      policy_params, env_params,
                                      param_space_size, state_space_size,
                                      env_param_space_size)

    # Envs for discrete model estimation
    possible_env_params = [[1.0, 0.5, 0.09], [0.8, 0.3, 0.09],
                           [1.2, 0.7, 0.09], [1.1, 0.6, 0.09],
                           [0.9, 0.4, 0.09], [0.9, 0.6, 0.09],
                           [1.1, 0.4, 0.09], [1.5, 1.0, 0.09]]

    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('cartpolec-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)

        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None
        env_src_models = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(
                    env_src, episode_length, 1, arguments.gamma,
                    variance_action, np.array([[0, 0, 0, 0]]),
                    np.array([[1.0, 0.5, 0.09]]), param_space_size,
                    state_space_size, env_param_space_size)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif (estimator.endswith("GP") or estimator.endswith("ES")
                  or estimator.endswith("MI") or estimator.endswith("NS")):
                model_estimation = 1
                model = ModelEstimatorRKHS(
                    kernel_rho=1, kernel_lambda=[1, 1, 1, 1, 1],
                    sigma_env=env_tgt.sigma_env,
                    sigma_pi=np.sqrt(variance_action),
                    T=arguments.rkhs_horizon, R=arguments.rkhs_samples,
                    lambda_=0.0, source_envs=source_envs, n_source=n_source,
                    max_gp=arguments.max_gp_samples, state_dim=4,
                    linear_kernel=False,
                    balance_coeff=arguments.balance_coeff, alpha_gp=1e-5,
                    target_env=env_tgt if arguments.print_mse else None,
                    id=id)

                if estimator.endswith("GP"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True

                if estimator.endswith("NS"):
                    n_models = int(source_dataset.episodes_per_config.shape[0]
                                   / source_dataset.policy_per_model)
                    transition_models = []
                    for i in range(n_models):
                        model_estimator = ModelEstimatorRKHS(
                            kernel_rho=1, kernel_lambda=[1, 1, 1, 1, 1],
                            sigma_env=env_tgt.sigma_env,
                            sigma_pi=np.sqrt(variance_action),
                            T=arguments.rkhs_horizon,
                            R=arguments.rkhs_samples, lambda_=0.0,
                            source_envs=source_envs, n_source=n_source,
                            max_gp=arguments.max_gp_samples_src, state_dim=4,
                            linear_kernel=False,
                            balance_coeff=arguments.balance_coeff,
                            alpha_gp=1e-5,
                            target_env=env_tgt if arguments.print_mse else None,
                            id=id)
                        transition_models.append(model_estimator)
                    env_src_models = SourceEstimator(source_dataset,
                                                     transition_models)

        result = la.learnPolicy(
            env_param, simulation_param, source_dataset, name,
            off_policy=off_policy, model_estimation=model_estimation,
            dicrete_estimation=discrete_estimation, model_estimator=model,
            verbose=not arguments.quiet,
            dump_model=arguments.dump_estimated_model,
            iteration_dump=arguments.iteration_dump,
            source_estimator=env_src_models if estimator.endswith("NS") else None)

        stats[estimator].append(result)

    return stats
def main(): """ lqg1d sample reuse """ env_tgt = gym.make('LQG1D-v0') env_src = gym.make('LQG1D-v0') param_space_size = 1 state_space_size = 1 env_param_space_size = 3 episode_length = 20 gaussian_transitions = True env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size, env_param_space_size, episode_length, gaussian_transitions) mean_initial_param = 0 * np.ones(param_space_size) variance_initial_param = 0 variance_action = 0.1 batch_size = 10 discount_factor = 0.99 ess_min = 25 adaptive = "No" n_min = 3 simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param, variance_action, batch_size, num_batch, discount_factor, None, None, ess_min, adaptive, n_min) # source task for lqg1d source_dataset_batch_size = 1 discount_factor = 0.99 pis = [[-0.1]]#, [-0.2], [-0.3], [-0.4], [-0.5], [-0.6], [-0.7], [-0.8]] A = np.random.uniform(0.5, 1.5, 1) B = np.random.uniform(0.8, 1.2, 1) variance_env = 0.09 envs = [] for i in range(len(A)): envs.append([A[i], B[i], variance_env]) policy_params = [] env_params = [] for p in pis: for e in envs: policy_params.append(p) env_params.append(e) policy_params = np.array(policy_params) env_params = np.array(env_params) n_config_cv = policy_params.shape[0] [source_task, source_param, episodes_per_configuration, next_states_unclipped, actions_clipped, next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(env_src, episode_length, source_dataset_batch_size, discount_factor, variance_action, policy_params, env_params, param_space_size, state_space_size, env_param_space_size) stats = {} for estimator in estimators: stats[estimator] = [] self_normalised = 0 for estimator,learning_rate in zip(estimators, learning_rates): print(estimator) simulation_param.learning_rate = learning_rate if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]: off_policy = 0 name = estimator simulation_param.batch_size = 10 self_normalised = 0 elif estimator == "IS-SN": self_normalised = 1 name = estimator[:-3] #estimator = estimator[:-3] off_policy = 1 elif estimator.endswith("SR"): #if sample reuse source_dataset_batch_size = 1 discount_factor = 0.99 policy_params = np.array([[-1]]) env_params = np.array([[1-5, 1, 0.09]]) n_config_cv = 1 name = estimator[:-3] self_normalised = 0 [source_task, source_param, episodes_per_configuration, next_states_unclipped, actions_clipped, next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(env_src, episode_length, source_dataset_batch_size, discount_factor, variance_action, policy_params, env_params, param_space_size, state_space_size, env_param_space_size) else: off_policy = 1 name = estimator self_normalised = 0 simulation_param.learning_rate = learning_rate source_dataset = sc.SourceDataset(source_task, source_param, episodes_per_configuration, next_states_unclipped, actions_clipped, next_states_unclipped_denoised, n_config_cv) simulation_param.learning_rate = learning_rate result = la.learnPolicy(env_param, simulation_param, source_dataset, name, off_policy=off_policy, self_normalised=self_normalised) stats[estimator].append(result) return stats