def main():

    # General env properties
    env_tgt = gym.make('minigolf-v0')
    env_src = gym.make('minigolf-v0')
    param_space_size = 4
    state_space_size = 1
    env_param_space_size = 4
    episode_length = 20
    gaussian_transitions = False

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length,
                            gaussian_transitions)

    mean_initial_param = np.random.normal(
        np.ones(param_space_size) * 0.2, 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    feats = polynomial

    simulation_param = sc.SimulationParam(
        mean_initial_param,
        variance_initial_param,
        variance_action,
        arguments.batch_size,
        arguments.iterations,
        arguments.gamma,
        None,
        arguments.learning_rate,
        arguments.ess_min,
        "Yes" if arguments.adaptive else "No",
        arguments.n_min,
        use_adam=arguments.use_adam)

    # Source tasks
    pis = [[0.20097172, 0.20182519, 0.19957835, 0.20096946],
           [0.34099334, 0.21422279, 0.20053974, 0.20105477],
           [0.46923638, 0.22986188, 0.20266549, 0.20137892],
           [0.64977232, 0.26575410, 0.21014003, 0.20300604],
           [0.89955698, 0.32707635, 0.23490234, 0.21518798],
           [1.09006747, 0.35577241, 0.24517702, 0.22017502],
           [1.22329955, 0.40621784, 0.28787368, 0.24836521],
           [1.34824502, 0.43750823, 0.29981691, 0.25448715],
           [1.24846429, 0.42882867, 0.27008977, 0.22433061],
           [1.41946655, 0.53908188, 0.33195278, 0.25586648]]

    putter_length = np.random.uniform(0.7, 1.0, arguments.n_source_models)
    friction = np.random.uniform(0.1, 0.15, arguments.n_source_models)
    hole_size = np.random.uniform(0.10, 0.15, arguments.n_source_models)
    envs = [[putter_length[i], friction[i], hole_size[i], 0.09]
            for i in range(arguments.n_source_models)]

    policy_params = []
    env_params = []
    num_policy = len(pis)

    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('minigolf-v0'))
        source_envs[-1].setParams(param)
    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples * len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src,
                                      episode_length,
                                      arguments.n_source_samples,
                                      arguments.gamma,
                                      variance_action,
                                      policy_params,
                                      env_params,
                                      param_space_size,
                                      state_space_size,
                                      env_param_space_size,
                                      features=feats,
                                      env_target=env_tgt)

    # Envs for discrete model estimation
    possible_env_params = envs  # possible envs are the source envs

    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('minigolf-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:

        print(estimator)

        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None
        env_src_models = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(
                    env_src,
                    episode_length,
                    1,
                    arguments.gamma,
                    variance_action,
                    np.array([[0, 0, 0, 0]]),
                    np.array([[1.0, 0.131, 0.1, 0.09]]),
                    param_space_size,
                    state_space_size,
                    env_param_space_size,
                    features=feats,
                    env_target=env_tgt)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif estimator.endswith("GP") or estimator.endswith(
                    "ES") or estimator.endswith("MI") or estimator.endswith(
                        "NS"):
                model_estimation = 1
                model = ModelEstimatorRKHS(
                    kernel_rho=10,
                    kernel_lambda=[100, 10],
                    sigma_env=env_tgt.sigma_noise,
                    sigma_pi=np.sqrt(variance_action),
                    T=arguments.rkhs_horizon,
                    R=arguments.rkhs_samples,
                    lambda_=0.0,
                    source_envs=source_envs,
                    n_source=n_source,
                    max_gp=arguments.max_gp_samples,
                    state_dim=1,
                    linear_kernel=False,
                    balance_coeff=arguments.balance_coeff,
                    alpha_gp=1,
                    print_mse=arguments.print_mse,
                    features=polynomial,
                    param_dim=param_space_size,
                    target_env=env_tgt,
                    heteroscedastic=True)
                if estimator.endswith("GP"):  # or estimator.endswith("NS"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True
                if estimator.endswith("NS"):
                    n_models = int(
                        source_dataset.episodes_per_config.shape[0] /
                        source_dataset.policy_per_model)
                    transition_models = []
                    for i in range(n_models):
                        model_estimator = ModelEstimatorRKHS(
                            kernel_rho=10,
                            kernel_lambda=[100, 10],
                            sigma_env=env_tgt.sigma_noise,
                            sigma_pi=np.sqrt(variance_action),
                            T=arguments.rkhs_horizon,
                            R=arguments.rkhs_samples,
                            lambda_=0.0,
                            source_envs=source_envs,
                            n_source=n_source,
                            max_gp=arguments.max_gp_samples,
                            state_dim=1,
                            linear_kernel=False,
                            balance_coeff=arguments.balance_coeff,
                            alpha_gp=1,
                            print_mse=arguments.print_mse,
                            features=polynomial,
                            param_dim=param_space_size,
                            target_env=env_tgt,
                            heteroscedastic=True,
                            max_gp_src=arguments.max_gp_samples_src)
                        transition_models.append(model_estimator)
                    env_src_models = SourceEstimator(source_dataset,
                                                     transition_models)
        result = la.learnPolicy(env_param,
                                simulation_param,
                                source_dataset,
                                name,
                                off_policy=off_policy,
                                model_estimation=model_estimation,
                                dicrete_estimation=discrete_estimation,  # keyword spelled this way in la.learnPolicy
                                model_estimator=model,
                                verbose=not arguments.quiet,
                                features=polynomial,
                                source_estimator=env_src_models
                                if estimator.endswith("NS") else None)

        stats[estimator].append(result)

    return stats
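
# All of the main() examples in this file read hyperparameters from a
# module-level `arguments` object. A minimal sketch of the parser they
# assume follows; the flag names mirror the attributes accessed above,
# while every default value here is a placeholder guess.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=10)
parser.add_argument("--iterations", type=int, default=200)
parser.add_argument("--gamma", type=float, default=0.99)
parser.add_argument("--learning_rate", type=float, default=1e-3)
parser.add_argument("--ess_min", type=int, default=25)
parser.add_argument("--adaptive", action="store_true")
parser.add_argument("--n_min", type=int, default=5)
parser.add_argument("--use_adam", action="store_true")
parser.add_argument("--n_source_models", type=int, default=5)
parser.add_argument("--n_source_samples", type=int, default=10)
parser.add_argument("--rkhs_horizon", type=int, default=20)
parser.add_argument("--rkhs_samples", type=int, default=50)
parser.add_argument("--max_gp_samples", type=int, default=500)
parser.add_argument("--max_gp_samples_src", type=int, default=500)
parser.add_argument("--balance_coeff", action="store_true")
parser.add_argument("--print_mse", action="store_true")
parser.add_argument("--quiet", action="store_true")
arguments = parser.parse_args()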

features = identity

env_tgt = gym.make('cartpolec-v0')

variance_env = env_tgt.getEnvParam()[-1]
param_space_size = 4
state_space_size = 4
env_param_space_size = 3
episode_length = 200
discount_factor_timestep = np.asarray(
    [discount_factor**i for i in range(episode_length)])

env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                        env_param_space_size, episode_length)

rewards = {}

for model_dump in range(n_dump):
    rewards[model_dump] = []

for model_dump in range(n_dump):
    for run_dump in range(n_run_dump):
        with open(folder + '/results_' + str(run_dump) + '_' +
                  str(model_dump) + '.pkl', 'rb') as results_file:
            results = pickle.load(results_file)

        for i in range(n_runs):
            policy_current_run = results[i]["GPOMDP"][0].policy_parameter
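
# The loader above expects pickle files named 'results_<run>_<model>.pkl',
# each holding a list of per-run dicts keyed by estimator name, whose
# entries are the lists built by the main() functions in this file. A
# minimal sketch of the matching writer side, under that assumption:
import pickle

def dump_results(folder, run_dump, model_dump, results):
    # results: list over runs; results[i][estimator][0].policy_parameter
    # is what the loader above reads back
    path = folder + '/results_' + str(run_dump) + '_' + str(model_dump) + '.pkl'
    with open(path, 'wb') as output_file:
        pickle.dump(results, output_file)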
def main(transition_model):
    """
    lqg1d sample reuse
    """
    env_tgt = gym.make('cartpolec-v0')

    variance_env = 0

    env_planning = PlanningEnv(transition_model, env_tgt,
                               np.sqrt(variance_env))
    param_space_size = 4
    state_space_size = 4
    env_param_space_size = 3
    episode_length = 200

    env_param = sc.EnvParam(env_planning, param_space_size, state_space_size,
                            env_param_space_size, episode_length)

    mean_initial_param = np.random.normal(np.zeros(param_space_size), 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    batch_size = 10
    discount_factor = 0.99
    ess_min = 25
    adaptive = "No"
    n_min = 5

    simulation_param = sc.SimulationParam(mean_initial_param,
                                          variance_initial_param,
                                          variance_action, batch_size,
                                          num_batch, discount_factor, None,
                                          None, ess_min, adaptive, n_min)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:

        print(estimator)

        source_dataset_batch_size = 1
        policy_params = np.array([[0, 0, 0, 0]])
        env_params = np.array([[1.0, 0.5, 0.09]])
        [
            source_task, source_param, episodes_per_configuration,
            next_states_unclipped, actions_clipped,
            next_states_unclipped_denoised
        ] = stc.sourceTaskCreationSpec(env_tgt, episode_length,
                                       source_dataset_batch_size,
                                       discount_factor, variance_action,
                                       policy_params, env_params,
                                       param_space_size, state_space_size,
                                       env_param_space_size)

        source_dataset = sc.SourceDataset(source_task, source_param,
                                          episodes_per_configuration,
                                          next_states_unclipped,
                                          actions_clipped,
                                          next_states_unclipped_denoised, 1)

        off_policy = 0
        name = estimator
        simulation_param.batch_size = 10

        simulation_param.learning_rate = learning_rate

        result = la.learnPolicy(env_param,
                                simulation_param,
                                source_dataset,
                                name,
                                off_policy=off_policy)

        stats[estimator].append(result)

    return stats
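
# Each main() in this file returns a dict mapping estimator name to a list
# of la.learnPolicy results. A small sketch for pooling several independent
# runs of the function above, assuming only that dict structure:
def run_many(transition_model, n_runs):
    merged = {}
    for _ in range(n_runs):
        stats = main(transition_model)
        for estimator, results in stats.items():
            merged.setdefault(estimator, []).extend(results)
    return merged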
def main():

    # General env properties
    env_tgt = gym.make('LQG1D-v0')
    env_src = gym.make('LQG1D-v0')
    param_space_size = 1
    state_space_size = 1
    env_param_space_size = 3
    episode_length = 20
    gaussian_transition = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length,
                            gaussian_transition)

    mean_initial_param = -0.1 * np.ones(param_space_size)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(
        mean_initial_param,
        variance_initial_param,
        variance_action,
        arguments.batch_size,
        arguments.iterations,
        arguments.gamma,
        None,
        arguments.learning_rate,
        arguments.ess_min,
        "Yes" if arguments.adaptive else "No",
        arguments.n_min,
        use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.1], [-0.15], [-0.2], [-0.25], [-0.3], [-0.35], [-0.4], [-0.45]]
    if arguments.random_src:
        A = np.random.uniform(0.6, 1.4, arguments.n_source_models)
        B = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    else:
        A = np.array(arguments.src_A)
        B = np.array(arguments.src_B)
    envs = [[A[i], B[i], 0.09] for i in range(A.shape[0])]
    print(envs)
    policy_params = []
    env_params = []

    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    n_config_cv = policy_params.shape[0]

    data = stc.sourceTaskCreationSpec(env_src, episode_length,
                                      arguments.n_source_samples,
                                      arguments.gamma, variance_action,
                                      policy_params, env_params,
                                      param_space_size, state_space_size,
                                      env_param_space_size)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:

        print(estimator)

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)

        off_policy = 0 if estimator in [
            "GPOMDP", "REINFORCE", "REINFORCE-BASELINE"
        ] else 1

        name = estimator

        if estimator.endswith("SR"):
            # Create a fake dataset for the sample-reuse algorithm
            data_sr = stc.sourceTaskCreationSpec(
                env_src, episode_length, 1, arguments.gamma, variance_action,
                np.array([[-0.1]]), np.array([[1.0, 1.0, 0.09]]),
                param_space_size, state_space_size, env_param_space_size)
            source_dataset = sc.SourceDataset(*data_sr, 1)
            name = estimator[:-3]

        result = la.learnPolicy(env_param,
                                simulation_param,
                                source_dataset,
                                name,
                                off_policy=off_policy,
                                model_estimation=0,
                                dicrete_estimation=0,
                                model_estimator=None,
                                verbose=not arguments.quiet)

        stats[estimator].append(result)

    return stats
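
# Estimator names in these examples carry a three-character suffix ("-SR",
# "-DI", "-GP", "-ES", "-MI", "-NS") that every loop strips with
# estimator[:-3]. A sketch making that convention explicit; the flag
# mapping restates the branches used above and is otherwise an assumption:
ON_POLICY = ("GPOMDP", "REINFORCE", "REINFORCE-BASELINE")

def parse_estimator(estimator):
    if estimator in ON_POLICY:
        return estimator, {"off_policy": 0, "model_estimation": 0}
    name = estimator[:-3]  # drop the "-XX" suffix
    model_estimation = 1 if estimator.endswith(("DI", "GP", "ES", "MI", "NS")) else 0
    return name, {"off_policy": 1, "model_estimation": model_estimation}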
def main():

    # General env properties
    env_tgt = gym.make('minigolf-v0')
    env_src = gym.make('minigolf-v0')
    param_space_size = 4
    state_space_size = 1
    env_param_space_size = 4
    episode_length = 20
    gaussian_transitions = False

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size, env_param_space_size, episode_length, gaussian_transitions)

    mean_initial_param = np.random.normal(np.ones(param_space_size) * 0.2, 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    feats = polynomial

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param, variance_action, arguments.batch_size,
                                          arguments.iterations, arguments.gamma, None, arguments.learning_rate, arguments.ess_min,
                                          "Yes" if arguments.adaptive else "No", arguments.n_min, use_adam=arguments.use_adam)

    # Source tasks
    pis = [[0.20097172, 0.20182519, 0.19957835, 0.20096946],
           [0.34099334, 0.21422279, 0.20053974, 0.20105477],
           [0.46923638, 0.22986188, 0.20266549, 0.20137892],
           [0.64977232, 0.26575410, 0.21014003, 0.20300604],
           [0.89955698, 0.32707635, 0.23490234, 0.21518798],
           [1.09006747, 0.35577241, 0.24517702, 0.22017502],
           [1.22329955, 0.40621784, 0.28787368, 0.24836521],
           [1.34824502, 0.43750823, 0.29981691, 0.25448715],
           [1.24846429, 0.42882867, 0.27008977, 0.22433061],
           [1.41946655, 0.53908188, 0.33195278, 0.25586648]]

    putter_length = np.random.uniform(0.7, 1.0, arguments.n_source_models)
    friction = np.random.uniform(0.065, 0.196, arguments.n_source_models)
    hole_size = np.random.uniform(0.10, 0.15, arguments.n_source_models)
    envs = [[putter_length[i], friction[i], hole_size[i], 0.09] for i in range(arguments.n_source_models)]

    policy_params = []
    env_params = []
    num_policy = len(pis)
    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    n_config_cv = policy_params.shape[0]

    data = stc.sourceTaskCreationSpec(env_src, episode_length, arguments.n_source_samples, arguments.gamma, variance_action,
                                      policy_params, env_params, param_space_size, state_space_size, env_param_space_size,
                                      features=feats, env_target=env_tgt)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:

        print(estimator)

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy

        off_policy = 0 if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"] else 1

        name = estimator

        if estimator.endswith("SR"):
            # Create a fake dataset for the sample-reuse algorithm
            data_sr = stc.sourceTaskCreationSpec(env_src, episode_length, 1, arguments.gamma, variance_action,
                                                 np.array([[0, 0, 0, 0]]), np.array([[1.0, 0.131, 0.1, 0.09]]),
                                                 param_space_size, state_space_size, env_param_space_size, features=feats,
                                                 env_target=env_tgt)
            source_dataset = sc.SourceDataset(*data_sr, 1)
            source_dataset.policy_per_model = num_policy
            name = estimator[:-3]

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name, off_policy=off_policy,
                                model_estimation=0, dicrete_estimation=0,
                                model_estimator=None, verbose=not arguments.quiet, features=feats)

        stats[estimator].append(result)

    return stats
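
# The minigolf examples pass `features=polynomial` and learn a 4-dimensional
# policy over a 1-dimensional state, which is consistent with a cubic
# polynomial basis. The repository's actual `polynomial` is not shown in
# this file; the sketch below is a guess with that shape:
import numpy as np

def polynomial(state):
    s = np.asarray(state, dtype=float)
    # assumed basis [1, s, s^2, s^3], matching param_space_size == 4
    return np.stack([np.ones_like(s), s, s ** 2, s ** 3], axis=-1)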
def main():

    # General env properties
    env_tgt = gym.make('LQG1D-v0')
    env_src = gym.make('LQG1D-v0')
    param_space_size = 1
    state_space_size = 1
    env_param_space_size = 3
    episode_length = 20
    gaussian_transitions = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length,
                            gaussian_transitions)

    mean_initial_param = -0.1 * np.ones(param_space_size)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(mean_initial_param,
                                          variance_initial_param,
                                          variance_action,
                                          arguments.batch_size,
                                          arguments.iterations,
                                          arguments.gamma,
                                          None,
                                          arguments.learning_rate,
                                          arguments.ess_min,
                                          "No",
                                          arguments.n_min,
                                          use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.1], [-0.2], [-0.3], [-0.4], [-0.5], [-0.6], [-0.7], [-0.8]]
    A = np.random.uniform(0.6, 1.4, arguments.n_source_models)
    B = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    envs = [[A[i], B[i], 0.09] for i in range(A.shape[0])]

    policy_params = []
    env_params = []

    for p in pis:
        for e in envs:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('LQG1D-v0'))
        source_envs[-1].setParams(param)
    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples * len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src, episode_length,
                                      arguments.n_source_samples,
                                      arguments.gamma, variance_action,
                                      policy_params, env_params,
                                      param_space_size, state_space_size,
                                      env_param_space_size)

    # Envs for discrete model estimation
    possible_env_params = [[1.0, 1.0, 0.09], [1.5, 1.0, 0.09],
                           [0.5, 1.0, 0.09], [1.2, 0.8, 0.09],
                           [0.8, 1.2, 0.09], [1.1, 0.9, 0.09],
                           [0.9, 1.1, 0.09], [1.5, 0.5, 0.09]]

    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('LQG1D-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:

        print(estimator)

        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(
                    env_src, episode_length, 1, arguments.gamma,
                    variance_action, np.array([[-0.1]]),
                    np.array([[1.0, 1.0, 0.09]]), param_space_size,
                    state_space_size, env_param_space_size)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif estimator.endswith("GP") or estimator.endswith(
                    "ES") or estimator.endswith("MI"):
                model_estimation = 1
                model = ModelEstimatorRKHS(
                    kernel_rho=1,
                    kernel_lambda=[1, 1],
                    sigma_env=env_tgt.sigma_noise,
                    sigma_pi=np.sqrt(variance_action),
                    T=episode_length,
                    R=arguments.rkhs_samples,
                    lambda_=0.0,
                    source_envs=source_envs,
                    n_source=n_source,
                    max_gp=arguments.max_gp_samples,
                    state_dim=1,
                    linear_kernel=True,
                    balance_coeff=arguments.balance_coeff,
                    target_env=env_tgt if arguments.print_mse else None)
                if estimator.endswith("GP"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True

        result = la.learnPolicy(env_param,
                                simulation_param,
                                source_dataset,
                                name,
                                off_policy=off_policy,
                                model_estimation=model_estimation,
                                dicrete_estimation=discrete_estimation,
                                model_estimator=model,
                                verbose=not arguments.quiet)

        stats[estimator].append(result)

    return stats
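
# The dump-loading snippet earlier reads results[i]["GPOMDP"][0].policy_parameter,
# i.e. each entry of stats[estimator] exposes a .policy_parameter attribute.
# A sketch for pulling the learned policy of each estimator out of the dict
# returned above, under that assumption:
def final_policies(stats):
    return {estimator: results[0].policy_parameter
            for estimator, results in stats.items() if results}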
def main(id):

    # General env properties
    env_tgt = gym.make('cartpolec-v0')
    env_src = gym.make('cartpolec-v0')
    param_space_size = 4
    state_space_size = 4
    env_param_space_size = 3
    episode_length = 200
    gaussian_transition = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size, env_param_space_size, episode_length, gaussian_transition)

    mean_initial_param = np.random.normal(np.zeros(param_space_size), 0.01)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param, variance_action, arguments.batch_size,
                                          arguments.iterations, arguments.gamma, None, arguments.learning_rate, arguments.ess_min,
                                          "Yes" if arguments.adaptive else "No", arguments.n_min, use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.04058811, 0.06820783, 0.09962419, -0.01481458],
           [-0.04327763, 0.01926409, 0.10651812, 0.07304843],
           [-0.04660533, -0.08301117, 0.14598312, 0.31524803],
           [-0.04488895, -0.04959011, 0.20856307, 0.52564195],
           [-0.02085553, 0.11530108, 0.24525215, 0.58338479],
           [-0.03072567, 0.15546779, 0.27241488, 0.65833969],
           [-0.05493752, 0.11100809, 0.30213226, 0.73134919],
           [-0.02389198, 0.18004238, 0.30697023, 0.72447482],
           [-0.0702051, 0.17653729, 0.32254312, 0.72004621],
           [-0.09675066, 0.16063462, 0.32343255, 0.73801456]]

    m = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    l = np.random.uniform(0.4, 0.6, arguments.n_source_models)
    envs = [[m[i], l[i], 0.09] for i in range(m.shape[0])]

    policy_params = []
    env_params = []
    num_policy = len(pis)
    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('cartpolec-v0'))
        source_envs[-1].setParams(param)
    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples*len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src, episode_length, arguments.n_source_samples, arguments.gamma, variance_action,
                                      policy_params, env_params, param_space_size, state_space_size, env_param_space_size)

    # Envs for discrete model estimation
    possible_env_params = [[1.0, 0.5, 0.09],
                           [0.8, 0.3, 0.09],
                           [1.2, 0.7, 0.09],
                           [1.1, 0.6, 0.09],
                           [0.9, 0.4, 0.09],
                           [0.9, 0.6, 0.09],
                           [1.1, 0.4, 0.09],
                           [1.5, 1.0, 0.09]]

    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('cartpolec-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:

        print(estimator)

        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None
        env_src_models = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy
        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(env_src, episode_length, 1, arguments.gamma, variance_action,
                                                     np.array([[0, 0, 0, 0]]), np.array([[1.0, 0.5, 0.09]]), param_space_size,
                                                     state_space_size, env_param_space_size)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif estimator.endswith("GP") or estimator.endswith("ES") or estimator.endswith("MI") or estimator.endswith("NS"):
                model_estimation = 1
                model = ModelEstimatorRKHS(kernel_rho=1, kernel_lambda=[1, 1, 1, 1, 1], sigma_env=env_tgt.sigma_env,
                                           sigma_pi=np.sqrt(variance_action), T=arguments.rkhs_horizon, R=arguments.rkhs_samples,
                                           lambda_=0.0, source_envs=source_envs, n_source=n_source,
                                           max_gp=arguments.max_gp_samples, state_dim=4, linear_kernel=False,
                                           balance_coeff=arguments.balance_coeff, alpha_gp=1e-5,
                                           target_env=env_tgt if arguments.print_mse else None, id=id)
                if estimator.endswith("GP"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True

                if estimator.endswith("NS"):
                    n_models = int(source_dataset.episodes_per_config.shape[0]/source_dataset.policy_per_model)
                    transition_models = []
                    for i in range(n_models):
                        model_estimator = ModelEstimatorRKHS(
                            kernel_rho=1, kernel_lambda=[1, 1, 1, 1, 1],
                            sigma_env=env_tgt.sigma_env,
                            sigma_pi=np.sqrt(variance_action),
                            T=arguments.rkhs_horizon, R=arguments.rkhs_samples,
                            lambda_=0.0, source_envs=source_envs,
                            n_source=n_source,
                            max_gp=arguments.max_gp_samples_src, state_dim=4,
                            linear_kernel=False,
                            balance_coeff=arguments.balance_coeff,
                            alpha_gp=1e-5,
                            target_env=env_tgt if arguments.print_mse else None,
                            id=id)
                        transition_models.append(model_estimator)
                    env_src_models = SourceEstimator(source_dataset, transition_models)
        result = la.learnPolicy(env_param, simulation_param, source_dataset, name, off_policy=off_policy,
                                model_estimation=model_estimation, dicrete_estimation=discrete_estimation,
                                model_estimator=model, verbose=not arguments.quiet, dump_model=arguments.dump_estimated_model,
                                iteration_dump=arguments.iteration_dump, source_estimator=env_src_models if estimator.endswith("NS") else None)

        stats[estimator].append(result)

    return stats
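
# main(id) above threads a worker id down to ModelEstimatorRKHS. A usage
# sketch launching several workers with multiprocessing; the pool size and
# number of runs are arbitrary placeholders:
from multiprocessing import Pool

if __name__ == "__main__":
    with Pool(4) as pool:
        all_stats = pool.map(main, range(4))  # one id per worker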
def main():
    """
    lqg1d sample reuse
    """
    env_tgt = gym.make('LQG1D-v0')
    env_src = gym.make('LQG1D-v0')
    param_space_size = 1
    state_space_size = 1
    env_param_space_size = 3
    episode_length = 20
    gaussian_transitions = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size, env_param_space_size, episode_length, gaussian_transitions)

    mean_initial_param = np.zeros(param_space_size)
    variance_initial_param = 0
    variance_action = 0.1
    batch_size = 10
    discount_factor = 0.99
    ess_min = 25
    adaptive = "No"
    n_min = 3

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param, variance_action, batch_size,
                                          num_batch, discount_factor, None, None, ess_min, adaptive, n_min)


    # source task for lqg1d
    source_dataset_batch_size = 1

    pis = [[-0.1]]#, [-0.2], [-0.3], [-0.4], [-0.5], [-0.6], [-0.7], [-0.8]]
    A = np.random.uniform(0.5, 1.5, 1)
    B = np.random.uniform(0.8, 1.2, 1)
    variance_env = 0.09
    envs = []
    for i in range(len(A)):
        envs.append([A[i], B[i], variance_env])

    policy_params = []
    env_params = []

    for p in pis:
        for e in envs:
            policy_params.append(p)
            env_params.append(e)

    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    n_config_cv = policy_params.shape[0]

    [source_task, source_param, episodes_per_configuration,
     next_states_unclipped, actions_clipped,
     next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(
        env_src, episode_length, source_dataset_batch_size, discount_factor,
        variance_action, policy_params, env_params, param_space_size,
        state_space_size, env_param_space_size)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    self_normalised = 0

    for estimator, learning_rate in zip(estimators, learning_rates):

        print(estimator)
        simulation_param.learning_rate = learning_rate
        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            off_policy = 0
            name = estimator
            simulation_param.batch_size = 10
            self_normalised = 0
        elif estimator == "IS-SN":
            self_normalised = 1
            name = estimator[:-3]
            #estimator = estimator[:-3]
            off_policy = 1
        elif estimator.endswith("SR"): #if sample reuse
            source_dataset_batch_size = 1
            discount_factor = 0.99
            policy_params = np.array([[-1]])
            env_params = np.array([[1-5, 1, 0.09]])
            n_config_cv = 1
            name = estimator[:-3]
            self_normalised = 0
            [source_task, source_param, episodes_per_configuration, next_states_unclipped, actions_clipped,
             next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(env_src, episode_length, source_dataset_batch_size,
                                                                          discount_factor, variance_action, policy_params,
                                                                          env_params, param_space_size, state_space_size,
                                                                          env_param_space_size)
        else:
            off_policy = 1
            name = estimator
            self_normalised = 0


        source_dataset = sc.SourceDataset(source_task, source_param, episodes_per_configuration, next_states_unclipped,
                                          actions_clipped, next_states_unclipped_denoised, n_config_cv)

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name, off_policy=off_policy, self_normalised=self_normalised)

        stats[estimator].append(result)

    return stats
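
# The loop above pairs each estimator with its own step size via
# zip(estimators, learning_rates), and reads num_batch at module level.
# None of those globals appear in this file; a placeholder configuration
# consistent with the names handled above might look like:
estimators = ["GPOMDP", "REINFORCE", "IS-SN", "IS-SR"]  # hypothetical mix
learning_rates = [1e-3, 1e-3, 1e-4, 1e-4]               # placeholder step sizes
num_batch = 200                                         # iterations, assumed

stats = main()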