Example #1
# Assumes the softlearning helpers used below (get_environment, SimpleReplayPool,
# SimpleSampler, get_Q_function_from_variant, get_policy_from_variant,
# QFPolicyPlotter, SAC, initialize_tf_variables) are imported from the
# softlearning package; their exact module paths are omitted here.
import numpy as np
def run_experiment(variant, reporter):
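    # MultiGoal: a small 2D point-mass environment with multiple goal positions,
    # shaped actuation/distance costs, and a bonus for reaching a goal.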
    env = get_environment('gym', 'MultiGoal', 'Default', {
        'actuation_cost_coeff': 1,
        'distance_cost_coeff': 0.1,
        'goal_reward': 1,
        'init_sigma': 0.1,
    })

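    # Off-policy replay pool holding up to one million transitions.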
    pool = SimpleReplayPool(
        observation_space=env.observation_space,
        action_space=env.action_space,
        max_size=1e6)

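    # Sampler collects rollouts of at most 30 steps and defers training
    # until the pool contains at least 100 samples.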
    sampler = SimpleSampler(
        max_path_length=30, min_pool_size=100, batch_size=64)

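    # Build the Q-function(s) and policy from the experiment variant; the plotter
    # visualizes Q-values and sampled actions at a few fixed observations.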
    Qs = get_Q_function_from_variant(variant, env)
    policy = get_policy_from_variant(variant, env, Qs)
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

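    # Soft Actor-Critic with a reparameterized policy and a fixed entropy
    # target of -2 (minus the dimensionality of the 2D action space).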
    algorithm = SAC(
        sampler=sampler,
        reparameterize=True,
        epoch_length=100,
        n_epochs=1000,
        n_train_repeat=1,
        eval_render_mode=None,
        eval_n_episodes=10,
        eval_deterministic=False,

        env=env,
        policy=policy,
        initial_exploration_policy=None,
        pool=pool,
        Qs=Qs,
        plotter=plotter,

        lr=3e-4,
        target_entropy=-2.0,
        discount=0.99,
        tau=1e-4,

        save_full_state=True,
    )

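    # Initialize any TensorFlow variables the algorithm has created.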
    initialize_tf_variables(algorithm._session, only_uninitialized=True)

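    # train() yields intermediate results, which are forwarded to the reporter.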
    for train_result in algorithm.train():
        reporter(**train_result)
Example #2
# Assumes the softlearning helpers and registries used below (get_environment,
# SimpleReplayPool, SimpleSampler, value_functions, policies, algorithms,
# QFPolicyPlotter) are imported; their exact module paths are omitted here.
import numpy as np
def run_experiment(variant, reporter):
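    # Separate (copied) environments for training and evaluation.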
    training_environment = (get_environment(
        'gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        }))
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(environment=training_environment, max_size=1e6)

    sampler = SimpleSampler(max_path_length=30)

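    # Tell the Q-function networks what observation and action shapes to expect.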
    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape,
        )
    })
    Qs = value_functions.get(variant['Q_params'])

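    # The policy needs the action bounds as well as observation and action shapes.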
    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes':
        training_environment.observation_shape,
        'output_shape':
        training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    plotter = QFPolicyPlotter(Q=Qs[0],
                              policy=policy,
                              obs_lst=np.array(((-2.5, 0.0), (0.0, 0.0),
                                                (2.5, 2.5), (-2.5, -2.5))),
                              default_action=(np.nan, np.nan),
                              n_samples=100)

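    # Wire the constructed components into the algorithm's config before building it.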
    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': pool,
        'sampler': sampler,
        'min_pool_size': 100,
        'batch_size': 64,
        'plotter': plotter,
    })
    algorithm = algorithms.get(variant['algorithm_params'])

    for train_result in algorithm.train():
        reporter(**train_result)
Example #3
# Assumes the softlearning helpers used below are imported, as in the
# previous examples.
import numpy as np
def run_experiment(variant, reporter):
    training_environment = (
        get_environment('gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        }))
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(
        environment=training_environment,
        max_size=1e6)

    sampler = SimpleSampler(max_path_length=30)

    Qs = get_Q_function_from_variant(variant, training_environment)
    policy = get_policy_from_variant(variant, training_environment)
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

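    # Build the algorithm from the variant, passing the environments, policy,
    # Q-functions, pool, and sampler directly as keyword arguments.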
    algorithm = get_algorithm_from_variant(
        variant=variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        Qs=Qs,
        pool=pool,
        sampler=sampler,
        min_pool_size=100,
        batch_size=64,
        plotter=plotter,
    )

    initialize_tf_variables(algorithm._session, only_uninitialized=True)

    for train_result in algorithm.train():
        reporter(**train_result)