def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=spec(env), max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=spec(env), hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env_spec=spec(env), hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
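# Hedged usage sketch (not part of the original example): run_experiment takes
# a single `variant` dict of hyperparameters. The key names below are exactly
# the ones the function reads; the values are placeholders, not the authors'
# published settings.
example_variant = {
    'env_name': 'swimmer-rllab',
    'max_pool_size': 1e6,
    'max_path_length': 1000,
    'batch_size': 128,
    'epoch_length': 1000,
    'n_epochs': 500,
    'n_train_repeat': 1,
    'layer_size': 128,
    'kernel_particles': 16,
    'kernel_update_ratio': 0.5,
    'value_n_particles': 16,
    'td_target_update_interval': 1000,
    'qf_lr': 3e-4,
    'policy_lr': 3e-4,
    'discount': 0.99,
    'reward_scale': 30,
}
# run_experiment(example_variant)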
# Example 2
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(env_spec=spec(env),
                              max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=spec(env), hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env_spec=spec(env), hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'pusher':
        # TODO: assumes `pusher.xml` is located in `rllab/models/` when
        # running on EC2.
        env = normalize(PusherEnv(goal=variant.get('goal')))
    else:
        raise ValueError('Unknown env_name: {}'.format(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=spec(env), max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    # Derive a unique id from the variant; it is used below to give the
    # Q-function and policy distinct names.
    task_id = abs(hash(pickle.dumps(variant)))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='qf_{i}'.format(i=task_id))

    policy = StochasticNNPolicy(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='policy_{i}'.format(i=task_id))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=variant['save_full_state'])

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'pusher':
        # TODO: assumes `pusher.xml` is located in `rllab/models/` when
        # running on EC2.
        env = normalize(PusherEnv(goal=variant.get('goal')))
    else:
        raise ValueError('Unknown env_name: {}'.format(variant['env_name']))

    pool = SimpleReplayBuffer(env_spec=spec(env),
                              max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    # Derive a unique id from the variant; it is used below to give the
    # Q-function and policy distinct names.
    task_id = abs(hash(pickle.dumps(variant)))

    M = variant['layer_size']
    qf = NNQFunction(env_spec=spec(env),
                     hidden_layer_sizes=(M, M),
                     name='qf_{i}'.format(i=task_id))

    policy = StochasticNNPolicy(env_spec=spec(env),
                                hidden_layer_sizes=(M, M),
                                name='policy_{i}'.format(i=task_id))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=variant['save_full_state'])

    algorithm.train()
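# Hedged sketch (not from the original source): the pusher variants read two
# extra keys on top of those in example_variant above, a 'goal' that is passed
# through to PusherEnv and a 'save_full_state' flag forwarded to SQL. The goal
# value here is a placeholder, not one of the goals used in the experiments.
#
# pusher_variant = dict(example_variant, env_name='pusher',
#                       goal=(-1.0, -1.0), save_full_state=False)
# run_experiment(pusher_variant)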
    def run_task(snapshot_config, *_):

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):

                with LocalTFRunner(
                        snapshot_config=snapshot_config, max_cpus=4, sess=sess) as local_runner:
                    # Instantiate the example classes
                    sim = ExampleAVSimulator(**sim_args)
                    reward_function = ExampleAVReward(**reward_args)
                    spaces = ExampleAVSpaces(**spaces_args)

                    # Create the environment
                    if 'id' in env_args:
                        env_args.pop('id')
                    env = TfEnv(normalize(ASTEnv(simulator=sim,
                                                 reward_function=reward_function,
                                                 spaces=spaces,
                                                 **env_args
                                                 )))

                    # Instantiate the garage objects
                    policy = GaussianLSTMPolicy(env_spec=env.spec, **policy_args)

                    baseline = LinearFeatureBaseline(env_spec=env.spec, **baseline_args)

                    optimizer = ConjugateGradientOptimizer
                    optimizer_args = {'hvp_approach': FiniteDifferenceHvp(base_eps=1e-5)}

                    algo = PPO(env_spec=env.spec,
                               policy=policy,
                               baseline=baseline,
                               optimizer=optimizer,
                               optimizer_args=optimizer_args,
                               **algo_args)

                    sampler_cls = ASTVectorizedSampler

                    local_runner.setup(
                        algo=algo,
                        env=env,
                        sampler_cls=sampler_cls,
                        sampler_args={"open_loop": False,
                                      "sim": sim,
                                      "reward_function": reward_function,
                                      'n_envs': n_parallel})

                    # Run the experiment
                    local_runner.train(**runner_args)
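# Hedged launch sketch (assumed, not shown above): in the AST Toolbox examples,
# a nested run_task like this is normally handed to garage's run_experiment
# launcher together with a run_experiment_args dict (whose 'log_dir' entry is
# referenced by later snippets). The exact call below is an assumption:
#
# from garage.experiment import run_experiment
# run_experiment(run_task, **run_experiment_args)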
# Example 6
def test():

    env = normalize(MultiGoalEnv())

    pool = SimpleReplayBuffer(env_spec=spec(env), max_replay_buffer_size=1e6)

    sampler = SimpleSampler(max_path_length=30,
                            min_pool_size=100,
                            batch_size=64)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 100,
        'n_epochs': 1000,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    M = 128
    policy = StochasticNNPolicy(spec(env),
                                hidden_layer_sizes=(M, M),
                                squash=True)

    qf = NNQFunction(env_spec=spec(env), hidden_layer_sizes=[M, M])

    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0],
                                                [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)

    algorithm = SQL(base_kwargs=base_kwargs,
                    env=env,
                    pool=pool,
                    qf=qf,
                    policy=policy,
                    plotter=plotter,
                    policy_lr=3e-4,
                    qf_lr=3e-4,
                    value_n_particles=16,
                    td_target_update_interval=1000,
                    kernel_fn=adaptive_isotropic_gaussian_kernel,
                    kernel_n_particles=32,
                    kernel_update_ratio=0.5,
                    discount=0.99,
                    reward_scale=0.1,
                    save_full_state=False)

    algorithm.train()
# Example 7
def test():

    env = normalize(MultiGoalEnv())

    pool = SimpleReplayBuffer(env_spec=spec(env), max_replay_buffer_size=1e6)

    sampler = SimpleSampler(
        max_path_length=30, min_pool_size=100, batch_size=64)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 100,
        'n_epochs': 1000,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    M = 128
    policy = StochasticNNPolicy(
        spec(env), hidden_layer_sizes=(M, M), squash=True)

    qf = NNQFunction(env_spec=spec(env), hidden_layer_sizes=[M, M])

    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100)

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        plotter=plotter,
        policy_lr=3e-4,
        qf_lr=3e-4,
        value_n_particles=16,
        td_target_update_interval=1000,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=32,
        kernel_update_ratio=0.5,
        discount=0.99,
        reward_scale=0.1,
        save_full_state=False)

    algorithm.train()
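# Minimal way to exercise the MultiGoal sanity check above when this file is
# run as a script (not part of the original snippet):
if __name__ == '__main__':
    test()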
def run_experiment(variant):
    env = normalize(SwimmerEnv())

    pool = SimpleReplayBuffer(
        env_spec=spec(env), max_replay_buffer_size=1e6)

    sampler = SimpleSampler(
        max_path_length=1000,
        min_pool_size=1000,
        batch_size=128)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=500,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    with tf.Session().as_default():
        data = joblib.load(variant['file'])
        if 'algo' in data.keys():
            saved_qf = data['algo'].qf
            saved_policy = data['algo'].policy
        else:
            saved_qf = data['qf']
            saved_policy = data['policy']

        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=saved_qf,
            policy=saved_policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=16,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=3E-4,
            policy_lr=3E-4,
            discount=0.99,
            reward_scale=30,
            use_saved_qf=True,
            use_saved_policy=True,
            save_full_state=False)

        algorithm.train()
# Example 9
def run_task(snapshot_config, *_):

    with LocalTFRunner(snapshot_config=snapshot_config, max_cpus=1) as runner:

        # Instantiate the example classes
        sim = ExampleAVSimulator()
        reward_function = ExampleAVReward()
        spaces = ExampleAVSpaces()

        # Create the environment
        env = TfEnv(
            normalize(
                ASTEnv(blackbox_sim_state=True,
                       fixed_init_state=True,
                       s_0=[-0.5, -4.0, 1.0, 11.17, -35.0],
                       simulator=sim,
                       reward_function=reward_function,
                       spaces=spaces)))

        # Instantiate the garage objects
        policy = GaussianLSTMPolicy(name='lstm_policy',
                                    env_spec=env.spec,
                                    hidden_dim=64)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=0.99,
                    kl_constraint='soft',
                    max_kl_step=0.01)

        sampler_cls = ASTVectorizedSampler

        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=sampler_cls,
                     sampler_args={
                         "sim": sim,
                         "reward_function": reward_function
                     })

        runner.train(n_epochs=1, batch_size=4000, plot=False)

        print("Installation successfully validated")
# Example 10
def run_experiment(variant):
    env = normalize(SwimmerEnv())

    pool = SimpleReplayBuffer(env_spec=spec(env), max_replay_buffer_size=1e6)

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=128)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=500,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    with tf.Session().as_default():
        data = joblib.load(variant['file'])
        if 'algo' in data.keys():
            saved_qf = data['algo'].qf
            saved_policy = data['algo'].policy
        else:
            saved_qf = data['qf']
            saved_policy = data['policy']

        algorithm = SQL(base_kwargs=base_kwargs,
                        env=env,
                        pool=pool,
                        qf=saved_qf,
                        policy=saved_policy,
                        kernel_fn=adaptive_isotropic_gaussian_kernel,
                        kernel_n_particles=16,
                        kernel_update_ratio=0.5,
                        value_n_particles=16,
                        td_target_update_interval=1000,
                        qf_lr=3E-4,
                        policy_lr=3E-4,
                        discount=0.99,
                        reward_scale=30,
                        use_saved_qf=True,
                        use_saved_policy=True,
                        save_full_state=False)

        algorithm.train()
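# Hedged sketch of the variant these resume-style examples expect: 'file' must
# point at a joblib snapshot holding either a full algorithm under the 'algo'
# key or the networks directly under 'qf' and 'policy' (see the branch above).
# The path below is a placeholder, not a real snapshot from the experiments.
#
# resume_variant = {'file': '/path/to/snapshots/itr_499.pkl'}
# run_experiment(resume_variant)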
# Example 11
def run_experiment(variant):
    env = normalize(PusherEnv(goal=variant.get('goal')))

    buffer1, qf1 = load_buffer_and_qf(variant['snapshot1'])
    buffer2, qf2 = load_buffer_and_qf(variant['snapshot2'])

    sampler = DummySampler(
        batch_size=variant['batch_size'],
        max_path_length=variant['max_path_length'])
    buffer = UnionBuffer(buffers=(buffer1, buffer2))

    qf = SumQFunction(spec(env), q_functions=(qf1, qf2))

    M = variant['layer_size']
    policy = StochasticNNPolicy(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='policy{i}'.format(i=0))

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=buffer,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        policy_lr=variant['policy_lr'],
        save_full_state=False,
        train_policy=True,
        train_qf=False,
        use_saved_qf=True)

    algorithm.train()
# Example 12
def run_experiment(variant):
    env = normalize(PusherEnv(goal=variant.get('goal')))

    buffer1, qf1 = load_buffer_and_qf(variant['snapshot1'])
    buffer2, qf2 = load_buffer_and_qf(variant['snapshot2'])

    sampler = DummySampler(batch_size=variant['batch_size'],
                           max_path_length=variant['max_path_length'])
    buffer = UnionBuffer(buffers=(buffer1, buffer2))

    qf = SumQFunction(spec(env), q_functions=(qf1, qf2))

    M = variant['layer_size']
    policy = StochasticNNPolicy(env_spec=spec(env),
                                hidden_layer_sizes=(M, M),
                                name='policy{i}'.format(i=0))

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    algorithm = SQL(base_kwargs=base_kwargs,
                    env=env,
                    pool=buffer,
                    qf=qf,
                    policy=policy,
                    kernel_fn=adaptive_isotropic_gaussian_kernel,
                    kernel_n_particles=variant['kernel_particles'],
                    kernel_update_ratio=variant['kernel_update_ratio'],
                    policy_lr=variant['policy_lr'],
                    save_full_state=False,
                    train_policy=True,
                    train_qf=False,
                    use_saved_qf=True)

    algorithm.train()
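# The two pusher-combination snippets above rely on a load_buffer_and_qf helper
# that is not shown. A minimal sketch of what it plausibly does, assuming the
# snapshots are joblib pickles; the key names ('replay_buffer', 'qf') are
# guesses, not confirmed against the original code:
#
# def load_buffer_and_qf(filename):
#     data = joblib.load(filename)
#     return data['replay_buffer'], data['qf']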
# Example 13
def test_can_create_env(self):
    # Fixes https://github.com/rlworkgroup/garage/pull/420
    env = normalize(SwimmerEnv())
    assert env
    def run_task(snapshot_config, *_):

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):

                with LocalTFRunner(snapshot_config=snapshot_config,
                                   max_cpus=4,
                                   sess=sess) as local_runner:
                    # Instantiate the example classes
                    sim = ExampleAVSimulator(**sim_args)
                    reward_function = ExampleAVReward(**reward_args)
                    spaces = ExampleAVSpaces(**spaces_args)

                    # Create the environment
                    if 'id' in env_args:
                        env_args.pop('id')
                    env = TfEnv(
                        normalize(
                            ASTEnv(simulator=sim,
                                   reward_function=reward_function,
                                   spaces=spaces,
                                   **env_args)))

                    # Instantiate the garage objects
                    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                                **policy_args)

                    baseline = LinearFeatureBaseline(env_spec=env.spec,
                                                     **baseline_args)

                    optimizer = ConjugateGradientOptimizer
                    optimizer_args = {
                        'hvp_approach': FiniteDifferenceHvp(base_eps=1e-5)
                    }

                    algo = PPO(env_spec=env.spec,
                               policy=policy,
                               baseline=baseline,
                               optimizer=optimizer,
                               optimizer_args=optimizer_args,
                               **algo_args)

                    sampler_cls = ASTVectorizedSampler
                    sampler_args['sim'] = sim
                    sampler_args['reward_function'] = reward_function

                    local_runner.setup(algo=algo,
                                       env=env,
                                       sampler_cls=sampler_cls,
                                       sampler_args=sampler_args)

                    # Run the experiment
                    local_runner.train(**runner_args)

                    if save_expert_trajectory:
                        load_convert_and_save_drl_expert_trajectory(
                            last_iter_filename=os.path.join(
                                run_experiment_args['log_dir'], 'itr_' +
                                str(runner_args['n_epochs'] - 1) + '.pkl'),
                            expert_trajectory_filename=os.path.join(
                                run_experiment_args['log_dir'],
                                'expert_trajectory.pkl'))

                    print('done!')
    def run_task(snapshot_config, *_):

        config = tf.ConfigProto(device_count={'GPU': 0})
        # config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):

                with LocalTFRunner(snapshot_config=snapshot_config,
                                   sess=sess) as runner:

                    # Instantiate the example classes
                    # sim = ExampleAVSimulator()
                    g = 9.8  # acceleration due to gravity

                    # this is y
                    lat_params = rss.LateralParams(
                        0,  # ρ
                        0.1 * g,  # a_lat_max_acc
                        0.05 * g,  # a_lat_min_brake
                        1.4  # Buffer distance
                    )

                    # this is x
                    long_params = rss.LongitudinalParams(
                        0,  # ρ
                        0.7 * g,  # a_max_brake
                        0.1 * g,  # a_max_acc
                        0.7 * g,  # a_min_brake1
                        0.7 * g,  # a_min_brake2
                        2.5,  # Buffer
                    )
                    sim = AVRSSSimulator(lat_params, long_params)
                    reward_function = HeuristicReward(
                        PedestrianNoiseGaussian(1, 1, 0.2, .01),
                        np.array([-10000, -1000, 0]))
                    # reward_function = ExampleAVReward()
                    spaces = ExampleAVSpaces()

                    # Create the environment
                    # env1 = GoExploreASTEnv(open_loop=False,
                    #                        blackbox_sim_state=True,
                    #                        fixed_init_state=True,
                    #                        s_0=[-0.5, -4.0, 1.0, 11.17, -35.0],
                    #                        simulator=sim,
                    #                        reward_function=reward_function,
                    #                        spaces=spaces)
                    s_0 = [-1.0, -2.0, 1.0, 11.17, -35.0]
                    env1 = gym.make('ast_toolbox:GoExploreAST-v1',
                                    open_loop=False,
                                    action_only=True,
                                    fixed_init_state=True,
                                    s_0=s_0,
                                    simulator=sim,
                                    reward_function=reward_function,
                                    spaces=spaces)
                    env2 = normalize(env1)
                    env = TfEnv(env2)

                    # Instantiate the garage objects
                    policy = GoExplorePolicy(env_spec=env.spec)

                    baseline = LinearFeatureBaseline(env_spec=env.spec)

                    algo = GoExplore(
                        db_filename=db_filename,
                        max_db_size=max_db_size,
                        env=env,
                        env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=max_path_length,
                        discount=discount,
                        # whole_paths=whole_paths
                    )

                    sampler_cls = BatchSampler
                    sampler_args = {'n_envs': n_parallel}

                    runner.setup(algo=algo,
                                 env=env,
                                 sampler_cls=sampler_cls,
                                 sampler_args=sampler_args)

                    # runner.setup(
                    #     algo=algo,
                    #     env=env,
                    #     sampler_cls=sampler_cls,
                    #     sampler_args={"sim": sim,
                    #                   "reward_function": reward_function})

                    # Run the experiment
                    paths = runner.train(n_epochs=n_itr,
                                         batch_size=batch_size,
                                         plot=False)
                    print(paths)
                    best_traj = paths.trajectory * np.array([
                        1, 1 / 1000, 1 / 1000, 1 / 1000, 1 / 1000, 1 / 1000,
                        1 / 1000
                    ])
                    peds = sim._peds
                    car = np.expand_dims(sim._car, axis=0)
                    car_obs = sim._car_obs
                    for step in range(best_traj.shape[0]):
                        sim.step(action=best_traj[step, 1:], open_loop=False)
                        peds = np.concatenate((peds, sim._peds), axis=0)
                        car = np.concatenate(
                            (car, np.expand_dims(sim._car, axis=0)), axis=0)
                        car_obs = np.concatenate((car_obs, sim._car_obs),
                                                 axis=0)

                    import matplotlib.pyplot as plt
                    plt.scatter(car[:, 2], car[:, 3])
                    plt.scatter(peds[:, 2], peds[:, 3])
                    plt.scatter(car_obs[:, 2], car_obs[:, 3])
                    pdb.set_trace()
                    print('done!')
# Example 16
    def run_task(snapshot_config, *_):

        config = tf.ConfigProto(device_count={'GPU': 0})
        # config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):

                # Instantiate the example classes
                sim = ExampleAVSimulator(**sim_args)
                reward_function = ExampleAVReward(**reward_args)
                spaces = ExampleAVSpaces(**spaces_args)

                # Create the environment
                # env1 = GoExploreASTEnv(open_loop=False,
                #                              blackbox_sim_state=True,
                #                              fixed_init_state=True,
                #                              s_0=[-0.5, -4.0, 1.0, 11.17, -35.0],
                #                              simulator=sim,
                #                              reward_function=reward_function,
                #                              spaces=spaces
                #                              )
                env1 = gym.make(id=env_args.pop('id'),
                                simulator=sim,
                                reward_function=reward_function,
                                spaces=spaces,
                                **env_args)
                env2 = normalize(env1)
                env = TfEnv(env2)

                sampler_cls = BatchSampler
                # sampler_args = {'n_envs': n_parallel}
                sampler_args = {}
                # expert_trajectory_file = log_dir + '/expert_trajectory.p'
                # with open(expert_trajectory_file, 'rb') as f:
                #     expert_trajectory = pickle.load(f)

                #
                # #Run backwards algorithm to robustify
                with LocalTFRunner(snapshot_config=snapshot_config,
                                   sess=sess) as local_runner:

                    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                                **policy_args)
                    # name='lstm_policy',
                    # env_spec=env.spec,
                    # hidden_dim=64,
                    # use_peepholes=True)

                    baseline = LinearFeatureBaseline(env_spec=env.spec,
                                                     **baseline_args)

                    optimizer = ConjugateGradientOptimizer
                    optimizer_args = {
                        'hvp_approach': FiniteDifferenceHvp(base_eps=1e-5)
                    }

                    algo = BackwardAlgorithm(env=env,
                                             env_spec=env.spec,
                                             policy=policy,
                                             baseline=baseline,
                                             optimizer=optimizer,
                                             optimizer_args=optimizer_args,
                                             **algo_args)
                    # expert_trajectory=expert_trajectory[-1],
                    # epochs_per_step = 10,
                    # scope=None,
                    # max_path_length=max_path_length,
                    # discount=discount,
                    # gae_lambda=1,
                    # center_adv=True,
                    # positive_adv=False,
                    # fixed_horizon=False,
                    # pg_loss='surrogate_clip',
                    # lr_clip_range=1.0,
                    # max_kl_step=1.0,

                    # policy_ent_coeff=0.0,
                    # use_softplus_entropy=False,
                    # use_neg_logli_entropy=False,
                    # stop_entropy_gradient=False,
                    # entropy_method='no_entropy',
                    # name='PPO',
                    # )

                    local_runner.setup(algo=algo,
                                       env=env,
                                       sampler_cls=sampler_cls,
                                       sampler_args=sampler_args)

                    results = local_runner.train(**runner_args)
                    # pdb.set_trace()
                    print('done')
                    log_dir = run_experiment_args['log_dir']
                    with open(log_dir + '/paths.gz', 'wb') as f:
                        try:
                            compress_pickle.dump(results,
                                                 f,
                                                 compression="gzip",
                                                 set_default_extension=False)
                        except MemoryError:
                            print('1')
                            # pdb.set_trace()
                            for idx, result in enumerate(results):
                                with open(
                                        log_dir + '/path_' + str(idx) + '.gz',
                                        'wb') as ff:
                                    try:
                                        compress_pickle.dump(
                                            result,
                                            ff,
                                            compression="gzip",
                                            set_default_extension=False)
                                    except MemoryError:
                                        print('2')
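# Hedged read-back sketch for the compressed results written above (mirrors the
# compress_pickle.dump call; not part of the original snippet):
#
# with open(log_dir + '/paths.gz', 'rb') as f:
#     results = compress_pickle.load(f, compression='gzip',
#                                    set_default_extension=False)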
    def run_task(snapshot_config, *_):

        config = tf.ConfigProto(device_count={'GPU': 0})
        # config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):
                # Instantiate the example classes
                sim = ExampleAVSimulator(**sim_args)
                # blackbox_sim_state=True,
                # open_loop=False,
                # fixed_initial_state=True,
                # max_path_length=max_path_length)
                reward_function = ExampleAVReward(**reward_args)
                spaces = ExampleAVSpaces(**spaces_args)

                # Create the environment
                # env1 = GoExploreASTEnv(open_loop=False,
                #                              blackbox_sim_state=True,
                #                              fixed_init_state=True,
                #                              s_0=[-0.5, -4.0, 1.0, 11.17, -35.0],
                #                              simulator=sim,
                #                              reward_function=reward_function,
                #                              spaces=spaces
                #                              )
                # env1 = gym.make('ast_toolbox:GoExploreAST-v1',
                #                 blackbox_sim_state=True,
                #                 open_loop=False,
                #                 fixed_init_state=True,
                #                 s_0=s_0,
                #                 simulator=sim,
                #                 reward_function=reward_function,
                #                 spaces=spaces
                #                 )
                env1 = gym.make(id=env_args.pop('id'),
                                simulator=sim,
                                reward_function=reward_function,
                                spaces=spaces,
                                **env_args)
                env2 = normalize(env1)
                env = TfEnv(env2)

                # Instantiate the garage objects
                policy = GoExplorePolicy(env_spec=env.spec)

                baseline = LinearFeatureBaseline(env_spec=env.spec,
                                                 **baseline_args)

                algo = GoExplore(env_spec=env.spec,
                                 env=env,
                                 policy=policy,
                                 baseline=baseline,
                                 **algo_args)
                #     db_filename=db_filename,
                #     max_db_size=max_db_size,
                #     env=env,
                #
                #     policy=policy,
                #     baseline=baseline,
                #     # robust_policy=robust_policy,
                #     # robust_baseline=robust_baseline,
                #     max_path_length=max_path_length,
                #     discount=discount,
                #     save_paths_gap=1,
                #     save_paths_path=log_dir,
                #     # whole_paths=whole_paths
                # )

                sampler_cls = BatchSampler
                # sampler_args = {'n_envs': n_parallel}
                sampler_args = {}

                with LocalTFRunner(snapshot_config=snapshot_config,
                                   sess=sess) as local_runner:
                    local_runner.setup(algo=algo,
                                       env=env,
                                       sampler_cls=sampler_cls,
                                       sampler_args=sampler_args)

                    # local_runner.setup(
                    #     algo=algo,
                    #     env=env,
                    #     sampler_cls=sampler_cls,
                    #     sampler_args={"sim": sim,
                    #                   "reward_function": reward_function})

                    # Run the experiment
                    best_cell = local_runner.train(
                        **runner_args
                    )  # n_epochs=n_itr, batch_size=batch_size, plot=False)

                    log_dir = run_experiment_args['log_dir']
                    db_filename = algo_args['db_filename']
                    s_0 = env_args['s_0']

                    pool_DB = db.DB()
                    pool_DB.open(db_filename + '_pool.dat',
                                 dbname=None,
                                 dbtype=db.DB_HASH,
                                 flags=db.DB_CREATE)
                    d_pool = shelve.Shelf(pool_DB,
                                          protocol=pickle.HIGHEST_PROTOCOL)
                    # pdb.set_trace()
                    print(best_cell)
                    temp = best_cell
                    paths = []
                    while temp.parent is not None:
                        print(temp.observation)
                        action = temp.observation[1:].astype(np.float32) / 1000
                        paths.append({
                            'state': temp.state,
                            'reward': temp.reward,
                            'action': action,
                            'observation': np.array(s_0)
                        })
                        temp = d_pool[temp.parent]
                    print(temp.observation)
                    paths.append({
                        'state': temp.state,
                        'reward': temp.reward,
                        'action': action,
                        'observation': np.array(s_0)
                    })
                    # pdb.set_trace()
                    d_pool.close()

                    with open(log_dir + '/expert_trajectory.p', 'wb') as f:
                        pickle.dump([paths], f)
                    print('done!')
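# Read-back sketch mirroring the pickle.dump call above (not part of the
# original snippet): the file holds a single-element list wrapping `paths`,
# which is ordered from the best cell back to the root cell.
#
# with open(log_dir + '/expert_trajectory.p', 'rb') as f:
#     expert_trajectory = pickle.load(f)[0]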