Example #1
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    dynamics = ProbabilisticEnsembleDynamics(use_gpu=device)
    dynamics.fit(train_episodes,
                 eval_episodes=test_episodes,
                 n_steps=100000,
                 scorers={
                     "obs_error": dynamics_observation_prediction_error_scorer,
                     "reward_error": dynamics_reward_prediction_error_scorer,
                 })

    combo = COMBO(q_func_factory=args.q_func,
                  dynamics=dynamics,
                  use_gpu=device)

    combo.fit(train_episodes,
              eval_episodes=test_episodes,
              n_steps=1000000,
              scorers={
                  'environment': evaluate_on_environment(env),
                  'td_error': td_error_scorer,
                  'discounted_advantage': discounted_sum_of_advantage_scorer,
                  'value_scale': average_value_estimation_scorer,
                  'value_std': value_estimation_std_scorer,
                  'action_diff': continuous_action_diff_scorer
              })
Example #2
def test_evaluate_on_environment(observation_shape, action_size,
                                 episode_length, n_trials):
    shape = (n_trials, episode_length + 1) + observation_shape
    observations = np.random.random(shape)

    class DummyEnv:
        def __init__(self):
            self.episode = 0

        def step(self, action):
            self.t += 1
            observation = observations[self.episode - 1, self.t]
            reward = np.mean(observation) + np.mean(action)
            done = self.t == episode_length
            return observation, reward, done, {}

        def reset(self):
            self.t = 0
            self.episode += 1
            return observations[self.episode - 1, 0]

    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    algo = DummyAlgo(A, 0.0)

    ref_rewards = []
    for i in range(n_trials):
        episode_obs = observations[i, :, :]
        actions = algo.predict(episode_obs[:-1])
        rewards = np.mean(episode_obs[1:], axis=1) + np.mean(actions, axis=1)
        ref_rewards.append(np.sum(rewards))

    mean_reward = evaluate_on_environment(DummyEnv(), n_trials)(algo)
    assert np.allclose(mean_reward, np.mean(ref_rewards))
Example #3
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    encoder_factory = VectorEncoderFactory(hidden_units=[256, 256, 256, 256])

    awac = AWAC(actor_encoder_factory=encoder_factory,
                critic_encoder_factory=encoder_factory,
                q_func_factory=args.q_func,
                use_gpu=device)

    awac.fit(train_episodes,
             eval_episodes=test_episodes,
             n_epochs=1000,
             scorers={
                 'environment': evaluate_on_environment(env),
                 'td_error': td_error_scorer,
                 'discounted_advantage': discounted_sum_of_advantage_scorer,
                 'value_scale': average_value_estimation_scorer,
                 'value_std': value_estimation_std_scorer,
                 'action_diff': continuous_action_diff_scorer
             })
Example #4
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    bc = DiscreteBC(
        n_frames=4,  # frame stacking
        scaler='pixel',
        use_gpu=args.gpu)

    bc.fit(train_episodes,
           eval_episodes=test_episodes,
           n_epochs=100,
           scorers={'environment': evaluate_on_environment(env, epsilon=0.05)})
Example #5
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    bc = DiscreteBC(n_epochs=100,
                    scaler='pixel',
                    use_batch_norm=False,
                    use_gpu=device)

    bc.fit(train_episodes,
           eval_episodes=test_episodes,
           scorers={'environment': evaluate_on_environment(env, epsilon=0.05)})
Example #6
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    bc = BC(n_epochs=100, use_gpu=device)

    bc.fit(train_episodes,
           eval_episodes=test_episodes,
           scorers={
               'environment': evaluate_on_environment(env),
               'action_diff': continuous_action_diff_scorer
           })
Example #7
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    awr = AWR(n_epochs=100, use_gpu=device)

    awr.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'value_scale': average_value_estimation_scorer,
                'action_diff': continuous_action_diff_scorer
            })
Example #8
def test_evaluate_on_environment(
    observation_shape, action_size, episode_length, n_trials
):
    shape = (n_trials, episode_length + 1) + observation_shape
    if len(observation_shape) == 3:
        observations = np.random.randint(0, 255, size=shape, dtype=np.uint8)
    else:
        observations = np.random.random(shape).astype("f4")

    class DummyEnv:
        def __init__(self):
            self.episode = 0
            self.observation_space = spaces.Box(
                low=0, high=255, shape=observation_shape
            )

        def step(self, action):
            self.t += 1
            observation = observations[self.episode - 1, self.t]
            reward = np.mean(observation) + np.mean(action)
            done = self.t == episode_length
            return observation, reward, done, {}

        def reset(self):
            self.t = 0
            self.episode += 1
            return observations[self.episode - 1, 0]

    # projection matrix for deterministic action
    feature_size = reduce(mul, observation_shape)
    A = np.random.random((feature_size, action_size))
    algo = DummyAlgo(A, 0.0)

    ref_rewards = []
    for i in range(n_trials):
        episode_obs = observations[i].reshape((-1, feature_size))
        actions = algo.predict(episode_obs[:-1])
        rewards = np.mean(episode_obs[1:], axis=1) + np.mean(actions, axis=1)
        ref_rewards.append(np.sum(rewards))

    mean_reward = evaluate_on_environment(DummyEnv(), n_trials)(algo)
    assert np.allclose(mean_reward, np.mean(ref_rewards))
Example #9
def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    sac = SAC(n_epochs=100, q_func_type=args.q_func_type, use_gpu=device)

    sac.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
                'value_std': value_estimation_std_scorer,
                'action_diff': continuous_action_diff_scorer
            })
Example #10
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    dqn = DQN(
        n_frames=4,  # frame stacking
        q_func_type=args.q_func_type,
        scaler='pixel',
        use_gpu=args.gpu)

    dqn.fit(train_episodes,
            eval_episodes=test_episodes,
            n_epochs=100,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
Example #11
def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    dqn = DQN(n_epochs=100,
              q_func_type=args.q_func_type,
              scaler='pixel',
              use_batch_norm=False,
              use_gpu=device)

    dqn.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
Example #12
def train(params):
    # setup algorithm
    if pretrain:

        dqn = DQN(batch_size=params.get("batch_size"),
                  learning_rate=params.get("learning_rate"),
                  target_update_interval=params.get("target_update_interval"),
                  q_func_factory=QRQFunctionFactory(
                      n_quantiles=params.get("n_quantiles")),
                  n_steps=params.get("train_freq"),
                  gamma=params.get("gamma"),
                  n_critics=1,
                  target_reduction_type="min",
                  use_gpu=True)

        # setup replay buffer
        buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

        # setup explorers
        explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=params.get("exploration_final_eps"),
            duration=100000)

        # start training
        dqn.fit_online(
            env,
            buffer,
            n_steps=params.get("train_steps"),
            explorer=explorer,  # you don't need this with probabilistic policy algorithms
            tensorboard_dir=log_dir,
            eval_env=eval_env)

        print("Saving Model")
        dqn.save_model(exp_name)

        print("convert buffer to dataset")
        dataset = buffer.to_mdp_dataset()
        # save MDPDataset
        dataset.dump('{0}.h5'.format(exp_name))

    print("Loading Dataset for Offline Training")
    dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)
    # The dataset can then be used to train a d3rlpy model

    cql = DiscreteCQL(learning_rate=6.25e-05,
                      encoder_factory='default',
                      q_func_factory='mean',
                      batch_size=32,
                      n_frames=1,
                      n_steps=1,
                      gamma=0.99,
                      n_critics=1,
                      bootstrap=False,
                      share_encoder=False,
                      target_reduction_type='min',
                      target_update_interval=8000,
                      use_gpu=True,
                      scaler=None,
                      augmentation=None,
                      generator=None,
                      impl=None)

    cql_exp = params.get("model_name") + "_offline_" + params.get(
        "environment")
    cql_log = '../../../logs/' + cql_exp

    cql.fit(dataset.episodes,
            eval_episodes=test_episodes,
            n_epochs=1000,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
            },
            tensorboard_dir=cql_log)

    cql.save_model(cql_exp)
Example #13
from d3rlpy.algos import CQL
from d3rlpy.ope import FQE
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer

dataset, env = get_pybullet('hopper-bullet-mixed-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = CQL(n_epochs=100, use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })

# or load the trained model
# cql = CQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
fqe = FQE(algo=cql,
          n_epochs=200,
          q_func_factory='qr',
          learning_rate=1e-4,
          use_gpu=True,
          encoder_params={'hidden_units': [1024, 1024, 1024, 1024]})
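
The snippet above stops after constructing the FQE estimator. A plausible continuation, sketched here under the assumption that the same held-out episodes and the d3rlpy OPE scorers already imported above are reused, fits FQE and reports the off-policy estimates:

# sketch: fit the FQE estimator on the held-out episodes (assumed continuation)
fqe.fit(test_episodes,
        eval_episodes=test_episodes,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })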
Example #14
from d3rlpy.metrics.scorer import soft_opc_scorer

dataset, env = get_atari('breakout-expert-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = DiscreteCQL(n_epochs=100,
                  scaler='pixel',
                  q_func_factory='qr',
                  n_frames=4,
                  use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })

# or load the trained model
# cql = DiscreteCQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
fqe = DiscreteFQE(algo=cql,
                  n_epochs=100,
                  q_func_factory='qr',
                  learning_rate=1e-4,
                  scaler='pixel',
                  n_frames=4)
Example #15
from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.context import parallel
from sklearn.model_selection import GridSearchCV

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm with GPU enabled
dqn = DQN(use_gpu=True)

# grid search with multiple GPUs assigned to individual processes
with parallel():
    env_score = evaluate_on_environment(env)
    gscv = GridSearchCV(estimator=dqn,
                        param_grid={
                            'learning_rate': [1e-3, 3e-4, 1e-4],
                            'gamma': [0.99, 0.95, 0.9]
                        },
                        scoring={'environment': env_score},
                        refit=False,
                        n_jobs=3)
    gscv.fit(dataset.episodes, n_epochs=1, show_progress=False)

# summarized results of the grid search
print(gscv.cv_results_)
Example #16
from d3rlpy.algos import DQN
from d3rlpy.datasets import get_cartpole
from d3rlpy.metrics.scorer import evaluate_on_environment

# obtain dataset
dataset, env = get_cartpole()

# setup algorithm
dqn = DQN(n_epochs=1)

# train
dqn.fit(dataset.episodes)

# evaluate trained algorithm
evaluate_on_environment(env, render=True)(dqn)
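
As the tests above illustrate, the callable returned by evaluate_on_environment can also be invoked directly to capture the mean episode return; a minimal sketch (the n_trials keyword is assumed from the test calls above):

# run 10 evaluation episodes and keep the mean return (sketch)
mean_return = evaluate_on_environment(env, n_trials=10)(dqn)
print(mean_return)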