Code example #1
import d3rlpy
from d3rlpy.datasets import get_atari
from d3rlpy.algos import DiscreteCQL
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split


def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    cql = DiscreteCQL(
        n_frames=4,  # frame stacking
        q_func_type=args.q_func_type,
        scaler='pixel',
        use_gpu=args.gpu)

    cql.fit(train_episodes,
            eval_episodes=test_episodes,
            n_epochs=100,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
Code example #2
import d3rlpy
from d3rlpy.datasets import get_atari
from d3rlpy.algos import DiscreteCQL
from d3rlpy.gpu import Device
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split


def main(args):
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    cql = DiscreteCQL(n_epochs=100,
                      q_func_type=args.q_func_type,
                      scaler='pixel',
                      use_batch_norm=False,
                      use_gpu=device)

    cql.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer
            })
Code example #3
    def save_policy(self, path, epoch, as_onnx):
        params_path = os.path.join(self.get_log_path(), 'params.json')
        model_path = os.path.join(self.get_log_path(), 'model_%d.pt' % epoch)

        if not os.path.exists(model_path):
            raise ValueError('%s does not exist.' % model_path)

        # initialize algorithm from json file
        if self.project.algorithm == 'cql':
            if self.project.dataset.is_discrete:
                algo = DiscreteCQL.from_json(params_path)
            else:
                algo = CQL.from_json(params_path)
        else:
            raise ValueError('unsupported algorithm.')

        # load model parameters
        algo.load_model(model_path)

        # save TorchScript policy
        algo.save_policy(path, as_onnx)
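The file written by save_policy above is a self-contained TorchScript module (or an ONNX graph when as_onnx is set), so the exported greedy policy can be queried without d3rlpy installed. A minimal sketch of the TorchScript case, assuming an Atari-style 1x4x84x84 observation batch; the file name and shape are illustrative:

import torch

# load the exported greedy policy (the path is an illustrative placeholder)
policy = torch.jit.load('policy.pt')

with torch.no_grad():
    # one stacked-frame observation batch shaped like the training data
    observation = torch.rand(1, 4, 84, 84)
    action = policy(observation)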
Code example #4
import d3rlpy
from d3rlpy.algos import DQN, DiscreteCQL
from d3rlpy.models.q_functions import QRQFunctionFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

# `pretrain`, `env`, `eval_env`, `log_dir` and `exp_name` are defined in the
# surrounding script and are not shown in this excerpt.


def train(params):
    # setup algorithm
    if pretrain:

        dqn = DQN(batch_size=params.get("batch_size"),
                  learning_rate=params.get("learning_rate"),
                  target_update_interval=params.get("target_update_interval"),
                  q_func_factory=QRQFunctionFactory(
                      n_quantiles=params.get("n_quantiles")),
                  n_steps=params.get("train_freq"),
                  gamma=params.get("gamma"),
                  n_critics=1,
                  target_reduction_type="min",
                  use_gpu=True)

        # setup replay buffer
        buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

        # setup explorers
        explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=params.get("exploration_final_eps"),
            duration=100000)

        # start training
        dqn.fit_online(
            env,
            buffer,
            n_steps=params.get("train_steps"),
            explorer=explorer,  # not needed with probabilistic policy algorithms
            tensorboard_dir=log_dir,
            eval_env=eval_env)

        print("Saving Model")
        dqn.save_model(exp_name)

        print("convert buffer to dataset")
        dataset = buffer.to_mdp_dataset()
        # save MDPDataset
        dataset.dump('{0}.h5'.format(exp_name))

    print("Loading Dataset for Offline Training")
    dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)
    # The dataset can then be used to train a d3rlpy model

    cql = DiscreteCQL(learning_rate=6.25e-05,
                      encoder_factory='default',
                      q_func_factory='mean',
                      batch_size=32,
                      n_frames=1,
                      n_steps=1,
                      gamma=0.99,
                      n_critics=1,
                      bootstrap=False,
                      share_encoder=False,
                      target_reduction_type='min',
                      target_update_interval=8000,
                      use_gpu=True,
                      scaler=None,
                      augmentation=None,
                      generator=None,
                      impl=None)

    cql_exp = (params.get("model_name") + "_offline_" +
               params.get("environment"))
    cql_log = '../../../logs/' + cql_exp

    cql.fit(dataset.episodes,
            eval_episodes=test_episodes,
            n_epochs=1000,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
            },
            tensorboard_dir=cql_log)

    cql.save_model(cql_exp)
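Once the offline run above finishes, the saved policy can be reloaded and queried greedily with predict. A minimal sketch, reusing cql_exp and exp_name from the function above and assuming the params.json that fit writes into its d3rlpy_logs directory (that path is an illustrative placeholder):

import d3rlpy
from d3rlpy.algos import DiscreteCQL

# rebuild the algorithm from the hyperparameters logged by fit()
# ('d3rlpy_logs/DiscreteCQL/params.json' stands in for the actual run directory)
cql = DiscreteCQL.from_json('d3rlpy_logs/DiscreteCQL/params.json')
cql.load_model(cql_exp)

# greedy action selection over a batch of observations from the saved dataset
dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
actions = cql.predict(dataset.observations[:8])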
Code example #5
File: fqe_atari.py  Project: wx-b/d3rlpy
from sklearn.model_selection import train_test_split
from d3rlpy.datasets import get_atari
from d3rlpy.algos import DiscreteCQL
from d3rlpy.ope import DiscreteFQE
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer

dataset, env = get_atari('breakout-expert-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = DiscreteCQL(n_epochs=100,
                  scaler='pixel',
                  q_func_factory='qr',
                  n_frames=4,
                  use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })

# or load the trained model
# cql = DiscreteCQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy
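The snippet ends where the evaluation step it announces would begin. A minimal sketch of that step, assuming DiscreteFQE (imported above) wraps the trained policy through its algo argument; the hyperparameters, and the placement of n_epochs in the constructor, simply mirror the DiscreteCQL call above and are illustrative:

fqe = DiscreteFQE(algo=cql,
                  n_epochs=100,
                  scaler='pixel',
                  n_frames=4,
                  use_gpu=True)

fqe.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(70)
        })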
Code example #6
File: image.py  Project: ritou11/d3rlpy
from d3rlpy.datasets import get_atari
from d3rlpy.algos import DiscreteCQL
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

dataset, env = get_atari('breakout-expert-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

cql = DiscreteCQL(scaler='pixel',
                  n_frames=4,
                  augmentation=['random_shift', 'intensity'],
                  use_gpu=True)

cql.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=100,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'td_error': td_error_scorer,
            'discounted_advantage': discounted_sum_of_advantage_scorer,
            'value_scale': average_value_estimation_scorer
        })
Code example #7
from d3rlpy.algos import DiscreteCQL
from d3rlpy.models.optimizers import AdamFactory
from d3rlpy.datasets import get_atari
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

dataset, env = get_atari('breakout-medium-v0')

_, test_episodes = train_test_split(dataset, test_size=0.2)

cql = DiscreteCQL(optim_factory=AdamFactory(eps=1e-2 / 32),
                  scaler='pixel',
                  n_frames=4,
                  q_func_factory='qr',
                  use_gpu=True)

cql.fit(dataset.episodes,
        eval_episodes=test_episodes,
        n_epochs=2000,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.001),
            'value_scale': average_value_estimation_scorer
        })