Example #1
def get_ens_callbacks(params, env):
    return [
        rcb.EpisodeUpdater(**params.get('memory_update', {})),
        cb.Checkpointer(frequency=10),
        rcb.UncertaintyUpdater(head=hat.EntropyHat()),
        rcb.EnvironmentEvaluator(env=env,
                                 n_evaluations=10,
                                 action_selector=sp.GreedyValueSelection(
                                     post_pipeline=[hat.EnsembleHat()]),
                                 metrics={
                                     'det_val_reward_mean': np.mean,
                                     'det_val_reward_std': np.std
                                 },
                                 frequency=10,
                                 epoch_name='det_val_epoch'),
        rcb.EnvironmentEvaluator(env=env,
                                 n_evaluations=10,
                                 action_selector=get_selection_strategy(
                                     params['eval_selection_strategy'],
                                     params.get('selection_args', {})),
                                 metrics={
                                     'prob_val_reward_mean': np.mean,
                                     'prob_val_reward_std': np.std
                                 },
                                 frequency=10,
                                 epoch_name='prob_val_epoch'),
        rcb.EnsembleRewardPlotter(frequency=10,
                                  metrics={
                                      'det_val_reward_mean': 'det_val_epoch',
                                      'prob_val_reward_mean': 'prob_val_epoch',
                                  })
    ]
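Example #1 only defines the callback factory; none of the snippets call it directly. A minimal sketch of how it could be wired in, assuming it is resolved from the experiment's script via source_function (the commented-out line in Example #3 hints at exactly this) and handed to the learner as its callbacks list:

# Sketch only - the (params, env) signature comes from the definition above; resolving the
# function through Experiment.get_factory(source_function=...) is an assumption taken from
# the commented-out line in Example #3, and the experiment root is hypothetical.
experiment = Experiment(root='experiments/dqn_ensemble')
params = experiment.get_params()
env = MultiInstanceGym(**params['env_args'])

callbacks_factory = experiment.get_factory(source_function='get_ens_callbacks')
callbacks = callbacks_factory(params=params, env=env)
# The resulting list would then be passed as callbacks=... to DQNEnsemble, as in Example #3.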
Example #2
def factory(Model, core, model_args, env_args, optim_args, memory_args,
            learner_args, name):
    model = Model(core, **model_args)
    env = CartPole(**env_args)
    memory_updater = MemoryUpdater(**memory_args)

    optim = torch.optim.SGD(model.parameters(), **optim_args)
    crit = torch.nn.MSELoss()
    learner_args['name'] = name

    return pg.QLearner(env=env,
                       model=model,
                       optimizer=optim,
                       memory_updater=memory_updater,
                       crit=crit,
                       action_selector=sp.QActionSelection(temperature=.3),
                       callbacks=[
                           cb.Checkpointer(),
                           rcb.EnvironmentEvaluator(env=env,
                                                    n_evaluations=10,
                                                    frequency=1),
                       ],
                       **learner_args)
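The factory above is never invoked by hand in these snippets; Examples #3 and #7 hand it to an ensemble wrapper, which presumably calls it once per member with the contents of params['factory_args'] plus a per-member name. A trimmed-down sketch of that wiring, reusing only the calls shown in Examples #3 and #7 (the experiment root is hypothetical and the optional callbacks are omitted):

# Sketch only - Experiment, Ensemble and the param keys ('factory_args', 'n_learner',
# 'fit') are taken from Examples #3 and #7; the root path below is hypothetical.
experiment = Experiment(root='experiments/cartpole_dqn')
factory = experiment.get_factory()       # resolves the factory() defined above
params = experiment.get_params()
Model = experiment.get_model_class()

learner = Ensemble(model_class=Model,
                   trainer_factory=factory,
                   trainer_args=params['factory_args'],  # unpacked into factory(...)
                   n_model=params['n_learner'])
learner.fit(**params['fit'])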
Example #3
def run(root, path_script):
    experiment = Experiment(root=root)
    factory = experiment.get_factory()
    params = experiment.get_params()
    # callbacks_factory = experiment.get_factory(source_function='get_ens_callbacks')
    params['factory_args']['learner_args']['dump_path'] = root
    # params['factory_args']['dump_path'] = root

    Model = experiment.get_model_class()
    experiment.document_script(path_script, overwrite=params['overwrite'])
    env = MultiInstanceGym(**params['env_args'])
    params['factory_args']['model_args'][
        'in_nodes'] = env.observation_space.shape[0]
    params['factory_args']['model_args']['out_nodes'] = env.action_space.n
    params['factory_args']['env'] = env

    dqn_player = get_player(key=params['player_type'])
    selection_strategy = get_selection_strategy(
        params['selection_strategy'], params.get('selection_args', {}))

    with with_experiment(experiment=experiment, overwrite=params['overwrite']):
        memory = get_memory(params['memory_type'], params['memory_args'])
        params['factory_args']['learner_args']['memory'] = memory

        learner = DQNEnsemble(
            model_class=Model,
            trainer_factory=factory,
            memory=memory,
            env=env,
            player=dqn_player,
            selection_strategy=selection_strategy,
            trainer_args=params['factory_args'],
            n_model=params['n_learner'],
            dump_path=root,
            callbacks=[
                rcb.EpisodeUpdater(**params.get('memory_update', {}),
                                   frequency=5),
                rcb.UncertaintyUpdater(hat=hat.EntropyHat()),
                cb.Checkpointer(frequency=10),
                rcb.EnvironmentEvaluator(
                    env=env,
                    n_evaluations=10,
                    action_selector=sp.GreedyValueSelection(
                        post_pipeline=[hat.EnsembleHat()]),
                    metrics={
                        'det_val_reward_mean': np.mean,
                        'det_val_reward_std': np.std
                    },
                    frequency=10,
                    epoch_name='det_val_epoch'),
                rcb.EnvironmentEvaluator(
                    env=env,
                    n_evaluations=10,
                    action_selector=get_selection_strategy(
                        params['eval_selection_strategy'],
                        params.get('selection_args', {})),
                    metrics={
                        'prob_val_reward_mean': np.mean,
                        'prob_val_reward_std': np.std
                    },
                    frequency=10,
                    epoch_name='prob_val_epoch'),
                rcb.EnsembleRewardPlotter(
                    frequency=10,
                    metrics={
                        'det_val_reward_mean': 'det_val_epoch',
                        'prob_val_reward_mean': 'prob_val_epoch',
                    })
            ])
        learner.fit(**params['fit'])
Example #4
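# Excerpt: keyword arguments and callbacks for what appears to be the same DQNEnsemble(...)
# call as in Example #3; the constructor head and the tail of the second
# EnvironmentEvaluator are cut off in this snippet.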
 memory=memory,
 env=env,
 player=dqn_player,
 selection_strategy=selection_strategy,
 trainer_args=params['factory_args'],
 n_model=params['n_learner'],
 callbacks=[
     rcb.EpisodeUpdater(**params.get('memory_update', {})),
     cb.Checkpointer(frequency=1),
     rcb.UncertaintyUpdater(),
     rcb.EnvironmentEvaluator(
         env=TorchGym(**params['factory_args']['env_args']),
         n_evaluations=10,
         action_selector=sp.GreedyValueSelection(
             post_pipeline=[EnsembleHat()]),
         metrics={
             'det_val_reward_mean': np.mean,
             'det_val_reward_std': np.std
         },
         frequency=1,
         epoch_name='det_val_epoch'),
     rcb.EnvironmentEvaluator(
         env=TorchGym(**params['factory_args']['env_args']),
         n_evaluations=10,
         action_selector=sp.QActionSelection(temperature=params['temp'],
                                             post_pipeline=[EnsembleHat()]),
         metrics={
             'prob_val_reward_mean': np.mean,
             'prob_val_reward_std': np.std
         },
         frequency=1,
Example #5
def run(root, path_script):
    print(root, path_script)
    experiment = Experiment(root=root)
    factory = experiment.get_factory()
    params = experiment.get_params()
    params['factory_args']['learner_args']['dump_path'] = root

    Model = experiment.get_model_class()
    experiment.document_script(path_script, overwrite=params['overwrite'])
    # env = MultiInstanceGym(**params['factory_args']['env_args'])
    env = TorchGym(params['factory_args']['env_args']['env_name'])
    params['factory_args']['model_args'][
        'in_nodes'] = env.observation_space.shape[0]
    params['factory_args']['model_args']['out_nodes'] = env.action_space.n

    dqn_player = DQNPlayer()
    # selection_strategy = sp.AdaptiveQActionSelectionEntropy(warm_up=0,
    #                                                         post_pipeline=[EnsembleHatStd()])
    # selection_strategy = sp.QActionSelection(post_pipeline=[EnsembleHat()])
    selection_strategy = sp.EpsilonGreedyActionSelection(
        action_space=[0, 1, 2, 3], post_pipeline=[EnsembleHat()])

    with with_experiment(experiment=experiment, overwrite=params['overwrite']):
        memory = Memory(**params["factory_args"]['memory_args'])
        params['factory_args']['learner_args']['memory'] = memory

        learner = DQNEnsemble(
            model_class=Model,
            trainer_factory=factory,
            memory=memory,
            env=env,
            player=dqn_player,
            selection_strategy=selection_strategy,
            trainer_args=params['factory_args'],
            n_model=params['n_learner'],
            callbacks=[
                rcb.EpisodeUpdater(**params.get('memory_update', {})),
                cb.Checkpointer(frequency=1),
                # rcb.UncertaintyUpdater(),
                rcb.EnvironmentEvaluator(
                    env=TorchGym(
                        params['factory_args']['env_args']['env_name']),
                    n_evaluations=10,
                    action_selector=sp.GreedyValueSelection(
                        post_pipeline=[EnsembleHat()]),
                    metrics={
                        'det_val_reward_mean': np.mean,
                        'det_val_reward_std': np.std
                    },
                    frequency=1,
                    epoch_name='det_val_epoch'),
                rcb.EnvironmentEvaluator(
                    env=TorchGym(
                        params['factory_args']['env_args']['env_name']),
                    n_evaluations=10,
                    action_selector=selection_strategy,
                    metrics={
                        'prob_val_reward_mean': np.mean,
                        'prob_val_reward_std': np.std
                    },
                    frequency=1,
                    epoch_name='prob_val_epoch'),
                rcb.EnsembleRewardPlotter(
                    metrics={
                        'det_val_reward_mean': 'det_val_epoch',
                        'prob_val_reward_mean': 'prob_val_epoch',
                    }),
            ])

        # learner.load_checkpoint(path=f'{root}/checkpoint', tag='checkpoint')
        learner.fit(**params['fit'])
Example #6
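# Excerpt: trailing constructor arguments (criterion, action selector, training
# hyperparameters, callbacks) of a policy-gradient learner; the call head that assigns
# `learner` is cut off in this snippet.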
    crit=crit,
    action_selector=rl.PolicyGradientActionSelection(),
    # action_selector=pg.BayesianDropoutActionSelection(50),
    gamma=.9,
    batch_size=256,
    n_samples=2048,
    grad_clip=20.,
    memory_size=1000,
    load_checkpoint=False,
    name='test_pg',
    # callbacks=[cb.LastRewardPlotter(),
    #            cb.RewardPlotter(),
    #            cb.SmoothedRewardPlotter(window=6),
    #            cb.EnvironmentEvaluator(env=env, render=True, frequency=5)],
    callbacks=[
        rcb.EnvironmentEvaluator(env=env, n_evaluations=10, frequency=5),
        rcb.AgentVisualizer(env=env, frequency=5),
        cb.MetricPlotter(frequency=1, metric='rewards', smoothing_window=100),
        cb.MetricPlotter(frequency=1,
                         metric='train_losses',
                         smoothing_window=100),
        cb.MetricPlotter(frequency=1, metric='avg_reward', smoothing_window=5),
        cb.MetricPlotter(frequency=5,
                         metric='val_reward',
                         x='val_epoch',
                         smoothing_window=5),
    ],
    dump_path='tests/policy_gradient/tmp',
    device='cpu')

learner.fit(100, 'cpu', restore_early_stopping=False, verbose=False)
Example #7
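# Excerpt: only the else branch is shown; it reads the script path and the experiment
# root from the command line, while the matching if branch is cut off in this snippet.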
else:
    path_script = sys.argv[0]
    root = sys.argv[1]

experiment = Experiment(root=root)
factory = experiment.get_factory()
params = experiment.get_params()
params['factory_args']['learner_args']['dump_path'] = root
Model = experiment.get_model_class()

core = experiment.get_model_class(source_file='core',
                                  source_class='Core')(**params['core_args'])
params['factory_args']['core'] = core

learner = Ensemble(model_class=Model,
                   trainer_factory=factory,
                   trainer_args=params['factory_args'],
                   n_model=params['n_learner'],
                   callbacks=[
                       cb.EnvironmentEvaluator(
                           env=CartPole(**params['factory_args']['env_args']),
                           n_evaluations=10,
                           action_selector=GreedyValueSelection(
                               post_pipeline=[EnsembleHat()])),
                       cb.EnsembleRewardPlotter()
                   ])

# for param_group in learner.learners[0].optimizer.param_groups:
#     param_group['lr'] = param_group['lr'] * 10

# learner.load_checkpoint(f'{root}/checkpoint')
learner.fit(**params['fit'])
# learner.resume_training(params['n_epochs'], 'cpu', restore_early_stopping=False, verbose=False)