Exemple #1
0
    largs = dict(learner_args)
    largs['name'] = f"{learner_args['name']}_{name}"
    learner = rl.PolicyGradient(env=env,
                                model=model,
                                optimizer=optim,
                                memory_updater=memory_updater,
                                crit=crit,
                                action_selector=sp.PolicyGradientActionSelection(),
                                callbacks=[],
                                **largs
                                )
    return learner


# Experiment root directory; also reused below as the learners' dump path.
root = 'tests/experiment'

# Load the experiment parameters and complete the factory arguments with
# values that are only known at run time.
experiment = Experiment(root=root)
params = experiment.get_params()
params['factory_args']['learner_args']['dump_path'] = root
# The core is built once from `core_args` and stored in the factory arguments,
# which are passed to the ensemble as `trainer_args` below.
params['factory_args']['core'] = Core(**params['core_args'])

# Build and fit the ensemble inside the experiment context manager.
# `factory` and `Model` are expected to be defined earlier in this script.
with with_experiment(experiment=experiment, overwrite=params['overwrite']):
    learner = Ensemble(model_class=Model,
                       trainer_factory=factory,
                       trainer_args=params['factory_args'],
                       n_model=params['n_learner'],
                       callbacks=[])
    learner.fit(**params['fit'])
    print('done training')
    # raise NotImplemented
Exemple #2
0
def run(root, path_script):
    """Build and fit a ``DQNEnsemble`` from the experiment stored in ``root``.

    The experiment parameters are loaded and completed with run-time values
    (dump path, model input/output sizes taken from the environment, the
    environment itself and the replay memory) before the ensemble is created
    and trained with ``params['fit']``.

    Args:
        root: Experiment root; passed to ``Experiment`` and reused as dump
            path for both the learners and the ensemble.
        path_script: Script path forwarded to ``experiment.document_script``.
    """
    experiment = Experiment(root=root)
    trainer_factory = experiment.get_factory()
    params = experiment.get_params()

    # Work on the nested dicts through aliases; they are the same objects that
    # are later handed to the ensemble as `trainer_args`.
    trainer_args = params['factory_args']
    learner_args = trainer_args['learner_args']
    learner_args['dump_path'] = root

    model_class = experiment.get_model_class()
    overwrite = params['overwrite']
    experiment.document_script(path_script, overwrite=overwrite)

    # The network dimensions are derived from the environment's spaces.
    env = MultiInstanceGym(**params['env_args'])
    model_args = trainer_args['model_args']
    model_args['in_nodes'] = env.observation_space.shape[0]
    model_args['out_nodes'] = env.action_space.n
    trainer_args['env'] = env

    player = get_player(key=params['player_type'])
    train_policy = get_selection_strategy(params['selection_strategy'],
                                          params.get('selection_args', {}))

    # Metric name -> aggregation function for the two evaluation callbacks.
    # NOTE(review): the std key uses the prefix 'deter_' while the mean uses
    # 'det_'; kept unchanged because the keys are runtime strings.
    greedy_metrics = {
        'det_val_reward_mean': np.mean,
        'deter_val_reward_std': np.std,
    }
    policy_metrics = {
        'prob_val_reward_mean': np.mean,
        'prob_val_reward_std': np.std,
    }
    # Metric name -> epoch name, as passed to the reward plotter.
    plotted_metrics = {
        'det_val_reward_mean': 'det_val_epoch',
        'prob_val_reward_mean': 'prob_val_epoch',
    }

    with with_experiment(experiment=experiment, overwrite=overwrite):
        memory = get_memory(params['memory_type'], params['memory_args'])
        learner_args['memory'] = memory

        callbacks = [
            rcb.EpisodeUpdater(**params.get('memory_update', {}),
                               frequency=5),
            rcb.UncertaintyUpdater(hat=hat.EntropyHat()),
            cb.Checkpointer(frequency=10),
            # Evaluation with greedy action selection on the ensemble output.
            rcb.EnvironmentEvaluator(
                env=env,
                n_evaluations=10,
                action_selector=sp.GreedyValueSelection(
                    post_pipeline=[hat.EnsembleHat()]),
                metrics=greedy_metrics,
                frequency=10,
                epoch_name='det_val_epoch'),
            # Evaluation with the configured evaluation selection strategy.
            rcb.EnvironmentEvaluator(
                env=env,
                n_evaluations=10,
                action_selector=get_selection_strategy(
                    params['eval_selection_strategy'],
                    params.get('selection_args', {})),
                metrics=policy_metrics,
                frequency=10,
                epoch_name='prob_val_epoch'),
            rcb.EnsembleRewardPlotter(frequency=10, metrics=plotted_metrics),
        ]

        ensemble = DQNEnsemble(model_class=model_class,
                               trainer_factory=trainer_factory,
                               memory=memory,
                               env=env,
                               player=player,
                               selection_strategy=train_policy,
                               trainer_args=trainer_args,
                               n_model=params['n_learner'],
                               dump_path=root,
                               callbacks=callbacks)
        ensemble.fit(**params['fit'])
Exemple #3
0
def run(root, path_script):
    """Build and fit a ``DQNEnsemble`` on a ``TorchGym`` environment.

    Loads the experiment stored in ``root``, fills in the run-time factory
    arguments (dump path, model input/output sizes, replay memory) and trains
    the ensemble with ``params['fit']``.

    Args:
        root: Experiment root; passed to ``Experiment`` and reused as the
            learners' dump path.
        path_script: Script path forwarded to ``experiment.document_script``.
    """
    print(root, path_script)
    experiment = Experiment(root=root)
    trainer_factory = experiment.get_factory()
    params = experiment.get_params()

    # Aliases into the nested parameter dicts; the same objects are passed to
    # the ensemble as `trainer_args` further down.
    trainer_args = params['factory_args']
    learner_args = trainer_args['learner_args']
    learner_args['dump_path'] = root

    model_class = experiment.get_model_class()
    overwrite = params['overwrite']
    experiment.document_script(path_script, overwrite=overwrite)

    # A single-instance environment is used here; the multi-instance variant
    # would be MultiInstanceGym(**params['factory_args']['env_args']).
    env_name = trainer_args['env_args']['env_name']
    env = TorchGym(env_name)
    model_args = trainer_args['model_args']
    model_args['in_nodes'] = env.observation_space.shape[0]
    model_args['out_nodes'] = env.action_space.n

    player = DQNPlayer()
    # Previously tried alternatives:
    #   sp.AdaptiveQActionSelectionEntropy(warm_up=0,
    #                                      post_pipeline=[EnsembleHatStd()])
    #   sp.QActionSelection(post_pipeline=[EnsembleHat()])
    # NOTE(review): the action space is hard-coded to four actions while
    # `out_nodes` comes from env.action_space.n - confirm they agree.
    train_policy = sp.EpsilonGreedyActionSelection(
        action_space=[0, 1, 2, 3],
        post_pipeline=[EnsembleHat()],
    )

    # Metric name -> aggregation function for the two evaluation callbacks.
    # NOTE(review): the std key uses the prefix 'deter_' while the mean uses
    # 'det_'; kept unchanged because the keys are runtime strings.
    greedy_metrics = {
        'det_val_reward_mean': np.mean,
        'deter_val_reward_std': np.std,
    }
    policy_metrics = {
        'prob_val_reward_mean': np.mean,
        'prob_val_reward_std': np.std,
    }
    # Metric name -> epoch name, as passed to the reward plotter.
    plotted_metrics = {
        'det_val_reward_mean': 'det_val_epoch',
        'prob_val_reward_mean': 'prob_val_epoch',
    }

    with with_experiment(experiment=experiment, overwrite=overwrite):
        memory = Memory(**trainer_args['memory_args'])
        learner_args['memory'] = memory

        callbacks = [
            rcb.EpisodeUpdater(**params.get('memory_update', {})),
            cb.Checkpointer(frequency=1),
            # rcb.UncertaintyUpdater() is intentionally left out here.
            # Each evaluator gets its own freshly created environment.
            rcb.EnvironmentEvaluator(
                env=TorchGym(env_name),
                n_evaluations=10,
                action_selector=sp.GreedyValueSelection(
                    post_pipeline=[EnsembleHat()]),
                metrics=greedy_metrics,
                frequency=1,
                epoch_name='det_val_epoch'),
            # This evaluator shares the training selection strategy object.
            rcb.EnvironmentEvaluator(
                env=TorchGym(env_name),
                n_evaluations=10,
                action_selector=train_policy,
                metrics=policy_metrics,
                frequency=1,
                epoch_name='prob_val_epoch'),
            rcb.EnsembleRewardPlotter(metrics=plotted_metrics),
        ]

        ensemble = DQNEnsemble(model_class=model_class,
                               trainer_factory=trainer_factory,
                               memory=memory,
                               env=env,
                               player=player,
                               selection_strategy=train_policy,
                               trainer_args=trainer_args,
                               n_model=params['n_learner'],
                               callbacks=callbacks)

        # To resume from a checkpoint instead of starting fresh:
        # ensemble.load_checkpoint(path=f'{root}/checkpoint', tag='checkpoint')
        ensemble.fit(**params['fit'])
Exemple #4
0
from scipy.stats import entropy
import torch

from pymatch.ReinforcementLearning.memory import PriorityMemory
from pymatch.DeepLearning.hat import EnsembleHatStd, EnsembleHat
import pymatch.DeepLearning.callback as cb
import pymatch.ReinforcementLearning.callback as rcb
import pymatch.ReinforcementLearning.selection_policy as sp
from pymatch.ReinforcementLearning.torch_gym import TorchGym
from pymatch.utils.experiment import Experiment, with_experiment
from pymatch.utils.functional import interactive_python_mode, one_hot_encoding
from pymatch.DeepLearning.ensemble import DQNEnsemble
from pymatch.ReinforcementLearning.player import DQNPlayerCertainty

# Experiment root directory; also reused below as the learners' dump path.
root = 'tests/test_experiment/exp_1'
experiment = Experiment(root=root)
factory = experiment.get_factory()
params = experiment.get_params()
params['factory_args']['learner_args']['dump_path'] = root

# Create the environment and derive the model's input/output sizes from it.
Model = experiment.get_model_class()
env = TorchGym(**params['factory_args']['env_args'])
# NOTE(review): the observation space is read from the wrapped `env.env`,
# while the action space is read from `env` directly - confirm both
# attributes exist on TorchGym.
params['factory_args']['model_args'][
    'in_nodes'] = env.env.observation_space.shape[0]
params['factory_args']['model_args']['out_nodes'] = env.action_space.n

# Player and action-selection strategy; the temperature comes from the
# experiment parameters, the minimum length is fixed to 10.
dqn_player = DQNPlayerCertainty()
selection_strategy = sp.AdaptiveQActionSelection(
    temperature=params['temp'],
    min_length=10,
    post_pipeline=[EnsembleHatStd()])
Exemple #5
0
import pymatch.ReinforcementLearning.callback as cb
from pymatch.ReinforcementLearning.selection_policy import GreedyValueSelection
from pymatch.ReinforcementLearning.torch_gym import TorchGym, CartPole
from pymatch.utils.experiment import Experiment
from pymatch.utils.functional import interactive_python_mode
from pymatch.DeepLearning.ensemble import Ensemble


# Pick the script path and experiment root: fixed defaults when
# `interactive_python_mode()` is true, otherwise taken from the command line
# (argv[0] is the script path, argv[1] the experiment root).
# NOTE(review): `sys` is not among the imports visible in this excerpt -
# confirm it is imported. `sys.argv[1]` raises IndexError if no root is given.
# NOTE(review): `path_scipt` looks like a misspelling of `path_script`; the
# name is kept because it may be referenced elsewhere in the file.
if interactive_python_mode():
    path_scipt = 'tests/boosting/train_boosting.py'
    root = 'tests/boosting/exp'
else:
    path_scipt = sys.argv[0]
    root = sys.argv[1]

# Load the experiment and complete the factory arguments with the dump path.
experiment = Experiment(root=root)
factory = experiment.get_factory()
params = experiment.get_params()
params['factory_args']['learner_args']['dump_path'] = root
Model = experiment.get_model_class()

# Fetch the `Core` class from the experiment's `core` source file, instantiate
# it with `core_args` and store the instance in the factory arguments.
core = experiment.get_model_class(source_file='core', source_class='Core')(**params['core_args'])
params['factory_args']['core'] = core

learner = Ensemble(model_class=Model,
                   trainer_factory=factory,
                   trainer_args=params['factory_args'],
                   n_model=params['n_learner'],
                   callbacks=[cb.EnvironmentEvaluator(
                       env=CartPole(**params['factory_args']['env_args']),
                       n_evaluations=10,