Example No. 1
    def __call__(self, agent, selection_strategy, memory):
        observation = agent.env.reset().detach()
        episode_reward = 0
        step_counter = 0
        terminate = False

        # episode_memory = Memory(['action', 'state', 'reward', 'new_state', 'terminal', 'uncertainty'],
        episode_memory = Memory(agent.train_loader.memory_cell_names,
                                gamma=memory.gamma)
        with eval_mode(agent):
            while not terminate:
                step_counter += 1
                agent.to(agent.device)
                action, certainty = selection_strategy(
                    agent, observation.to(agent.device))
                new_observation, reward, terminate, _ = agent.env.step(action)

                # reward averaged over all parallel environment instances
                episode_reward += torch.sum(
                    reward).item() / agent.env.n_instances
                episode_memory.memorize(
                    (action, observation, torch.tensor(reward).float(),
                     new_observation, terminate, certainty.detach()), [
                         'action', 'state', 'reward', 'new_state', 'terminal',
                         'uncertainty'
                     ])
                # drop observations of instances that have already terminated
                observation = new_observation[~terminate.view(-1)]
                # exit the loop only once every instance is done
                terminate = terminate.min().item()
        memory.memorize(episode_memory, episode_memory.memory_cell_names)
        agent.train_dict['rewards'] = agent.train_dict.get(
            'rewards', []) + [episode_reward]

        if episode_reward > agent.train_dict.get('best_performance', -np.inf):
            agent.train_dict['best_performance'] = episode_reward

        return episode_reward
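
For context, a tiny standalone sketch of the masking step at the end of the loop above: terminate is assumed to be a boolean tensor with one flag per parallel environment instance, observations of finished instances are dropped, and the episode only ends once every instance is done. Names and shapes here are illustrative, not part of pymatch's API.

import torch

new_observation = torch.randn(4, 8)                    # 4 parallel instances, 8 features each
terminate = torch.tensor([False, True, False, False])  # per-instance done flags

# keep only the observations of instances that are still running
observation = new_observation[~terminate.view(-1)]     # shape: (3, 8)

# the loop above exits only once *all* instances are done;
# terminate.min().item() is equivalent to .all() for boolean flags
all_done = terminate.all().item()
print(observation.shape, all_done)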
Example No. 2
    def play_episode(self):
        observation = self.env.reset().detach()
        episode_reward = 0
        step_counter = 0
        terminate = False
        episode_memory = Memory(
            ['action', 'state', 'reward', 'new_state', 'terminal'],
            gamma=self.gamma)
        with eval_mode(self):
            while not terminate:
                step_counter += 1
                with torch.no_grad():
                    action = self.chose_action(self, observation)
                new_observation, reward, terminate, _ = self.env.step(action)

                episode_reward += reward
                episode_memory.memorize(
                    (action, observation, torch.tensor(reward).float(),
                     new_observation, terminate),
                    ['action', 'state', 'reward', 'new_state', 'terminal'])
                observation = new_observation

        self.train_loader.memorize(episode_memory,
                                   episode_memory.memory_cell_names)
        self.train_dict['rewards'] = self.train_dict.get(
            'rewards', []) + [episode_reward]

        if episode_reward > self.train_dict.get('best_performance', -np.inf):
            self.train_dict['best_performance'] = episode_reward

        return episode_reward
Example No. 3
    def play_episode(self, render=False):
        """
        Plays a single episode.
        This might need to be changed when using a non-OpenAI Gym environment.

        Args:
            render (bool): render environment

        Returns:
            episode reward
        """
        observation = self.env.reset().detach()
        episode_reward = 0
        step_counter = 0
        terminate = False
        episode_memory_pg = Memory(['log_prob', 'reward', 'state'],
                                   gamma=self.gamma)
        episode_memory_q = Memory(
            ['action', 'state', 'reward', 'new_state', 'terminal'],
            gamma=self.gamma)

        while not terminate:
            step_counter += 1
            action, log_prob = self.chose_action(self, observation)
            new_observation, reward, terminate, _ = self.env.step(action)

            episode_reward += reward
            episode_memory_pg.memorize(
                (log_prob, torch.tensor(reward).float(), observation),
                ['log_prob', 'reward', 'state'])
            episode_memory_q.memorize(
                (action, observation, torch.tensor(reward).float(),
                 new_observation, terminate),
                ['action', 'state', 'reward', 'new_state', 'terminal'])
            observation = new_observation

        episode_memory_pg.cumul_reward()
        self.train_loader.memorize(episode_memory_pg,
                                   episode_memory_pg.memory_cell_names)
        self.critics.train_loader.memorize(episode_memory_q,
                                           episode_memory_q.memory_cell_names)
        self.train_dict['rewards'] = self.train_dict.get(
            'rewards', []) + [episode_reward]

        if episode_reward > self.train_dict.get('best_performance', -np.inf):
            self.train_dict['best_performance'] = episode_reward

        return episode_reward
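
The example above calls episode_memory_pg.cumul_reward() before handing the policy-gradient memory to its train loader. Below is a minimal sketch of what such a discounted cumulative reward (return) computation typically looks like; it is a standalone illustration, not pymatch's implementation.

import torch

def discounted_returns(rewards, gamma=0.95):
    """Turn per-step rewards into discounted returns G_t = r_t + gamma * G_{t+1}."""
    returns = []
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.insert(0, g)
    return torch.tensor(returns)

print(discounted_returns([1.0, 0.0, 2.0], gamma=0.9))  # tensor([2.6200, 1.8000, 2.0000])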
Example No. 4
 def __init__(self,
              critics,
              env,
              model,
              optimizer,
              n_samples,
              batch_size,
              crit=REINFORCELoss(),
              action_selector=sp.PolicyGradientActionSelection(),
              memory=None,
              memory_size=1000,
              gamma=.95,
              grad_clip=None,
              name='',
              callbacks=None,
              dump_path='./tmp',
              device='cpu',
              **kwargs):
     if memory is None and (memory_size is None or n_samples is None
                            or batch_size is None):
         raise ValueError(
             'The learner lacks a memory: either pass `memory` explicitly or define it via the params '
             '`memory_size`, `n_samples` and `batch_size`.')
     if memory is not None and (memory_size is not None or n_samples
                                is not None or batch_size is not None):
         raise ValueError(
             'Ambiguous memory specification: pass either `memory` or the params `memory_size`, '
             '`n_samples` and `batch_size`, not both.')
     if memory is None:
         memory = Memory(['log_prob', 'reward', 'state'],
                         memory_size=memory_size,
                         n_samples=n_samples,
                         gamma=gamma,
                         batch_size=batch_size)
     super().__init__(env=env,
                      model=model,
                      optimizer=optimizer,
                      n_samples=None,
                      batch_size=None,
                      crit=crit,
                      action_selector=action_selector,
                      memory=memory,
                      memory_size=None,
                      gamma=gamma,
                      grad_clip=grad_clip,
                      name=name,
                      callbacks=callbacks,
                      dump_path=dump_path,
                      device=device,
                      **kwargs)
     self.critics = critics  # @todo probably create it in here
Example No. 5
    def play_episode(self):
        state_old = self.env.reset().detach()
        episode_reward = 0
        step_counter = 0
        terminate = False
        episode_memory = Memory([
            'action', 'state', 'reward', 'new_state', 'new_action', 'terminal'
        ],
                                gamma=self.gamma)

        action_old = None

        while not terminate:
            with eval_mode(self):
                action = self.chose_action(self, state_old)
            state, reward, terminate, _ = self.env.step(action)

            episode_reward += reward
            # transitions are stored with a one-step delay so the follow-up action is known (SARSA-style)
            if step_counter > 0:
                episode_memory.memorize(
                    (action_old, state_old, torch.tensor(reward_old).float(),
                     state, action, False), [
                         'action', 'state', 'reward', 'new_state',
                         'new_action', 'terminal'
                     ])
            state_old = state
            reward_old = reward
            action_old = action
            step_counter += 1

        # memorize final step
        episode_memory.memorize(
            (action_old, state_old, torch.tensor(reward_old).float(), state,
             action, True), [
                 'action', 'state', 'reward', 'new_state', 'new_action',
                 'terminal'
             ])

        self.train_loader.memorize(episode_memory,
                                   episode_memory.memory_cell_names)
        self.train_dict['rewards'] = self.train_dict.get(
            'rewards', []) + [episode_reward]

        if episode_reward > self.train_dict.get('best_performance', -np.inf):
            self.train_dict['best_performance'] = episode_reward

        return episode_reward
Example No. 6
    def __init__(self,
                 env,
                 model,
                 optimizer,
                 n_samples=None,
                 batch_size=None,
                 crit=REINFORCELoss(),
                 action_selector=sp.PolicyGradientActionSelection(),
                 memory=None,
                 memory_size=None,
                 gamma=.95,
                 grad_clip=None,
                 name='',
                 callbacks=None,
                 dump_path='./tmp',
                 device='cpu',
                 *args,
                 **kwargs):
        """
        Policy Gradient learner.

        Args:
            env:                environment to interact with
            model:              neural network
            optimizer:          optimizer
            memory_updater:     memory updater, also implementing the update policy
            n_samples:          number of samples to be drawn for each update
            batch_size:         batch size for updates
            crit:               loss function
            action_selector:    action selection strategy
            memory_size:        memory size, storing passed memories
            gamma:              discount factor for rewards over time
            grad_clip:          gradient clipping
            name:               name of the agent
            callbacks:          list of callbacks to use during training
            dump_path:          dump path for the model and the callbacks
            device:             device to run the model on
        """
        if memory is None and (memory_size is None or batch_size is None):
            raise ValueError(
                'The learner lacks a memory: either pass `memory` explicitly or define it via the params '
                '`memory_size` and `batch_size`.')
        if memory is not None and (memory_size is not None
                                   or batch_size is not None):
            raise ValueError(
                'Ambiguous memory specification: pass either `memory` or the params `memory_size` and '
                '`batch_size`, not both.')
        if memory is None:
            memory = Memory(['log_prob', 'reward'],
                            memory_size=memory_size,
                            n_samples=n_samples,
                            gamma=gamma,
                            batch_size=batch_size)
        super().__init__(model=model,
                         optimizer=optimizer,
                         crit=crit,
                         env=env,
                         gamma=gamma,
                         memory=memory,
                         action_selector=action_selector,
                         grad_clip=grad_clip,
                         name=name,
                         callbacks=callbacks,
                         dump_path=dump_path,
                         device=device,
                         *args,
                         **kwargs)
Example No. 7
 def __init__(self,
              model,
              optimizer,
              crit,
              env,
              action_selector,
              alpha,
              gamma,
              memory_size=None,
              n_samples=None,
              batch_size=None,
              memory=None,
              grad_clip=None,
              name='q_learner',
              callbacks=[],
              dump_path='./tmp',
              device='cpu',
              **kwargs):
     """
     Deep Q-Learning algorithm, as introduced by http://arxiv.org/abs/1312.5602
     Args:
         model:              pytorch graph derived from torch.nn.Module
         optimizer:          optimizer
         crit:               loss function
         env:                environment to interact with
         memory_updater:     object that iteratively updates the memory
         action_selector:    policy by which actions are selected; it has to be a stochastic one to be
                             used for learning
         alpha:              TD-learning rate @todo this might be dropped in a future implementation
         gamma:              discount factor for future rewards
         memory_size:        size of the replay memory (number of memories to be held)
         n_samples:          number of samples to be drawn from the memory each update
         batch_size:         batch size when updating
         memory:             alternatively the memory can be specified explicitly, instead of by (memory_size,
                             n_samples, batch_size)
         grad_clip:          gradient clipping
         name:               name for the learner
         callbacks:          list of callbacks to be called during training
         dump_path:          path to the root folder where the model and its callbacks dump their data
         device:             device on which the learning has to be performed
     """
     if memory is None and (memory_size is None or n_samples is None
                            or batch_size is None):
         raise ValueError(
             'The learner lacks a memory: either pass `memory` explicitly or define it via the params '
             '`memory_size`, `n_samples` and `batch_size`.')
     if memory is not None and (memory_size is not None or n_samples
                                is not None or batch_size is not None):
         raise ValueError(
             'Ambiguous memory specification: pass either `memory` or the params `memory_size`, '
             '`n_samples` and `batch_size`, not both.')
     if memory is None:
         memory = Memory(
             ['action', 'state', 'reward', 'new_state', 'terminal'],
             memory_size=memory_size,
             n_samples=n_samples,
             gamma=gamma,
             batch_size=batch_size)
     super().__init__(model=model,
                      optimizer=optimizer,
                      crit=crit,
                      env=env,
                      gamma=gamma,
                      memory=memory,
                      action_selector=action_selector,
                      grad_clip=grad_clip,
                      name=name,
                      callbacks=callbacks,
                      dump_path=dump_path,
                      device=device,
                      **kwargs)
     self.train_dict['train_losses'] = []
     self.alpha = alpha
Example No. 8
def get_memory(key, params):
    if key == 'Memory':
        return Memory(**params)
    if key == 'PriorityMemory':
        return PriorityMemory(**params)
    raise ValueError('Unknown memory type')
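
A short usage sketch for the factory above; the parameter names mirror the Memory constructor used in Example No. 10, and the concrete values are placeholders.

params = {
    'memory_cell_names': ['action', 'state', 'reward', 'new_state', 'terminal'],
    'memory_size': 1000,
    'n_samples': 32,
    'batch_size': 8,
}
memory = get_memory('Memory', params)

# unknown keys fail loudly
try:
    get_memory('ReplayBuffer', params)
except ValueError as e:
    print(e)  # Unknown memory type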
Example No. 9
def run(root, path_script):
    print(root, path_script)
    experiment = Experiment(root=root)
    factory = experiment.get_factory()
    params = experiment.get_params()
    params['factory_args']['learner_args']['dump_path'] = root

    Model = experiment.get_model_class()
    experiment.document_script(path_script, overwrite=params['overwrite'])
    # env = MultiInstanceGym(**params['factory_args']['env_args'])
    env = TorchGym(params['factory_args']['env_args']['env_name'])
    params['factory_args']['model_args'][
        'in_nodes'] = env.observation_space.shape[0]
    params['factory_args']['model_args']['out_nodes'] = env.action_space.n

    dqn_player = DQNPlayer()
    # selection_strategy = sp.AdaptiveQActionSelectionEntropy(warm_up=0,
    #                                                         post_pipeline=[EnsembleHatStd()])
    # selection_strategy = sp.QActionSelection(post_pipeline=[EnsembleHat()])
    selection_strategy = sp.EpsilonGreedyActionSelection(
        action_space=[0, 1, 2, 3], post_pipeline=[EnsembleHat()])

    with with_experiment(experiment=experiment, overwrite=params['overwrite']):
        memory = Memory(**params["factory_args"]['memory_args'])
        params['factory_args']['learner_args']['memory'] = memory

        learner = DQNEnsemble(
            model_class=Model,
            trainer_factory=factory,
            memory=memory,
            env=env,
            player=dqn_player,
            selection_strategy=selection_strategy,
            trainer_args=params['factory_args'],
            n_model=params['n_learner'],
            callbacks=[
                rcb.EpisodeUpdater(**params.get('memory_update', {})),
                cb.Checkpointer(frequency=1),
                # rcb.UncertaintyUpdater(),
                rcb.EnvironmentEvaluator(
                    env=TorchGym(
                        params['factory_args']['env_args']['env_name']),
                    n_evaluations=10,
                    action_selector=sp.GreedyValueSelection(
                        post_pipeline=[EnsembleHat()]),
                    metrics={
                        'det_val_reward_mean': np.mean,
                        'det_val_reward_std': np.std
                    },
                    frequency=1,
                    epoch_name='det_val_epoch'),
                rcb.EnvironmentEvaluator(env=TorchGym(
                    params['factory_args']['env_args']['env_name']),
                                         n_evaluations=10,
                                         action_selector=selection_strategy,
                                         metrics={
                                             'prob_val_reward_mean': np.mean,
                                             'prob_val_reward_std': np.std
                                         },
                                         frequency=1,
                                         epoch_name='prob_val_epoch'),
                rcb.EnsembleRewardPlotter(
                    metrics={
                        'det_val_reward_mean': 'det_val_epoch',
                        'prob_val_reward_mean': 'prob_val_epoch',
                    }),
            ])

        # learner.load_checkpoint(path=f'{root}/checkpoint', tag='checkpoint')
        learner.fit(**params['fit'])
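
For orientation, a rough sketch of the parameter structure that experiment.get_params() is expected to return for the script above. Only the keys actually read by the script are shown; every value is a placeholder.

params = {
    'overwrite': True,
    'n_learner': 5,
    'memory_update': {},                      # kwargs for rcb.EpisodeUpdater
    'fit': {},                                # kwargs for learner.fit
    'factory_args': {
        'env_args': {'env_name': 'CartPole-v1'},
        'model_args': {},                     # 'in_nodes' / 'out_nodes' are filled in by the script
        'memory_args': {'memory_cell_names': ['action', 'state', 'reward', 'new_state', 'terminal'],
                        'memory_size': 1000, 'n_samples': 32, 'batch_size': 8},
        'learner_args': {},                   # 'dump_path' and 'memory' are filled in by the script
    },
}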
Example No. 10
import torch
from pymatch.ReinforcementLearning.memory import Memory

# memory with two named cells, keeping at most 10 entries;
# 5 samples are drawn per update, in batches of 2
memory = Memory(
    memory_cell_names=["test1", "test2"],
    n_samples=5,
    memory_size=10,
    batch_size=2,
)

# fill both cells with 10 pairs of dummy tensors
for i in range(10):
    memory.memorize([torch.tensor([[10*i + 1]]), torch.tensor([[10*i + 2]])], ["test1", "test2"])

print(memory.memory)

# index into the stored memories
memory[3]


# soft TD(0) target for Q-learning: blend the current Q-value estimate with the
# bootstrapped target; the bootstrap term is zeroed for terminal transitions
(1 - self.alpha) * target[mask] + self.alpha * (
    reward + self.gamma * max_next * (1 - terminal.view(-1).type(torch.FloatTensor)).to(self.device))

# the same blend without the terminal mask, kept as a quick shape check
test = (1 - self.alpha) * target[mask] + self.alpha * (
    reward + self.gamma * max_next)
test.shape
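
The two expressions above come from a soft TD(0) Q-target; here is a minimal, self-contained version with dummy tensors standing in for target, reward, max_next and terminal, and placeholder values for alpha and gamma.

import torch

alpha, gamma = 0.1, 0.95                       # TD-learning rate and discount factor
target = torch.tensor([1.0, 2.0, 3.0])         # current Q-value estimates for the taken actions
reward = torch.tensor([0.5, 1.0, 0.0])         # observed rewards
max_next = torch.tensor([2.0, 0.5, 1.5])       # max_a Q(new_state, a)
terminal = torch.tensor([0.0, 0.0, 1.0])       # terminal flags as floats

# bootstrap only for non-terminal transitions, then blend with the old estimate
bootstrap = reward + gamma * max_next * (1 - terminal)
soft_target = (1 - alpha) * target + alpha * bootstrap
print(soft_target)  # tensor([1.1400, 1.9475, 2.7000])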