def __call__(self, agent, selection_strategy, memory):
    observation = agent.env.reset().detach()
    episode_reward = 0
    step_counter = 0
    terminate = False
    # per-episode buffer using the same memory cells as the agent's train loader
    episode_memory = Memory(agent.train_loader.memory_cell_names, gamma=memory.gamma)
    with eval_mode(agent):
        while not terminate:
            step_counter += 1
            agent.to(agent.device)
            action, certainty = selection_strategy(agent, observation.to(agent.device))
            new_observation, reward, terminate, _ = agent.env.step(action)
            episode_reward += torch.sum(reward).item() / agent.env.n_instances
            episode_memory.memorize(
                (action, observation, torch.tensor(reward).float(), new_observation,
                 terminate, certainty.detach()),
                ['action', 'state', 'reward', 'new_state', 'terminal', 'uncertainty'])
            # keep only the environment instances that have not terminated yet
            observation = new_observation[~terminate.view(-1)]
            terminate = terminate.min().item()
    memory.memorize(episode_memory, episode_memory.memory_cell_names)
    agent.train_dict['rewards'] = agent.train_dict.get('rewards', []) + [episode_reward]
    if episode_reward > agent.train_dict.get('best_performance', -np.inf):
        agent.train_dict['best_performance'] = episode_reward
    return episode_reward
def play_episode(self):
    observation = self.env.reset().detach()
    episode_reward = 0
    step_counter = 0
    terminate = False
    episode_memory = Memory(['action', 'state', 'reward', 'new_state', 'terminal'],
                            gamma=self.gamma)
    with eval_mode(self):
        while not terminate:
            step_counter += 1
            with torch.no_grad():
                action = self.chose_action(self, observation)
            new_observation, reward, terminate, _ = self.env.step(action)
            episode_reward += reward
            episode_memory.memorize(
                (action, observation, torch.tensor(reward).float(), new_observation, terminate),
                ['action', 'state', 'reward', 'new_state', 'terminal'])
            observation = new_observation
    self.train_loader.memorize(episode_memory, episode_memory.memory_cell_names)
    self.train_dict['rewards'] = self.train_dict.get('rewards', []) + [episode_reward]
    if episode_reward > self.train_dict.get('best_performance', -np.inf):
        self.train_dict['best_performance'] = episode_reward
    return episode_reward
def play_episode(self, render=False):
    """
    Plays a single episode. This might need to be changed when using a non-OpenAI-Gym environment.

    Args:
        render (bool): render the environment

    Returns:
        episode reward
    """
    observation = self.env.reset().detach()
    episode_reward = 0
    step_counter = 0
    terminate = False
    episode_memory_pg = Memory(['log_prob', 'reward', 'state'], gamma=self.gamma)
    episode_memory_q = Memory(['action', 'state', 'reward', 'new_state', 'terminal'],
                              gamma=self.gamma)

    while not terminate:
        step_counter += 1
        action, log_prob = self.chose_action(self, observation)
        new_observation, reward, terminate, _ = self.env.step(action)
        episode_reward += reward
        episode_memory_pg.memorize((log_prob, torch.tensor(reward).float(), observation),
                                   ['log_prob', 'reward', 'state'])
        episode_memory_q.memorize(
            (action, observation, torch.tensor(reward).float(), new_observation, terminate),
            ['action', 'state', 'reward', 'new_state', 'terminal'])
        observation = new_observation

    episode_memory_pg.cumul_reward()
    self.train_loader.memorize(episode_memory_pg, episode_memory_pg.memory_cell_names)
    self.critics.train_loader.memorize(episode_memory_q, episode_memory_q.memory_cell_names)
    self.train_dict['rewards'] = self.train_dict.get('rewards', []) + [episode_reward]
    if episode_reward > self.train_dict.get('best_performance', -np.inf):
        self.train_dict['best_performance'] = episode_reward
    return episode_reward
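# Not from the source: a minimal, self-contained sketch of the discounted reward-to-go that
# `episode_memory_pg.cumul_reward()` is assumed to compute from the stored per-step rewards
# (assumption: the standard recursion G_t = r_t + gamma * G_{t+1}). The function name
# `discounted_returns` is illustrative only.
import torch

def discounted_returns(rewards, gamma=0.95):
    """Reward-to-go for one episode: G_t = r_t + gamma * G_{t+1}."""
    returns = torch.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t].item() + gamma * running
        returns[t] = running
    return returns

discounted_returns(torch.tensor([1., 1., 1.]), gamma=0.9)  # tensor([2.7100, 1.9000, 1.0000])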
def __init__(self,
             critics,
             env,
             model,
             optimizer,
             n_samples,
             batch_size,
             crit=REINFORCELoss(),
             action_selector=sp.PolicyGradientActionSelection(),
             memory=None,
             memory_size=1000,
             gamma=.95,
             grad_clip=None,
             name='',
             callbacks=None,
             dump_path='./tmp',
             device='cpu',
             **kwargs):
    if memory is None and (memory_size is None or n_samples is None or batch_size is None):
        raise ValueError(
            'Learner lacks the memory, it has to be explicitly given, or defined by the params:'
            '`memory_size`, `n_samples`, `batch_size`')
    if memory is not None and (memory_size is not None or n_samples is not None or batch_size is not None):
        raise ValueError(
            'Ambiguous memory specification, either `memory` or `memory_size`, `n_samples`, '
            '`batch_size` have to be provided')
    if memory is None:
        memory = Memory(['log_prob', 'reward', 'state'],
                        memory_size=memory_size,
                        n_samples=n_samples,
                        gamma=gamma,
                        batch_size=batch_size)
    super().__init__(env=env,
                     model=model,
                     optimizer=optimizer,
                     n_samples=None,
                     batch_size=None,
                     crit=crit,
                     action_selector=action_selector,
                     memory=memory,
                     memory_size=None,
                     gamma=gamma,
                     grad_clip=grad_clip,
                     name=name,
                     callbacks=callbacks,
                     dump_path=dump_path,
                     device=device,
                     **kwargs)
    self.critics = critics  # @todo probably create it in here
def play_episode(self):
    state_old = self.env.reset().detach()
    episode_reward = 0
    step_counter = 0
    terminate = False
    episode_memory = Memory(['action', 'state', 'reward', 'new_state', 'new_action', 'terminal'],
                            gamma=self.gamma)
    action_old = None

    while not terminate:
        with eval_mode(self):
            action = self.chose_action(self, state_old)
        state, reward, terminate, _ = self.env.step(action)
        episode_reward += reward
        if step_counter > 0:
            # the previous transition can only be stored once the follow-up action is known
            episode_memory.memorize(
                (action_old, state_old, torch.tensor(reward_old).float(), state, action, False),
                ['action', 'state', 'reward', 'new_state', 'new_action', 'terminal'])
        state_previous = state_old  # keep the state the last action was taken from
        state_old = state
        reward_old = reward
        action_old = action
        step_counter += 1

    # memorize final step, using the state the terminating action was taken from
    # (not `state_old`, which has already been overwritten with the terminal state)
    episode_memory.memorize(
        (action_old, state_previous, torch.tensor(reward_old).float(), state, action, True),
        ['action', 'state', 'reward', 'new_state', 'new_action', 'terminal'])
    self.train_loader.memorize(episode_memory, episode_memory.memory_cell_names)
    self.train_dict['rewards'] = self.train_dict.get('rewards', []) + [episode_reward]
    if episode_reward > self.train_dict.get('best_performance', -np.inf):
        self.train_dict['best_performance'] = episode_reward
    return episode_reward
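# Not from the source: a sketch of the SARSA target that (action, state, reward, new_state,
# new_action, terminal) transitions like the ones collected above are typically used for,
# Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a)); all values are illustrative.
alpha, gamma = 0.1, 0.95
q_sa, q_s_next_a_next, r = 2.0, 1.5, 1.0
q_sa_updated = q_sa + alpha * (r + gamma * q_s_next_a_next - q_sa)  # 2.0 + 0.1 * 0.425 = 2.0425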
def __init__(self,
             env,
             model,
             optimizer,
             n_samples=None,
             batch_size=None,
             crit=REINFORCELoss(),
             action_selector=sp.PolicyGradientActionSelection(),
             memory=None,
             memory_size=None,
             gamma=.95,
             grad_clip=None,
             name='',
             callbacks=None,
             dump_path='./tmp',
             device='cpu',
             *args,
             **kwargs):
    """
    Policy Gradient learner.

    Args:
        env: environment to interact with
        model: neural network
        optimizer: optimizer
        n_samples: number of samples to draw for each update
        batch_size: batch size for updates
        crit: loss function
        action_selector: action selection strategy
        memory: alternatively the memory can be given explicitly instead of `memory_size`/`batch_size`
        memory_size: memory size, storing past memories
        gamma: discount factor for rewards over time
        grad_clip: gradient clipping
        name: name of the agent
        callbacks: list of callbacks to use during training
        dump_path: dump path for the model and the callbacks
        device: device to run the model on
    """
    if memory is None and (memory_size is None or batch_size is None):
        raise ValueError(
            'Learner lacks the memory, it has to be explicitly given, or defined by the params:'
            '`memory_size`, `batch_size`')
    if memory is not None and (memory_size is not None or batch_size is not None):
        raise ValueError(
            'Ambiguous memory specification, either `memory` or `memory_size`, `batch_size` have to '
            'be provided')
    if memory is None:
        memory = Memory(['log_prob', 'reward'],
                        memory_size=memory_size,
                        n_samples=n_samples,
                        gamma=gamma,
                        batch_size=batch_size)
    super().__init__(model=model,
                     optimizer=optimizer,
                     crit=crit,
                     env=env,
                     gamma=gamma,
                     memory=memory,
                     action_selector=action_selector,
                     grad_clip=grad_clip,
                     name=name,
                     callbacks=callbacks,
                     dump_path=dump_path,
                     device=device,
                     *args,
                     **kwargs)
def __init__(self,
             model,
             optimizer,
             crit,
             env,
             action_selector,
             alpha,
             gamma,
             memory_size=None,
             n_samples=None,
             batch_size=None,
             memory=None,
             grad_clip=None,
             name='q_learner',
             callbacks=None,
             dump_path='./tmp',
             device='cpu',
             **kwargs):
    """
    Deep Q-Learning algorithm, as introduced by http://arxiv.org/abs/1312.5602

    Args:
        model: pytorch graph derived from torch.nn.Module
        optimizer: optimizer
        crit: loss function
        env: environment to interact with
        action_selector: policy after which actions are selected; it has to be a stochastic one
            to be used in learning
        alpha: TD-learning rate @todo this might be dropped in a future implementation
        gamma: discount factor for future rewards
        memory_size: size of the replay memory (number of memories to be held)
        n_samples: number of samples to be drawn from the memory each update
        batch_size: batch size when updating
        memory: alternatively the memory can be explicitly specified, instead of by
            (memory_size, n_samples, batch_size)
        grad_clip: gradient clipping
        name: name for the learner
        callbacks: list of callbacks to be called during training
        dump_path: path to the root folder where the model and its callbacks dump their output
        device: device on which the learning is performed
    """
    if memory is None and (memory_size is None or n_samples is None or batch_size is None):
        raise ValueError(
            'Learner lacks the memory, it has to be explicitly given, or defined by the params:'
            '`memory_size`, `n_samples`, `batch_size`')
    if memory is not None and (memory_size is not None or n_samples is not None or batch_size is not None):
        raise ValueError(
            'Ambiguous memory specification, either `memory` or `memory_size`, `n_samples`, '
            '`batch_size` have to be provided')
    if memory is None:
        memory = Memory(['action', 'state', 'reward', 'new_state', 'terminal'],
                        memory_size=memory_size,
                        n_samples=n_samples,
                        gamma=gamma,
                        batch_size=batch_size)
    super().__init__(model=model,
                     optimizer=optimizer,
                     crit=crit,
                     env=env,
                     gamma=gamma,
                     memory=memory,
                     action_selector=action_selector,
                     grad_clip=grad_clip,
                     name=name,
                     callbacks=callbacks,
                     dump_path=dump_path,
                     device=device,
                     **kwargs)
    self.train_dict['train_losses'] = []
    self.alpha = alpha
def get_memory(key, params):
    if key == 'Memory':
        return Memory(**params)
    if key == 'PriorityMemory':
        return PriorityMemory(**params)
    raise ValueError(f'Unknown memory type: {key}')
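# Hypothetical usage of `get_memory` (the parameter values below are illustrative, not taken from
# the source): `params` is unpacked straight into the chosen memory class, so it has to match the
# constructor arguments used elsewhere in this code base.
replay = get_memory('Memory', {
    'memory_cell_names': ['action', 'state', 'reward', 'new_state', 'terminal'],
    'memory_size': 1000,
    'n_samples': 32,
    'batch_size': 64,
    'gamma': 0.95,
})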
def run(root, path_script):
    print(root, path_script)
    experiment = Experiment(root=root)
    factory = experiment.get_factory()
    params = experiment.get_params()
    params['factory_args']['learner_args']['dump_path'] = root
    Model = experiment.get_model_class()
    experiment.document_script(path_script, overwrite=params['overwrite'])

    # env = MultiInstanceGym(**params['factory_args']['env_args'])
    env = TorchGym(params['factory_args']['env_args']['env_name'])
    params['factory_args']['model_args']['in_nodes'] = env.observation_space.shape[0]
    params['factory_args']['model_args']['out_nodes'] = env.action_space.n

    dqn_player = DQNPlayer()
    # selection_strategy = sp.AdaptiveQActionSelectionEntropy(warm_up=0,
    #                                                         post_pipeline=[EnsembleHatStd()])
    # selection_strategy = sp.QActionSelection(post_pipeline=[EnsembleHat()])
    selection_strategy = sp.EpsilonGreedyActionSelection(action_space=[0, 1, 2, 3],
                                                         post_pipeline=[EnsembleHat()])

    with with_experiment(experiment=experiment, overwrite=params['overwrite']):
        memory = Memory(**params['factory_args']['memory_args'])
        params['factory_args']['learner_args']['memory'] = memory
        learner = DQNEnsemble(
            model_class=Model,
            trainer_factory=factory,
            memory=memory,
            env=env,
            player=dqn_player,
            selection_strategy=selection_strategy,
            trainer_args=params['factory_args'],
            n_model=params['n_learner'],
            callbacks=[
                rcb.EpisodeUpdater(**params.get('memory_update', {})),
                cb.Checkpointer(frequency=1),
                # rcb.UncertaintyUpdater(),
                rcb.EnvironmentEvaluator(
                    env=TorchGym(params['factory_args']['env_args']['env_name']),
                    n_evaluations=10,
                    action_selector=sp.GreedyValueSelection(post_pipeline=[EnsembleHat()]),
                    metrics={'det_val_reward_mean': np.mean, 'det_val_reward_std': np.std},
                    frequency=1,
                    epoch_name='det_val_epoch'),
                rcb.EnvironmentEvaluator(
                    env=TorchGym(params['factory_args']['env_args']['env_name']),
                    n_evaluations=10,
                    action_selector=selection_strategy,
                    metrics={'prob_val_reward_mean': np.mean, 'prob_val_reward_std': np.std},
                    frequency=1,
                    epoch_name='prob_val_epoch'),
                rcb.EnsembleRewardPlotter(metrics={
                    'det_val_reward_mean': 'det_val_epoch',
                    'prob_val_reward_mean': 'prob_val_epoch',
                }),
            ])
        # learner.load_checkpoint(path=f'{root}/checkpoint', tag='checkpoint')
        learner.fit(**params['fit'])
import torch
from pymatch.ReinforcementLearning.memory import Memory

memory = Memory(
    memory_cell_names=["test1", "test2"],
    n_samples=5,
    memory_size=10,
    batch_size=2,
)
for i in range(10):
    memory.memorize([torch.tensor([[10 * i + 1]]), torch.tensor([[10 * i + 2]])],
                    ["test1", "test2"])
print(memory.memory)
memory[3]

# Scratch expressions (apparently from the Q-learner's update step): the soft TD target
# (1 - alpha) * Q_old + alpha * (r + gamma * max_a' Q(s', a')), with the bootstrap term masked
# out for terminal transitions in the first variant. They reference learner attributes
# (`self.alpha`, `self.gamma`, ...) and are not runnable on their own.
(1 - self.alpha) * target[mask] + self.alpha * (
    reward + self.gamma * max_next *
    (1 - terminal.view(-1).type(torch.FloatTensor)).to(self.device))

test = (1 - self.alpha) * target[mask] + self.alpha * (reward + self.gamma * max_next)
test.shape
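# Self-contained version of the scratch expressions above, with illustrative values standing in
# for the learner's attributes (alpha, gamma, and the tensors are placeholders, not from the source):
# td_target = (1 - alpha) * Q_old + alpha * (r + gamma * max_a' Q(s', a')), with the bootstrap
# term zeroed for terminal transitions.
alpha, gamma = 0.1, 0.95
q_old = torch.tensor([1.0, 2.0, 3.0])      # current Q-estimates for the taken actions
reward = torch.tensor([0.0, 1.0, -1.0])    # observed rewards
max_next = torch.tensor([1.5, 0.5, 2.0])   # max_a' Q(s', a') of the next states
terminal = torch.tensor([0.0, 0.0, 1.0])   # 1.0 marks the end of an episode
td_target = (1 - alpha) * q_old + alpha * (reward + gamma * max_next * (1 - terminal))
td_target.shape  # torch.Size([3])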