def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
    """Runs the agent on the given environment for `nb_episodes` episodes in test mode.

    `self.training` is set to `False` and `self.step` is reset to 0 before the first episode.

    # Arguments
        env: Environment to evaluate on. This method calls `env.reset()`, `env.step(action)`
            and, when no `start_step_policy` is given, `env.action_space.sample()`.
        nb_episodes (integer): Number of episodes to run.
        action_repetition (integer): Number of times each selected action is sent to the
            environment before the agent selects a new one. Must be >= 1.
        callbacks (list): Additional callbacks. The list is copied, not modified.
        visualize (boolean): If `True`, a `Visualizer` callback is attached.
        nb_max_episode_steps (integer): If set, an episode is forced to end once it has
            run this many steps.
        nb_max_start_steps (integer): Exclusive upper bound for the number of warm-up steps
            taken at the start of each episode; the count is drawn with
            `np.random.randint(nb_max_start_steps)`. `0` disables warm-up steps.
        start_step_policy (`lambda observation: action`): Policy used for the warm-up
            steps. If `None`, actions are sampled from `env.action_space`.
        verbose (integer): If >= 1, a `TestLogger` callback is attached.

    # Returns
        The `History` callback instance that was attached for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError('Your tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    # Assemble the callbacks: user supplied ones first, then the built-in ones.
    builtin_hooks = []
    if verbose >= 1:
        builtin_hooks.append(TestLogger())
    if visualize:
        builtin_hooks.append(Visualizer())
    history = History()
    builtin_hooks.append(history)
    hooks = CallbackList((callbacks[:] if callbacks else []) + builtin_hooks)

    # Depending on the callback implementation the setters are public or private.
    if not hasattr(hooks, 'set_model'):
        hooks._set_model(self)
    else:
        hooks.set_model(self)
    hooks._set_env(env)
    run_params = {'nb_episodes': nb_episodes}
    if not hasattr(hooks, 'set_params'):
        hooks._set_params(run_params)
    else:
        hooks.set_params(run_params)

    self._on_test_begin()
    # The train begin/end hooks are also the ones used to bracket a test run.
    hooks.on_train_begin()
    for episode_index in range(nb_episodes):
        hooks.on_episode_begin(episode_index)
        total_reward = 0.
        steps_taken = 0

        # Reset agent and environment to obtain the first observation.
        self.reset_states()
        current_obs = deepcopy(env.reset())
        if self.processor is not None:
            current_obs = self.processor.process_observation(current_obs)
        assert current_obs is not None

        # Warm-up steps: executed at the start of the episode but never shown to the
        # agent, so that the start position varies slightly between episodes.
        if nb_max_start_steps == 0:
            warmup_steps = 0
        else:
            warmup_steps = np.random.randint(nb_max_start_steps)
        for _ in range(warmup_steps):
            if start_step_policy is not None:
                warmup_action = start_step_policy(current_obs)
            else:
                warmup_action = env.action_space.sample()
            if self.processor is not None:
                warmup_action = self.processor.process_action(warmup_action)
            hooks.on_action_begin(warmup_action)
            current_obs, step_reward, ended, step_info = env.step(warmup_action)
            current_obs = deepcopy(current_obs)
            if self.processor is not None:
                current_obs, step_reward, ended, step_info = self.processor.process_step(
                    current_obs, step_reward, ended, step_info)
            hooks.on_action_end(warmup_action)
            if ended:
                warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(warmup_steps))
                current_obs = deepcopy(env.reset())
                if self.processor is not None:
                    current_obs = self.processor.process_observation(current_obs)
                break

        # Play the episode until the environment (or the step limit) ends it.
        episode_over = False
        while not episode_over:
            hooks.on_step_begin(steps_taken)

            chosen_action = self.forward(current_obs)
            if self.processor is not None:
                chosen_action = self.processor.process_action(chosen_action)
            step_total = 0.
            info_sums = {}
            for _ in range(action_repetition):
                hooks.on_action_begin(chosen_action)
                current_obs, step_reward, step_done, step_info = env.step(chosen_action)
                current_obs = deepcopy(current_obs)
                if self.processor is not None:
                    current_obs, step_reward, step_done, step_info = self.processor.process_step(
                        current_obs, step_reward, step_done, step_info)
                hooks.on_action_end(chosen_action)
                step_total += step_reward
                # Sum up the real-valued info entries over the repeated action.
                for info_key, info_value in step_info.items():
                    if np.isreal(info_value):
                        if info_key not in info_sums:
                            info_sums[info_key] = np.zeros_like(info_value)
                        info_sums[info_key] += info_value
                if step_done:
                    episode_over = True
                    break
            if nb_max_episode_steps and steps_taken >= nb_max_episode_steps - 1:
                episode_over = True
            self.backward(step_total, terminal=episode_over)
            total_reward += step_total

            hooks.on_step_end(steps_taken, {
                'action': chosen_action,
                'observation': current_obs,
                'reward': step_total,
                'episode': episode_index,
                'info': info_sums,
            })
            steps_taken += 1
            self.step += 1

        # The agent has not yet seen the terminal state. Run one more forward/backward
        # pass and ignore the resulting action. `terminal=False` is passed because the
        # *next* state, i.e. the state of the freshly reset environment, is
        # non-terminal by convention.
        self.forward(current_obs)
        self.backward(0., terminal=False)

        # Report the finished episode.
        hooks.on_episode_end(episode_index, {
            'episode_reward': total_reward,
            'nb_steps': steps_taken,
        })
    hooks.on_train_end()
    self._on_test_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None):
    """Trains the agent on the given environment for `nb_steps` steps.

    # Arguments
        env: Environment the agent interacts with. This method calls `env.reset()`,
            `env.step(action)` and, when no `start_step_policy` is given,
            `env.action_space.sample()`.
        nb_steps (integer): Number of training steps to perform.
        action_repetition (integer): Number of times each selected action is sent to the
            environment before the agent observes again. Must be >= 1.
        callbacks (list): Additional callbacks to apply during training. The list is
            copied, not modified.
        verbose (integer): 0 for no logging, 1 for interval logging (see `log_interval`),
            greater than 1 for episode logging.
        visualize (boolean): If `True`, a `Visualizer` callback is attached.
        nb_max_start_steps (integer): Exclusive upper bound for the number of warm-up steps
            taken at the start of each episode; the count is drawn with
            `np.random.randint(nb_max_start_steps)`. `0` disables warm-up steps.
        start_step_policy (`lambda observation: action`): Policy used for the warm-up
            steps. If `None`, actions are sampled from `env.action_space`.
        log_interval (integer): Interval passed to `TrainIntervalLogger` when `verbose` is 1.
        nb_max_episode_steps (integer): If set, an episode is forced to be terminal once it
            has run this many steps. With `None` an episode runs until the environment
            signals a terminal state.

    # Returns
        The `History` callback instance that was attached for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError('Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    # Assemble the callbacks: user supplied ones first, then the built-in ones.
    builtin_hooks = []
    if verbose == 1:
        builtin_hooks.append(TrainIntervalLogger(interval=log_interval))
    elif verbose > 1:
        builtin_hooks.append(TrainEpisodeLogger())
    if visualize:
        builtin_hooks.append(Visualizer())
    history = History()
    builtin_hooks.append(history)
    hooks = CallbackList((callbacks[:] if callbacks else []) + builtin_hooks)

    # Depending on the callback implementation the setters are public or private.
    if not hasattr(hooks, 'set_model'):
        hooks._set_model(self)
    else:
        hooks.set_model(self)
    hooks._set_env(env)
    run_params = {'nb_steps': nb_steps}
    if not hasattr(hooks, 'set_params'):
        hooks._set_params(run_params)
    else:
        hooks.set_params(run_params)
    self._on_train_begin()
    hooks.on_train_begin()

    episode_index = 0
    self.step = 0
    # `current_obs is None` marks "no episode in progress".
    current_obs = None
    total_reward = None
    steps_taken = None
    aborted = False
    try:
        while self.step < nb_steps:
            if current_obs is None:
                # A new episode starts here.
                hooks.on_episode_begin(episode_index)
                steps_taken = 0
                total_reward = 0.

                # Reset agent and environment to obtain the first observation.
                self.reset_states()
                current_obs = deepcopy(env.reset())
                if self.processor is not None:
                    current_obs = self.processor.process_observation(current_obs)
                assert current_obs is not None

                # Warm-up steps: executed at the start of the episode but never recorded
                # as experience, so that the start position varies between episodes.
                if nb_max_start_steps == 0:
                    warmup_steps = 0
                else:
                    warmup_steps = np.random.randint(nb_max_start_steps)
                for _ in range(warmup_steps):
                    if start_step_policy is not None:
                        warmup_action = start_step_policy(current_obs)
                    else:
                        warmup_action = env.action_space.sample()
                    if self.processor is not None:
                        warmup_action = self.processor.process_action(warmup_action)
                    hooks.on_action_begin(warmup_action)
                    current_obs, step_reward, ended, step_info = env.step(warmup_action)
                    current_obs = deepcopy(current_obs)
                    if self.processor is not None:
                        current_obs, step_reward, ended, step_info = self.processor.process_step(
                            current_obs, step_reward, ended, step_info)
                    hooks.on_action_end(warmup_action)
                    if ended:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(warmup_steps))
                        current_obs = deepcopy(env.reset())
                        if self.processor is not None:
                            current_obs = self.processor.process_observation(current_obs)
                        break

            # Everything must be initialized by now.
            assert total_reward is not None
            assert steps_taken is not None
            assert current_obs is not None

            # One training step: select an action (forward), act, then learn from the
            # collected reward (backward).
            hooks.on_step_begin(steps_taken)
            chosen_action = self.forward(current_obs)
            if self.processor is not None:
                chosen_action = self.processor.process_action(chosen_action)
            step_total = 0.
            info_sums = {}
            terminal = False
            for _ in range(action_repetition):
                hooks.on_action_begin(chosen_action)
                current_obs, step_reward, terminal, step_info = env.step(chosen_action)
                current_obs = deepcopy(current_obs)
                if self.processor is not None:
                    current_obs, step_reward, terminal, step_info = self.processor.process_step(
                        current_obs, step_reward, terminal, step_info)
                # Sum up the real-valued info entries over the repeated action.
                for info_key, info_value in step_info.items():
                    if np.isreal(info_value):
                        if info_key not in info_sums:
                            info_sums[info_key] = np.zeros_like(info_value)
                        info_sums[info_key] += info_value
                hooks.on_action_end(chosen_action)
                step_total += step_reward
                if terminal:
                    break
            if nb_max_episode_steps and steps_taken >= nb_max_episode_steps - 1:
                # Step limit reached: force a terminal state.
                terminal = True
            metrics = self.backward(step_total, terminal=terminal)
            total_reward += step_total

            hooks.on_step_end(steps_taken, {
                'action': chosen_action,
                'observation': current_obs,
                'reward': step_total,
                'metrics': metrics,
                'episode': episode_index,
                'info': info_sums,
            })
            steps_taken += 1
            self.step += 1

            if terminal:
                # The agent has not yet seen the terminal state. Run one more
                # forward/backward pass and ignore the resulting action.
                # `terminal=False` is passed because the *next* state, i.e. the state
                # of the freshly reset environment, is non-terminal by convention.
                self.forward(current_obs)
                self.backward(0., terminal=False)

                # Report the finished episode and clear the per-episode state.
                hooks.on_episode_end(episode_index, {
                    'episode_reward': total_reward,
                    'nb_episode_steps': steps_taken,
                    'nb_steps': self.step,
                })

                episode_index += 1
                current_obs = None
                steps_taken = None
                total_reward = None
    except KeyboardInterrupt:
        # Keyboard interrupts are caught so that training can be aborted safely and
        # `on_train_end` is still called below.
        aborted = True
    hooks.on_train_end(logs={'did_abort': aborted})
    self._on_train_end()

    return history
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None,
         starting_checkpoints=None, verbose=1):
    """Runs the agent on the given environment for `nb_episodes` episodes in test mode.

    `self.training` is set to `False` and `self.step` is reset to 0 before the first episode.

    # Arguments
        env: Environment to evaluate on. This method calls `env.reset()` (with a
            `checkpoint` keyword when `starting_checkpoints` is given), `env.step(action)`
            and, when no `start_step_policy` is given, `env.action_space.sample()`.
        nb_episodes (integer): Number of episodes to run.
        action_repetition (integer): Number of times each selected action is sent to the
            environment before the agent selects a new one. Must be >= 1.
        callbacks (list): Additional callbacks. The list is copied, not modified.
        visualize (boolean): If `True`, a `Visualizer` callback is attached.
        nb_max_episode_steps (integer): If set, an episode is forced to end once it has
            run this many steps.
        nb_max_start_steps (integer): Exclusive upper bound for the number of warm-up steps
            taken at the start of each episode; the count is drawn with
            `np.random.randint(nb_max_start_steps)`. `0` disables warm-up steps.
        start_step_policy (`lambda observation: action`): Policy used for the warm-up
            steps. If `None`, actions are sampled from `env.action_space`.
        starting_checkpoints (list): Optional checkpoint names. When non-empty, one is
            picked with `np.random.choice` per episode and the environment is reset with
            `checkpoint='checkpoints/<name>'`. `None` or an empty list means a plain reset.
        verbose (integer): If >= 1, a `TestLogger` callback is attached.

    # Returns
        The `History` callback instance that was attached for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
        KeyError: If the `info` dict of an episode's last step has no `'global_score'` entry.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(
            action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]

    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    # Depending on the callback implementation the setters are public or private.
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    # The train begin/end hooks are also the ones used to bracket a test run.
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        if starting_checkpoints:
            checkpoint = np.random.choice(starting_checkpoints)
            observation = deepcopy(
                env.reset(checkpoint='checkpoints/{}'.format(checkpoint)))
        else:
            observation = deepcopy(env.reset())
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Perform random starts at beginning of episode and do not record them into the
        # experience. This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
            nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(
                    observation, r, done, info)
            callbacks.on_action_end(action)
            if done:
                warnings.warn(
                    'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                    .format(nb_random_start_steps))
                # NOTE(review): this reset does not pass the selected checkpoint, so the
                # episode restarts from the default state - confirm this is intended.
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, d, info = self.processor.process_step(
                        observation, r, d, info)
                callbacks.on_action_end(action)
                reward += r
                # Sum up the real-valued info entries over the repeated action.
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                if d:
                    done = True
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
                # Marker printed when an episode is cut off by the step limit. Written as
                # a call so that the module also parses on Python 3.
                print("-----------------------------------")
            self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # We are in a terminal state but the agent hasn't yet seen it. We therefore
        # perform one more forward-backward call and simply ignore the action before
        # resetting the environment. We need to pass in `terminal=False` here since
        # the *next* state, that is the state of the newly reset environment, is
        # always non-terminal by convention.
        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode. `info` is the info dict of the episode's last step.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
            'global_score': info["global_score"]
        }
        if self.evaluating_states:
            # The computation is delegated to the agent.
            episode_logs['avarage_q'] = self.compute_avarage_q(
                self.evaluating_states)
        if starting_checkpoints:
            episode_logs['checkpoint'] = checkpoint
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
class RemoteAdfp(object):
    """Step-wise driver around an agent: the caller supplies observations and gets actions.

    The instance keeps the step/episode counters and the running episode reward, and
    forwards the corresponding events to a `CallbackList`.
    """

    def __init__(self, agent: 'ADFPAgent', training_steps, log_interval, folder_path,
                 callbacks=None, mode='train', processor=None):
        """Sets up callbacks, the checkpoint directory and the bookkeeping counters.

        # Arguments
            agent: Agent whose `forward`, `backward`, `save` and `goal` are used.
            training_steps (integer): Number of steps after which `update` saves the
                agent and exits the process.
            log_interval (integer): Interval passed to `TrainIntervalLogger`.
            folder_path (string): Directory handed to `agent.save`; a `checkpoints`
                sub-directory is created inside it.
            callbacks (list): Additional callbacks. The list is copied, not modified.
            mode (string): Stored on the instance as `self.mode`.
            processor: Optional object providing `process_observation` and
                `process_measurement`.
        """
        # Prepare callbacks
        callbacks = [] if not callbacks else callbacks[:]
        callbacks += [TrainIntervalLogger(interval=log_interval)]
        self.callbacks = CallbackList(callbacks)
        # Depending on the callback implementation the setters are public or private.
        if hasattr(self.callbacks, 'set_model'):
            self.callbacks.set_model(agent)
        else:
            self.callbacks._set_model(agent)
        params = {
            'nb_steps': training_steps,
        }
        if hasattr(self.callbacks, 'set_params'):
            self.callbacks.set_params(params)
        else:
            self.callbacks._set_params(params)
        self.callbacks.on_train_begin()
        self.no_training_steps = training_steps

        # Create the needed directories. `exist_ok=True` avoids the race between an
        # existence check and the creation.
        self.folder_path = folder_path
        checkpoint_path = os.path.join(folder_path, 'checkpoints')
        os.makedirs(checkpoint_path, exist_ok=True)

        # Parameters
        self.agent = agent
        self.episode_step = 0
        self.episode = 0
        self.episode_reward = 0
        self.step = 0
        self.recent_action = None
        self.recent_observation = None
        self.mode = mode
        self.processor = processor

    def train_move(self, raw_observation, measurement, goal_params, done):
        """Performs one training step and returns the next action.

        # Arguments
            raw_observation: Observation, passed through the processor when one is set.
            measurement: Measurement, passed through the processor when one is set.
            goal_params: Goal parameters, or a list of such lists, in which case the last
                element is used for computing the reward.
            done (boolean): Whether the current episode has ended.

        # Returns
            The action returned by `agent.forward`, or `None` when `done` is truthy.
        """
        self.update()
        self.callbacks.on_step_begin(self.episode_step)

        if self.processor is not None:
            raw_observation = self.processor.process_observation(
                observation=raw_observation)
            measurement = self.processor.process_measurement(measurement)
        # If we have a list of goal_params just take one element for evaluation.
        eval_goal_params = goal_params if not isinstance(
            goal_params[0], list) else goal_params[-1]
        reward = self.agent.goal.immediate_reward_function(
            measurement, eval_goal_params)
        # The backward pass is skipped on the very first step, where no previous
        # action/observation has been recorded yet.
        if self.step > 1:
            metrics = self.agent.backward(measurements=measurement,
                                          terminal=done)
            step_logs = {
                'action': self.recent_action,
                'observation': self.recent_observation,
                'reward': reward,
                'metrics': metrics,
                'episode': self.episode,
                'info': {},
            }
            self.episode_reward += reward
            self.callbacks.on_step_end(self.episode_step, step_logs)

        if done:
            # Report the finished episode and reset the per-episode counters.
            # `update` has already incremented `episode_step`, so it is at least 1 here.
            episode_logs = {
                'episode_reward': self.episode_reward,
                'nb_episode_steps': self.episode_step,
                'reward_per_step': self.episode_reward / self.episode_step
            }
            self.callbacks.on_episode_end(self.episode, episode_logs)
            self.episode += 1
            self.episode_step = 0
            self.episode_reward = 0.
            return

        # Perform the next step.
        action = self.agent.forward(observation=Observation(
            raw_features=raw_observation, measurements=measurement),
                                    goal_params=goal_params)
        # Remember observation and action for the next backward pass.
        self.recent_observation = raw_observation
        self.recent_action = action
        return action

    def test_move(self, raw_observation, measurement, goal_params):
        """Returns the agent's action for the given inputs without any bookkeeping."""
        if self.processor is not None:
            raw_observation = self.processor.process_observation(
                observation=raw_observation)
            measurement = self.processor.process_measurement(measurement)
        return self.agent.forward(Observation(raw_features=raw_observation,
                                              measurements=measurement),
                                  goal_params=goal_params)

    def save(self):
        """Saves the agent to `self.folder_path`."""
        self.agent.save(self.folder_path)

    def update(self):
        """Advances the step counters; saves and exits the process once training is over."""
        if self.episode_step == 0:
            self.callbacks.on_episode_begin(self.episode)
        # Is training ended yet?
        if self.step >= self.no_training_steps:
            self.save()
            # We are done here.
            self.callbacks.on_train_end()
            sys.exit(0)
        self.episode_step += 1
        self.step += 1
def fit(self, env, env_1, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        file_interval=200, nb_max_episode_steps=None, save_data_path='temp.json',
        dynamic_actor_exploration=False, update_exploration_interval=5000):
    """Trains the agent on two environments that are stepped in lockstep.

    # Arguments
        env: First environment. Its actions, rewards and infos are the ones reported to
            the callbacks, and its terminal flag ends an episode.
        env_1: Second environment. It is stepped with the second action returned by
            `self.forward` and its transitions are passed to `self.backward_actor`.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times each selected action is sent to its
            environment before the agent observes again. Must be >= 1.
        callbacks (list): Additional callbacks to apply during training. The list is
            copied, not modified.
        verbose (integer): 0 for no logging, 1 for interval logging (compare
            `log_interval`), greater than 1 for episode logging.
        visualize (boolean): If `True`, a `Visualizer` callback is attached.
        nb_max_start_steps (integer): Exclusive upper bound for the number of warm-up steps
            taken at the start of each episode; the count is drawn with
            `np.random.randint(nb_max_start_steps)`. `0` disables warm-up steps.
        start_step_policy (`lambda observation: action`): Policy used for the warm-up
            steps. If `None`, actions are sampled from each environment's `action_space`.
        log_interval (integer): Interval passed to `TrainIntervalLogger` when `verbose` is 1.
        file_interval (integer): Interval passed to `OjasFileLogger`.
        nb_max_episode_steps (integer): If set, both environments are forced into a
            terminal state once an episode has run this many steps.
        save_data_path (string): Path passed to `OjasFileLogger`.
        dynamic_actor_exploration (boolean): If truthy, `self.update_actor_exploration()`
            is called periodically (see `update_exploration_interval`).
        update_exploration_interval (integer): Number of steps between two calls of
            `self.update_actor_exploration()`.

    # Returns
        The `History` callback instance that was attached for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError('Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.episode_goal = None  # cleared at the start of training
    self.training = True

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    callbacks += [OjasFileLogger(save_data_path, interval=file_interval)]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    # Depending on the callback implementation the setters are public or private.
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    observation1 = None
    observation2 = None
    episode_reward1 = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation1 is None or observation2 is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward1 = 0.

                # Obtain the initial observations by resetting both environments.
                self.reset_states()
                observation1 = deepcopy(env.reset())
                observation2 = deepcopy(env_1.reset())
                if self.actor_processor is not None:
                    observation1 = self.actor_processor.process_observation(observation1)
                    observation2 = self.actor_processor.process_observation(observation2)
                assert observation1 is not None
                assert observation2 is not None

                # Perform random starts at beginning of episode and do not record them
                # into the experience. This slightly changes the start position between
                # games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action1 = env.action_space.sample()
                        action2 = env_1.action_space.sample()
                    else:
                        action1 = start_step_policy(observation1)
                        action2 = start_step_policy(observation2)
                    if self.actor_processor is not None:
                        action1 = self.actor_processor.process_action(action1)
                        action2 = self.actor_processor.process_action(action2)
                    callbacks.on_action_begin(action1)
                    observation1, reward1, done1, info1 = env.step(action1)
                    observation2, reward2, done2, info2 = env_1.step(action2)
                    observation1 = deepcopy(observation1)
                    observation2 = deepcopy(observation2)
                    if self.actor_processor is not None:
                        observation1, reward1, done1, info1 = self.actor_processor.process_step(observation1, reward1, done1, info1)
                        observation2, reward2, done2, info2 = self.actor_processor.process_step(observation2, reward2, done2, info2)
                    callbacks.on_action_end(action1)
                    # Reset every environment that ended during the warm-up. Both flags
                    # are checked before leaving the loop so that neither environment is
                    # left in a terminal state when both end on the same step.
                    if done1:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observation1 = deepcopy(env.reset())
                        if self.actor_processor is not None:
                            observation1 = self.actor_processor.process_observation(observation1)
                    if done2:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observation2 = deepcopy(env_1.reset())
                        if self.actor_processor is not None:
                            observation2 = self.actor_processor.process_observation(observation2)
                    if done1 or done2:
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward1 is not None
            assert episode_step is not None
            assert observation1 is not None
            assert observation2 is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the
            # actions (forward step) and then use the rewards to improve (backward step).
            action1, action2 = self.forward(observation1, observation2)
            if self.actor_processor is not None:
                action1 = self.actor_processor.process_action(action1)
                action2 = self.actor_processor.process_action(action2)
            reward1 = 0.
            reward2 = 0.
            accumulated_info = {}
            done1 = False
            done2 = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action1)
                observation1, r1, done1, info1 = env.step(action1)
                observation1 = deepcopy(observation1)
                if self.actor_processor is not None:
                    observation1, r1, done1, info1 = self.actor_processor.process_step(observation1, r1, done1, info1)
                # Sum up the real-valued info entries of `env` over the repeated action.
                for key, value in info1.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action1)
                reward1 += r1
                if done1:
                    break
            for _ in range(action_repetition):
                observation2, r2, done2, info2 = env_1.step(action2)
                observation2 = deepcopy(observation2)
                if self.actor_processor is not None:
                    observation2, r2, done2, info2 = self.actor_processor.process_step(observation2, r2, done2, info2)
                reward2 += r2
                if done2:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state (both environments take every step in parallel).
                done1 = True
                done2 = True
            self.backward_actor(reward1, observation1, info1, reward2, observation2, info2, env,
                                terminal1=done1, terminal2=done2)
            metrics = self.backward_learner()
            episode_reward1 += reward1

            step_logs = {
                'action': action1,
                'observation': observation1,
                'reward': reward1,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)  # stores the current step info

            # Refresh the actor exploration once every `update_exploration_interval`
            # steps (never on step 0).
            if (dynamic_actor_exploration and self.step > 0
                    and self.step % update_exploration_interval == 0):
                self.update_actor_exploration()

            episode_step += 1
            self.step += 1

            if done1:
                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward1,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation1 = None
                episode_step = None
                episode_reward1 = None
            if done2:
                # NOTE(review): when only `env_1` ends, the next iteration starts a new
                # episode (both environments are reset and `on_episode_begin` is called
                # again) without `on_episode_end` having been reported - confirm that
                # this is intended.
                observation2 = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be be safely aborted.
        # This is so common that we've built this right into this function, which ensures
        # that the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None):
    """Trains the agent on the given environment.

    # Arguments
        env: Environment that the agent interacts with. This method calls `env.reset()`,
            `env.step(action)` and, when no `start_step_policy` is given,
            `env.action_space.sample()`.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times the agent repeats the same action
            without observing the environment again. Setting this to a value > 1 can be
            useful if a single action only has a very small effect on the environment.
        callbacks (list): Additional callbacks to apply during training. The list is
            copied, not modified.
        verbose (integer): 0 for no logging, 1 for interval logging (compare
            `log_interval`), greater than 1 for episode logging.
        visualize (boolean): If `True`, a `Visualizer` callback is attached.
        nb_max_start_steps (integer): Exclusive upper bound for the number of warm-up steps
            taken at the start of each episode; the count is drawn with
            `np.random.randint(nb_max_start_steps)`. `0` disables warm-up steps.
        start_step_policy (`lambda observation: action`): The policy to follow for the
            warm-up steps. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered
            to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs
            before the episode is forced to be terminal. Set to `None` if each episode
            should run (potentially indefinitely) until the environment signals a terminal
            state.

    # Returns
        The `History` callback instance that was attached for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(
            action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    # Depending on the callback implementation the setters are public or private.
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    # Counters are plain Python ints: a fixed-width `np.int16` counter can wrap around at
    # 32767, after which `self.step < nb_steps` would stay true forever.
    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                # Accumulate in double precision rather than `np.float32`.
                episode_reward = 0.

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                assert observation is not None

                # Perform random starts at beginning of episode and do not record them
                # into the experience. This slightly changes the start position between
                # games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                    nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(
                            observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn(
                            'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                            .format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(
                                observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the
            # action (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)
                # Sum up the real-valued info entries over the repeated action.
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We
                # therefore perform one more forward-backward call and simply ignore the
                # action before resetting the environment. We need to pass in
                # `terminal=False` here since the *next* state, that is the state of the
                # newly reset environment, is always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be be safely aborted.
        # This is so common that we've built this right into this function, which ensures
        # that the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def fit(self,
        env,
        nb_steps,
        action_repetition=1,
        callbacks=None,
        verbose=1,
        visualize=False,
        nb_max_start_steps=0,
        start_step_policy=None,
        log_interval=10000,
        nb_max_episode_steps=None,
        version=None,
        custom_env=False):
    """Trains the agent on the given environment and saves per-episode results to an `.xls` workbook.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
        visualize (boolean): If `True`, the environment is visualized during training. However,
            this is likely going to slow down training significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
            of each episode using `start_step_policy`. Notice that this is an upper limit since
            the exact number of steps to be performed is drawn with
            `np.random.randint(nb_max_start_steps)`, i.e. from `[0, nb_max_start_steps)`,
            at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy
            to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
            automatically resetting the environment. Set to `None` if each episode should run
            (potentially indefinitely) until the environment signals a terminal state.
        version: Tag inserted into the result file name, `../results/result_v<version>.xls`.
            `None` produces `result_v.xls`.
        custom_env (boolean): If `True`, the accumulated step reward is indexed: element 0 is
            passed to `backward` and logged as the reward, element 1 is logged as `throughput`.
            If `False`, the reward is used as a single value.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire training process.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(
            action_repetition))

    self.training = True

    # Work on a copy so the caller's list is never extended.
    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    # Both spellings are probed because the callback container may expose
    # either the public or the underscore-prefixed setter.
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    # Plain Python ints: a 16-bit counter wraps around after 32767, which
    # would keep `self.step < nb_steps` true forever for larger budgets.
    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False

    # Workbook that receives one row per finished episode.
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('DQN')
    # The 'step' sheet is created (and therefore saved) but never written to.
    workbook.add_sheet('step')
    try:
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = np.float32(0)

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                assert observation is not None

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                    nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(
                            observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn(
                            'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                            .format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(
                                observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)
                # Sum the real-valued info entries over the repeated actions.
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True

            # With a custom env only the first reward component drives learning.
            learning_reward = reward[0] if custom_env else reward
            metrics = self.backward(learning_reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': learning_reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            if custom_env:
                step_logs['throughput'] = reward[1]
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                if custom_env:
                    episode_logs = {
                        'episode_reward': episode_reward[0],
                        'throughput': episode_reward[1],
                    }
                else:
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                print("Episode Number: ", episode)
                print("Episode Rewards: ", episode_reward)
                print(history.history.keys())
                callbacks.on_episode_end(episode, episode_logs)

                # One workbook row per episode; row 0 is left empty.
                sheet.write(episode + 1, 0, str(episode))
                if custom_env:
                    sheet.write(episode + 1, 1, str(episode_reward[0]))
                    sheet.write(episode + 1, 2, str(episode_reward[1]))
                else:
                    # A scalar reward cannot be indexed; store it as-is.
                    sheet.write(episode + 1, 1, str(episode_reward))

                episode += 1
                observation = None
                episode_step = None
                # Kept from the original code, which cleared this attribute
                # at the end of every episode.
                self.episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    # `version` defaults to None, which cannot be concatenated to a string.
    version_tag = '' if version is None else str(version)
    file_name = 'result_v' + version_tag + '.xls'
    workbook.save('../results/' + file_name)

    return history
def fit(self,
        env,
        nb_steps,
        action_repetition=1,
        callbacks=None,
        verbose=1,
        visualize=False,
        nb_max_start_steps=0,
        start_step_policy=None,
        log_interval=10000,
        nb_max_episode_steps=None):
    """Train the agent on `env` until `self.step` reaches `nb_steps`.

    # Arguments
        env: Environment providing `reset()`, `step(action)` and `action_space.sample()`.
        nb_steps (integer): Number of training steps to perform.
        action_repetition (integer): How many times each chosen action is applied
            before the agent acts again. Must be >= 1.
        callbacks (list): Extra callbacks; the list is copied, not modified.
        verbose (integer): 1 adds a `TrainIntervalLogger`, values above 1 add a
            `TrainEpisodeLogger`, anything else adds no logger.
        visualize (boolean): If true, a `Visualizer` callback is added.
        nb_max_start_steps (integer): Exclusive upper bound for the number of
            warm-up steps taken at the start of each episode (0 disables them).
        start_step_policy (`lambda observation: action`): Policy for warm-up steps;
            `None` samples from `env.action_space`.
        log_interval (integer): Interval handed to `TrainIntervalLogger`.
        nb_max_episode_steps (integer): If set, an episode is forced terminal once
            it reaches this many steps.

    # Returns
        The `History` callback registered for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(
            action_repetition))

    self.training = True

    # Assemble the callback chain on a private copy of the caller's list.
    if callbacks:
        callbacks = callbacks[:]
    else:
        callbacks = []
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    recorder = History()
    callbacks += [recorder]
    dispatcher = CallbackList(callbacks)

    if hasattr(dispatcher, 'set_model'):
        attach_model = dispatcher.set_model
    else:
        attach_model = dispatcher._set_model
    attach_model(self)
    dispatcher._set_env(env)

    run_params = {'nb_steps': nb_steps}
    if hasattr(dispatcher, 'set_params'):
        attach_params = dispatcher.set_params
    else:
        attach_params = dispatcher._set_params
    attach_params(run_params)

    self._on_train_begin()
    dispatcher.on_train_begin()

    episode_idx = 0
    self.step = 0
    # `current_obs is None` marks "no episode in progress".
    current_obs = None
    ep_return = None
    ep_step = None
    aborted = False
    try:
        while self.step < nb_steps:
            if current_obs is None:
                # A fresh episode: notify callbacks and reset all state.
                dispatcher.on_episode_begin(episode_idx)
                ep_step = 0
                ep_return = 0.

                self.reset_states()
                current_obs = deepcopy(env.reset())
                if self.processor is not None:
                    current_obs = self.processor.process_observation(
                        current_obs)
                assert current_obs is not None

                # Warm-up steps perturb the start state; they are not passed
                # to `backward`.
                if nb_max_start_steps == 0:
                    warmup_steps = 0
                else:
                    warmup_steps = np.random.randint(nb_max_start_steps)
                for _ in range(warmup_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(current_obs)
                    dispatcher.on_action_begin(action)
                    current_obs, warm_reward, warm_done, warm_info = env.step(
                        action)
                    current_obs = deepcopy(current_obs)
                    if self.processor is not None:
                        (current_obs, warm_reward, warm_done,
                         warm_info) = self.processor.process_step(
                             current_obs, warm_reward, warm_done, warm_info)
                    dispatcher.on_action_end(action)
                    if warm_done:
                        warnings.warn(
                            'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                            .format(warmup_steps))
                        current_obs = deepcopy(env.reset())
                        if self.processor is not None:
                            current_obs = self.processor.process_observation(
                                current_obs)
                        break

            # Everything must be initialised before a step is taken.
            assert ep_return is not None
            assert ep_step is not None
            assert current_obs is not None

            dispatcher.on_step_begin(ep_step)

            # Forward pass selects the action; the backward pass further down
            # learns from the reward gathered while repeating it.
            action = self.forward(current_obs)
            step_reward = 0.
            info_totals = {}
            done = False
            for _ in range(action_repetition):
                dispatcher.on_action_begin(action)
                current_obs, partial_reward, done, step_info = env.step(action)
                current_obs = deepcopy(current_obs)
                if self.processor is not None:
                    (current_obs, partial_reward, done,
                     step_info) = self.processor.process_step(
                         current_obs, partial_reward, done, step_info)
                for name, entry in step_info.items():
                    if np.isreal(entry):
                        if name not in info_totals:
                            info_totals[name] = np.zeros_like(entry)
                        info_totals[name] += entry
                dispatcher.on_action_end(action)
                step_reward += partial_reward
                if done:
                    break

            if nb_max_episode_steps and ep_step >= nb_max_episode_steps - 1:
                # Step budget for this episode is used up: force termination.
                done = True

            train_metrics = self.backward(step_reward, terminal=done)
            ep_return += step_reward

            dispatcher.on_step_end(ep_step, {
                'action': action,
                'observation': current_obs,
                'reward': step_reward,
                'metrics': train_metrics,
                'episode': episode_idx,
                'info': info_totals,
            })
            ep_step += 1
            self.step += 1

            if done:
                # The agent has not seen the terminal observation yet, so run
                # one more forward/backward pair and discard the action.
                # `terminal=False` because the state that follows (the reset
                # environment) is non-terminal by convention.
                self.forward(current_obs)
                self.backward(0., terminal=False)

                dispatcher.on_episode_end(episode_idx, {
                    'episode_reward': ep_return,
                    'nb_episode_steps': ep_step,
                    'nb_steps': self.step,
                })

                episode_idx += 1
                current_obs = None
                ep_step = None
                ep_return = None
    except KeyboardInterrupt:
        # Ctrl-C ends training early; the end-of-training callbacks below
        # still run.
        aborted = True

    dispatcher.on_train_end(logs={'did_abort': aborted})
    self._on_train_end()

    return recorder
def test(self,
         env,
         nb_episodes=1,
         action_repetition=1,
         callbacks=None,
         visualize=True,
         nb_max_episode_steps=None,
         nb_max_start_steps=0,
         start_step_policy=None,
         verbose=1):
    """Runs the agent on the given environment for a number of episodes.

    The observation the agent acts on at every step is appended as one
    comma-separated line to `~/cartpole_results/<YYYYmmddHHMMSS>.csv`. The file
    name is chosen when an episode starts, so episodes that start within the
    same second share one file.

    # Arguments
        env: Environment providing `reset()`, `step(action)` and `action_space.sample()`.
        nb_episodes (integer): Number of episodes to run.
        action_repetition (integer): Number of times each chosen action is applied
            before the agent acts again. Must be >= 1.
        callbacks (list): Extra callbacks; the list is copied, not modified.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_episode_steps (integer): If set, an episode is forced to end once it
            reaches this many steps.
        nb_max_start_steps (integer): Exclusive upper bound for the number of random
            start steps performed at the beginning of each episode (0 disables them).
        start_step_policy (`lambda observation: action`): Policy used for the start
            steps. If `None`, actions are sampled from `env.action_space`.
        verbose (integer): Values >= 1 add a `TestLogger` callback.

    # Returns
        The `History` callback registered for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(
            action_repetition))

    # Create the trace directory up front; opening a file for appending does
    # not create missing parent directories.
    results_dir = os.path.expanduser('~') + '/cartpole_results'
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    self.training = False
    self.step = 0

    # Work on a copy so the caller's list is never extended.
    callbacks = [] if not callbacks else callbacks[:]

    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    # The callback interface only offers train begin/end events, which are
    # reused here for testing.
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset())
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
            nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(
                    observation, r, done, info)
            callbacks.on_action_end(action)
            if done:
                warnings.warn(
                    'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                    .format(nb_random_start_steps))
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                break

        # Run the episode until we're done.
        done = False
        newfile = results_dir + '/' + datetime.datetime.now().strftime(
            '%Y%m%d%H%M%S') + '.csv'
        # One handle per episode instead of reopening the file on every step.
        with open(newfile, 'a') as trace_file:
            while not done:
                callbacks.on_step_begin(episode_step)

                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}

                # Trace the observation the action above was computed from,
                # as the list's string form without the brackets.
                obs = str(list(observation)).replace('[', '').replace(']', '')
                trace_file.write(obs + '\n')

                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, d, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, d, info = self.processor.process_step(
                            observation, r, d, info)
                    callbacks.on_action_end(action)
                    reward += r
                    # Sum the real-valued info entries over the repeated actions.
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    if d:
                        done = True
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

        # We are in a terminal state but the agent hasn't yet seen it. We therefore
        # perform one more forward-backward call and simply ignore the action before
        # resetting the environment. We need to pass in `terminal=False` here since
        # the *next* state, that is the state of the newly reset environment, is
        # always non-terminal by convention.
        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None):
    """Train the agent on `env` until `self.step` reaches `nb_steps`.

    Compared with a plain training loop this variant converts every action with
    `self._convert_action` before stepping the environment, calls `env.render()`
    after every step, reads `done[0]` from the step result, and sets the Keras
    learning phase to 1 before each forward/backward call.

    # Arguments
        env: Environment providing `reset()`, `step(action)`, `render()` and
            `action_space.sample()`.
        nb_steps (integer): Number of training steps to perform.
        action_repetition (integer): Number of times each chosen action is applied
            before the agent acts again. Must be >= 1.
        callbacks (list): Extra callbacks; the list is copied, not modified.
        verbose (integer): Unused here; the logger callbacks are commented out.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_start_steps (integer): Exclusive upper bound for the number of random
            start steps at the beginning of each episode (0 disables them).
        start_step_policy (`lambda observation: action`): Policy for the start steps;
            `None` samples from `env.action_space`.
        log_interval (integer): Unused here (only referenced by commented-out code).
        nb_max_episode_steps (integer): If set, an episode is forced terminal once it
            reaches this many steps.

    # Returns
        The `History` callback registered for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError(
            'action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    # Work on a copy so the caller's list is never extended.
    callbacks = [] if not callbacks else callbacks[:]

    # Logger callbacks are disabled in this variant.
    # if verbose == 1:
    #     callbacks += [TrainIntervalLogger(interval=log_interval)]
    # elif verbose > 1:
    #     callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    # `observation is None` marks "no episode in progress".
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = 0.

                self.reset_states()
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                assert observation is not None

                # Random start steps; they are not passed to `backward`.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 \
                    else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(
                        self._convert_action(action))
                    env.render()
                    # Repeat the same action until `info` carries an
                    # 'env_status.env_state' entry. The rewards of these
                    # extra steps (`r`) are not used afterwards.
                    # NOTE(review): the commented-out variant in the main loop
                    # reads info.get('n')[0] instead -- confirm which layout
                    # `info` actually has, otherwise this loop may not end.
                    while info.get('env_status.env_state') is None:
                        observation, r, done, info = env.step(
                            self._convert_action(action))
                        env.render()
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation = \
                            self.processor.process_observation(observation)
                        reward = self.processor.process_reward(reward)
                    # presumably `done` is a per-environment sequence and only
                    # the first entry matters -- TODO confirm.
                    # NOTE(review): the source indentation was lost; confirm
                    # this statement belongs outside the processor branch.
                    done = done[0]
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(
                            nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(
                                observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            K.set_learning_phase(1)
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            # Never filled in this variant, and not included in `step_logs`.
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(
                    self._convert_action(action))
                env.render()
                # while info.get('n')[0].get('env_status.env_state') is None:
                #     observation, r, done, info = env.step(
                #         self._convert_action(action))
                #     print(info)
                #     print(info.get('env_status.env_state'))
                #     env.render()
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)
                # NOTE(review): the source indentation was lost; confirm the
                # next two statements belong outside the processor branch.
                done = done[0]
                print(r, done, info)
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            K.set_learning_phase(1)
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode
            }
            print(action, reward)
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # One extra forward/backward pair after the last step of the
                # episode; the returned action is discarded.
                K.set_learning_phase(1)
                self.forward(observation)
                K.set_learning_phase(1)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # Ctrl-C ends training early; the end-of-training callbacks below
        # still run.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def _run(self,
         env,
         nb_steps=None,
         nb_episodes=None,
         train=True,
         exploration=True,
         action_repetition=1,
         callbacks=None,
         verbose=1,
         render=False,
         nb_max_start_steps=0,
         start_step_policy=None,
         log_interval=10000,
         nb_max_episode_steps=None,
         reward_scaling=1.,
         plots=False,
         tensorboard=False,
         **kwargs):
    """
    Run steps until termination.
    This method shouldn't be called directly, but instead called in :func:`fit` and :func:`test`

    Termination can be either:

    * Maximal number of steps
    * Maximal number of episodes

    Exactly one of ``nb_steps`` and ``nb_episodes`` must be given.

    :param env: Environment providing ``reset()`` and ``step(action)``.
    :param nb_steps: Number of steps before termination.
    :param nb_episodes: Number of episodes before termination.
    :param bool train: Whether to train or test the agent.
    :param bool exploration: Exploration is enabled only if this flag is set and ``train`` is true.
    :param int action_repetition: Number of times the action is repeated for each step.
    :param callbacks: Optional list of extra callbacks. The list is copied, never modified.
    :param int verbose: 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging.
        In test mode any value >= 1 adds a test logger.
    :param bool render: If true, a ``Visualizer`` callback is added.
    :param nb_max_start_steps: If non-zero, ``self._perform_random_steps`` is called with this value
        at the beginning of every episode.
    :param start_step_policy: (`lambda observation: action`): Passed to ``self._perform_random_steps``.
    :param log_interval: Interval handed to the interval logger when ``verbose`` is 1 in train mode.
    :param nb_max_episode_steps: If set, an episode is forced to end once its step counter reaches this value.
    :param reward_scaling: Factor applied to the reward accumulated during a step.
    :param plots: If true, portrait and trajectory hooks are appended to ``self.hooks``.
    :param tensorboard: If true, a tensorboard hook is appended to ``self.hooks``.
    :param kwargs: Accepted and ignored.
    :return: The ``History`` callback registered for this run.
    :raises RuntimeError: If the agent has not been compiled.
    :raises ValueError: If ``action_repetition`` < 1, or if not exactly one of
        ``nb_steps`` / ``nb_episodes`` is given.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `train()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(
            action_repetition))

    # Exactly one of nb_steps / nb_episodes selects the termination mode.
    if (nb_steps is None) == (nb_episodes is None):
        raise ValueError(
            "Please specify one (and only one) of nb_steps or nb_episodes")
    if nb_steps is not None:
        termination_criterion = STEPS_TERMINATION
    else:
        termination_criterion = EPISODES_TERMINATION

    self.training = train
    # We explore only if the flag is selected and we are in train mode
    self.exploration = (train and exploration)

    # Initialize callbacks. Copy first, so that the default loggers are added
    # to a private list and never to the list the caller passed in.
    callbacks = [] if not callbacks else callbacks[:]
    if self.training:
        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
    else:
        if verbose >= 1:
            callbacks += [TestLogger()]
    if render:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    if termination_criterion == STEPS_TERMINATION:
        params = {
            'nb_steps': nb_steps,
        }
    elif termination_criterion == EPISODES_TERMINATION:
        params = {
            'nb_episodes': nb_episodes,
            'nb_steps': 1,
        }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    # Add run hooks
    if tensorboard:
        from rl.hooks.tensorboard import TensorboardHook
        self.hooks.append(TensorboardHook(agent_id=self.id))
    if plots:
        from rl.hooks.plot import PortraitHook, TrajectoryHook
        self.hooks.append(PortraitHook(agent_id=self.id))
        self.hooks.append(TrajectoryHook(agent_id=self.id))

    # Define the termination criterion, measured relative to the step and
    # episode counters at which this call starts.
    start_step = self.step
    start_episode = self.episode
    if termination_criterion == STEPS_TERMINATION:

        def termination():
            return (self.step - start_step >= nb_steps)
    elif termination_criterion == EPISODES_TERMINATION:

        def termination():
            # Only stop on an episode boundary.
            return ((self.episode - start_episode >= nb_episodes
                     and self.done))

    if self.training:
        self._on_train_begin()
    else:
        self._on_test_begin()

    callbacks.on_train_begin()

    # Setup
    self.run_number += 1
    self.run_done = False
    # Starting with `done = True` makes the first loop iteration open an episode.
    self.done = True
    did_abort = False
    # Define these for clarification, not mandatory:
    # Where observation: Observation before the step
    # observation_1: Observation after the step
    self.observation = None
    self.observation_1 = None
    self.action = None
    self.step_summaries = None

    # Run_init hooks
    self.hooks.run_init()

    # Run steps (and episodes) until the termination criterion is met
    while not (self.run_done):

        # Init episode
        # If we are at the beginning of a new episode, execute a startup sequence
        if self.done:
            self.episode += 1
            if self.training:
                self.training_episode += 1
            self.episode_reward = 0.
            self.episode_step = 0
            callbacks.on_episode_begin(self.episode)

            # Obtain the initial observation by resetting the environment.
            self.reset_states()
            observation_0 = deepcopy(env.reset())
            assert observation_0 is not None

            # Perform random steps at beginning of episode and do not record them into the experience.
            # This slightly changes the start position between games.
            if nb_max_start_steps != 0:
                observation_0 = self._perform_random_steps(
                    nb_max_start_steps, start_step_policy, env,
                    observation_0, callbacks)

        else:
            # We are in the middle of an episode
            # Update the observation
            observation_0 = self.observation_1

        # FIXME: Use only one of the two variables
        self.observation = observation_0

        # Increment the current step in both cases
        self.step += 1
        if self.training:
            self.training_step += 1
        self.episode_step += 1
        self.reward = 0.
        self.step_summaries = []
        accumulated_info = {}

        # Run a single step.
        callbacks.on_step_begin(self.episode_step)
        # This is where all of the work happens. We first perceive and compute the action
        # (forward step) and then use the reward to improve (backward step).

        # state_0 -- (forward) --> action
        self.action = self.forward(self.observation)

        # action -- (step) --> (reward, state_1, terminal)
        # Apply the action
        # With repetition, if necessary
        for _ in range(action_repetition):
            callbacks.on_action_begin(self.action)
            self.observation_1, r, self.done, info = env.step(self.action)
            # observation_1 = deepcopy(observation_1)

            # Sum the real-valued info entries over the repeated actions.
            for key, value in info.items():
                if not np.isreal(value):
                    continue
                if key not in accumulated_info:
                    accumulated_info[key] = np.zeros_like(value)
                accumulated_info[key] += value

            callbacks.on_action_end(self.action)

            self.reward += r

            # Set episode as finished if the environment has terminated
            if self.done:
                break

        # Scale the reward
        self.reward = self.reward * reward_scaling
        self.episode_reward += self.reward

        # End of the step
        # Stop episode if reached the step limit
        if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps:
            # Force a terminal state.
            self.done = True

        # Post step: training, callbacks and hooks

        # Train the algorithm
        self.backward()

        # step_end Hooks
        self.hooks()

        # Callbacks
        # Collect statistics
        step_logs = {
            'action': self.action,
            'observation': self.observation_1,
            'reward': self.reward,
            # For legacy callbacks support
            'metrics': [],
            'episode': self.episode,
            'info': accumulated_info,
        }
        callbacks.on_step_end(self.episode_step, step_logs)

        # Episodic callbacks
        if self.done:
            # Collect statistics.
            # np.float64 is the type the former `np.float_` alias referred
            # to; the alias itself was removed in NumPy 2.0.
            episode_logs = {
                'episode_reward': np.float64(self.episode_reward),
                'nb_episode_steps': np.float64(self.episode_step),
                'nb_steps': np.float64(self.step),
            }
            callbacks.on_episode_end(self.episode, logs=episode_logs)
            self.hooks.episode_end()

        # Stop run if termination criterion met
        if termination():
            self.run_done = True

    callbacks.on_train_end(logs={'did_abort': did_abort})
    # NOTE(review): `_on_train_end` is called even when `train` is false,
    # although the run was opened with `_on_test_begin` -- confirm intended.
    self._on_train_end()
    self.hooks.run_end()

    return history
def fit(self, env, nb_episodes, min_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None):
    """Trains the agent one whole episode at a time.

    Episodes are run until at least `nb_episodes` of them have finished and
    `self.step` has reached `min_steps`.

    # Arguments
        env: Environment providing `reset()` and `step(action)`.
        nb_episodes (integer): Minimum number of episodes to run.
        min_steps (integer): Minimum number of steps to run.
        action_repetition (integer): Only validated (must be >= 1); each action is
            applied exactly once per step here.
        callbacks (list): Additional callbacks. The list is copied, not modified.
        verbose (integer): 1 adds a `TrainIntervalLogger`, values above 1 add a
            `myTrainEpisodeLogger`; otherwise no logger is added.
        visualize (boolean): If true, a `Visualizer` callback is added.
        nb_max_start_steps: Not used by this implementation.
        start_step_policy: Not used by this implementation.
        log_interval (integer): Interval handed to `TrainIntervalLogger`.
        nb_max_episode_steps (integer): If set, the episode is forced to end once
            that many steps have been taken.

    # Returns
        The `History` object created here. It is not added to the callback
        list, so nothing in this method writes to it.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            "Your tried to fit your agent but it hasn't been compiled yet. "
            "Please call `compile()` before `fit()`.")
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    # Assemble the callback list on a private copy of the caller's list.
    cb_items = callbacks[:] if callbacks else []
    if verbose == 1:
        cb_items.append(TrainIntervalLogger(interval=log_interval))
    elif verbose > 1:
        cb_items.append(myTrainEpisodeLogger(self))
    if visualize:
        cb_items.append(Visualizer())
    history = History()
    callbacks = CallbackList(cb_items)

    # Prefer the public setters when the callback list offers them.
    attach_model = callbacks.set_model if hasattr(callbacks, 'set_model') else callbacks._set_model
    attach_model(self)
    callbacks._set_env(env)
    params = {'nb_episodes': nb_episodes, 'name': self.name}
    attach_params = callbacks.set_params if hasattr(callbacks, 'set_params') else callbacks._set_params
    attach_params(params)

    self._on_train_begin()
    callbacks.on_train_begin()

    self.step = 0
    episode = 0
    while not (episode >= nb_episodes and self.step >= min_steps):
        callbacks.on_episode_begin(episode)
        episode_step = 0
        episode_reward = 0.

        self.reset_states()
        observation = deepcopy(env.reset())

        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            # Pick the action straight from the Q-values (only for windows 1).
            q_values = self.compute_q_values([observation])
            action = self.policy.select_action(q_values=q_values)
            self.recent_observation = observation
            self.recent_action = action

            callbacks.on_action_begin(action)
            observation, reward, done, info = env.step(action)
            observation = deepcopy(observation)
            callbacks.on_action_end(action)

            # Force a terminal state on the last allowed step of the episode.
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True

            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            callbacks.on_step_end(episode_step, {
                'action': action,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
            })
            episode_step += 1
            self.step += 1
            self.total_step += 1

            if done:
                # Record the Q-value statistics of the final step.
                self.policy.log_qvalue(q_values)
                cur_maxq = self.qlogger.cur_maxq
                self.q_values = q_values
                if self.name == 'env':
                    displayQvalue(q_values)
                self.reward_his.append(episode_reward)
                self.max_reward = max(self.max_reward, episode_reward)
                # One more forward/backward pass after the terminal step; the
                # resulting action is ignored.
                self.forward(observation)
                self.backward(0., terminal=False)

        callbacks.on_episode_end(episode, {
            'episode_reward': episode_reward,
            'nb_episode_steps': episode_step,
            'nb_steps': self.step,
            'q_value': q_values[action],
            'q_max': cur_maxq,
            'q_mean': np.mean(self.qlogger.mean_maxq)
        })
        episode += 1

    callbacks.on_train_end(logs={'did_abort': False})
    self._on_train_end()
    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False,
        nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None):
    """Trains `self.agent1` and `self.agent2` together on the given environment.

    Each step both agents choose an action, the pair is sent to the environment
    as one tuple, and each agent is trained on its own reward (`info["r1"]` /
    `info["r2"]`). Each agent's observation is the environment observation with
    the other agent's clipped action appended.

    # Arguments
        env: (`Env` instance): Environment that the agents interact with.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Must be 1; repetition is not supported here.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. The list is copied.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`),
            2 for episode logging
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_start_steps (integer): Must be 0; random start steps are not supported here.
        start_step_policy: Not used by this implementation.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered
            to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agents perform
            before the episode is forced to end. Set to `None` for no limit.

    # Returns
        A `collections.deque` of experience tuples
        `(t, obs, (u1_clipped, u2_clipped), (0., 0.), r, (r1, r2))`, one per step
        whose observation has length 2. The `History` callback is registered but
        not returned.

    # Raises
        RuntimeError: If either agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
        AssertionError: If a processor is set, `nb_max_start_steps` is not 0 or
            `action_repetition` is not 1.
        AttributeError: If the environment reports `done` from `step()`.
    """
    if not (self.agent1.compiled and self.agent2.compiled):
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    assert self.processor is None  # Removed processors here for simplification. Not needed anyway
    assert nb_max_start_steps == 0  # Removed here for simplification. Not needed anyway
    assert action_repetition == 1  # Removed here for simplification. Not needed anyway

    self.agent1.training = True
    self.agent2.training = True

    experience_for_plotting = deque()

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self.agent1._on_train_begin()
    self.agent2._on_train_begin()
    callbacks.on_train_begin()

    # Plain Python ints: a fixed-width np.int16 counter wraps around at 32767,
    # after which `step < nb_steps` would stay true forever.
    episode = 0
    self.agent1.step = 0
    self.agent2.step = 0
    observation1 = observation2 = None
    episode_reward1 = None
    episode_reward2 = None
    episode_step = None
    did_abort = False
    try:
        while self.agent1.step < nb_steps:  # not individual for now
            if observation1 is None or observation2 is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward1 = np.float32(0)
                episode_reward2 = np.float32(0)

                # Obtain the initial observation by resetting the environment.
                self.agent1.reset_states()
                self.agent2.reset_states()
                obs = env.reset()
                # No previous action of the other agent exists yet, so append 0.
                observation1 = deepcopy(obs) + (0.,)
                observation2 = deepcopy(obs) + (0.,)

            # At this point, we expect to be fully initialized.
            assert episode_reward1 is not None
            assert episode_reward2 is not None
            assert episode_step is not None
            assert observation1 is not None
            assert observation2 is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action1 = np.ndarray.item(self.agent1.forward(observation1))
            action2 = np.ndarray.item(self.agent2.forward(observation2))
            action = (action1, action2)
            reward1 = np.float32(0)
            reward2 = np.float32(0)
            accumulated_info = {}

            callbacks.on_action_begin(action)  # Use only one of the actions? added actions?
            obs, r, done, info = env.step(action)
            if done:
                raise AttributeError  # The episode was reset unexpectedly
                # (see https://stackoverflow.com/questions/42787924/)

            observation1 = deepcopy(obs) + (info["u2_clipped"],)  # Add action other to the observation
            observation2 = deepcopy(obs) + (info["u1_clipped"],)
            for key, value in info.items():
                if not np.isreal(value):
                    continue
                if key not in accumulated_info:
                    accumulated_info[key] = np.zeros_like(value)
                accumulated_info[key] += value
            callbacks.on_action_end(action)
            reward1 += info["r1"]
            reward2 += info["r2"]

            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics1 = self.agent1.backward(reward1, terminal=done)
            # Only agent 1's metrics are logged below; agent 2 is still trained.
            self.agent2.backward(reward2, terminal=done)
            episode_reward1 += reward1
            episode_reward2 += reward2

            step_logs = {
                'action': action[0] + action[1],
                'observation': observation1,
                'reward': reward1 + reward2,
                'metrics': metrics1,  # not individual for now
                'episode': episode,
                'info': accumulated_info,
            }

            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.agent1.step += 1
            self.agent2.step += 1

            if len(obs) == 2:
                experience_for_plotting.append(
                    (info["t"], obs, (info["u1_clipped"], info["u2_clipped"]), (0., 0.), r,
                     (info["r1"], info["r2"])))

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.agent1.forward(observation1)
                self.agent2.forward(observation2)
                self.agent1.backward(0., terminal=False)
                self.agent2.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward1 + episode_reward2,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.agent1.step,  # not individual for now
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation1 = None
                observation2 = None
                episode_step = None
                episode_reward1 = None
                episode_reward2 = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self.agent1._on_train_end()
    self.agent2._on_train_end()

    return experience_for_plotting
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None):
    """Runs the agent on `env` for `nb_episodes` episodes with `self.training` set to False.

    # Arguments
        env: Environment providing `reset()`, `step(action)` and `action_space`.
        nb_episodes (integer): Number of episodes to perform.
        action_repetition (integer): Number of times each chosen action is applied
            before the agent chooses again. Must be >= 1.
        callbacks (list): Additional callbacks. The list is copied, so the
            caller's list is left untouched.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_episode_steps (integer): Maximum number of steps per episode, or
            `None` for no limit.
        nb_max_start_steps (integer): If non-zero, `np.random.randint(nb_max_start_steps)`
            start steps are performed at the beginning of each episode.
        start_step_policy (`lambda observation: action`): Policy for the start steps.
            If `None`, actions are sampled from `env.action_space`.

    # Returns
        None. Results are reported through the callbacks only.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False

    # Work on a copy: appending to the argument itself would modify the
    # caller's list (or a default shared between calls).
    callbacks = [] if not callbacks else callbacks[:]
    callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    callbacks = CallbackList(callbacks)
    callbacks._set_model(self)
    callbacks._set_env(env)
    callbacks._set_params({
        'nb_episodes': nb_episodes,
    })

    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = env.reset()
        assert observation is not None

        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
            nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            callbacks.on_action_begin(action)
            observation, _, done, _ = env.step(action)
            callbacks.on_action_end(action)
            if done:
                warnings.warn(
                    'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                    .format(nb_random_start_steps))
                observation = env.reset()
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            reward = 0.
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, _ = env.step(action)
                callbacks.on_action_end(action)
                reward += r
                if d:
                    done = True
                    break
            self.backward(reward, terminal=done)
            episode_reward += reward

            callbacks.on_step_end(episode_step)
            episode_step += 1
            # `episode_step` now counts the steps taken, so `>=` stops after
            # exactly `nb_max_episode_steps` steps.
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps:
                done = True
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
    """Runs the agent on `env` for `nb_episodes` episodes with `self.training` set to False.

    # Arguments
        env: Environment providing `reset()`, `step(action)` and `action_space`.
        nb_episodes (integer): Number of episodes to perform.
        action_repetition (integer): Number of times each chosen action is applied
            before the agent chooses again. Must be >= 1.
        callbacks (list): Additional callbacks. The list is copied, not modified.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_episode_steps (integer): Maximum number of steps per episode, or
            `None` for no limit.
        nb_max_start_steps (integer): If non-zero, `np.random.randint(nb_max_start_steps)`
            start steps are performed at the beginning of each episode.
        start_step_policy (`lambda observation: action`): Policy for the start steps.
            If `None`, actions are sampled from `env.action_space`.
        verbose (integer): A `TestLogger` callback is added when this is >= 1.

    # Returns
        The `History` callback instance registered for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            "Your tried to test your agent but it hasn't been compiled yet. "
            "Please call `compile()` before `test()`.")
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    # Assemble the callbacks on a private copy of the caller's list.
    cb_items = callbacks[:] if callbacks else []
    if verbose >= 1:
        cb_items.append(TestLogger())
    if visualize:
        cb_items.append(Visualizer())
    history = History()
    cb_items.append(history)
    callbacks = CallbackList(cb_items)
    callbacks._set_model(self)
    callbacks._set_env(env)
    callbacks._set_params({'nb_episodes': nb_episodes})

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Every episode starts from a freshly reset agent and environment.
        self.reset_states()
        observation = env.reset()
        assert observation is not None

        # Optional start steps; these are never passed to `backward`.
        if nb_max_start_steps == 0:
            nb_random_start_steps = 0
        else:
            nb_random_start_steps = np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is not None:
                action = start_step_policy(observation)
            else:
                action = env.action_space.sample()
            callbacks.on_action_begin(action)
            observation, _, done, _ = env.step(action)
            callbacks.on_action_end(action)
            if not done:
                continue
            warnings.warn(
                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                .format(nb_random_start_steps))
            observation = env.reset()
            break

        # Main loop of the episode.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            step_reward = 0.
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, partial_reward, env_done, _ = env.step(action)
                callbacks.on_action_end(action)
                step_reward += partial_reward
                if env_done:
                    done = True
                    break
            # Force a terminal state on the last allowed step of the episode.
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.backward(step_reward, terminal=done)
            episode_reward += step_reward

            callbacks.on_step_end(episode_step)
            episode_step += 1
            self.step += 1

        # The agent has not yet seen the terminal state, so run one more
        # forward/backward pass and ignore the action. `terminal=False` because
        # the next state (after the reset) is non-terminal by convention.
        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode.
        callbacks.on_episode_end(episode, {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        })
    callbacks.on_train_end()
    self._on_test_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False,
        nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        nb_max_episode_steps=None):
    """Trains the agent on `env` for `nb_steps` steps.

    # Arguments
        env: Environment providing `reset()`, `step(action)` and `action_space`.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times each chosen action is applied
            before the agent chooses again. Must be >= 1.
        callbacks (list): Additional callbacks. The list is copied, so the
            caller's list is left untouched.
        verbose (integer): 1 adds a `TrainIntervalLogger`, values above 1 add a
            `TrainEpisodeLogger`; otherwise no logger is added.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_start_steps (integer): If non-zero, `np.random.randint(nb_max_start_steps)`
            start steps are performed at the beginning of each episode.
        start_step_policy (`lambda observation: action`): Policy for the start steps.
            If `None`, actions are sampled from `env.action_space`.
        log_interval (integer): Interval handed to `TrainIntervalLogger`.
        nb_max_episode_steps (integer): Maximum number of steps per episode, or
            `None` for no limit.

    # Returns
        None. Progress is reported through the callbacks only.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    # Work on a copy: appending to the argument itself would modify the
    # caller's list (or a default shared between calls).
    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    callbacks = CallbackList(callbacks)
    callbacks._set_model(self)
    callbacks._set_env(env)
    callbacks._set_params({
        'nb_steps': nb_steps,
    })
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = 0.

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = env.reset()
                assert observation is not None

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                    nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    callbacks.on_action_begin(action)
                    observation, _, done, _ = env.step(action)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn(
                            'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                            .format(nb_random_start_steps))
                        observation = env.reset()
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            reward = 0.
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, _ = env.step(action)
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            # `episode_step` now counts the steps taken, so `>=` ends the
            # episode after exactly `nb_max_episode_steps` steps.
            if done or (nb_max_episode_steps and episode_step >= nb_max_episode_steps):
                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
def _run(self, env, nb_steps=None, nb_episodes=None, training=True, action_repetition=1,
         callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0,
         start_step_policy=None, log_interval=10000, nb_max_episode_steps=None,
         reward_scaling=1.):
    """Runs the agent on the given environment, either training or testing it.

    Exactly one of `nb_steps` and `nb_episodes` must be given; it decides when
    the run stops.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with.
        nb_steps (integer): Number of steps to be performed.
        nb_episodes (integer): Number of episodes to perform.
        training (boolean): Whether to train or test the agent.
        action_repetition (integer): Number of times the agent repeats the same action
            without observing the environment again. Must be >= 1.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during the run. The list is copied.
        verbose (integer): When training, 1 adds a `TrainIntervalLogger` and values
            above 1 add a `TrainEpisodeLogger`. When testing, values >= 1 add a
            `TestLogger`.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_start_steps (integer): If non-zero, `self._perform_random_steps` is
            called with it at the beginning of each episode.
        start_step_policy (`lambda observation: action`): Passed on to
            `self._perform_random_steps`.
        log_interval (integer): Interval handed to `TrainIntervalLogger`.
        nb_max_episode_steps (integer): Number of steps per episode after which the
            episode is forced to end. Set to `None` for no limit.
        reward_scaling (float): Factor by which the reward of each step is multiplied.

    # Returns
        The `History` callback instance registered for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1, or if not exactly one
            of `nb_steps` and `nb_episodes` is given.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    # Exactly one of nb_steps / nb_episodes has to be specified.
    if (nb_steps is None) == (nb_episodes is None):
        raise ValueError("Please specify one (and only one) of nb_steps and nb_episodes")
    termination_criterion = STEPS_TERMINATION if nb_steps is not None else EPISODES_TERMINATION

    self.training = training

    # Initialize callbacks. Copy first, so the loggers appended below never end
    # up in the caller's list.
    callbacks = [] if not callbacks else callbacks[:]
    if self.training:
        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
    else:
        if verbose >= 1:
            callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    if termination_criterion == STEPS_TERMINATION:
        params = {
            'nb_steps': nb_steps,
        }
    elif termination_criterion == EPISODES_TERMINATION:
        params = {
            'nb_episodes': nb_episodes,
            'nb_steps': 1,
        }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    # Initialize the Hooks
    hooks = Hooks(self, [TensorboardHook(), PortraitHook(), TrajectoryHook()])

    # Define the termination criterion
    # Step and episode at which we start the function
    start_step = self.step
    start_episode = self.episode
    if termination_criterion == STEPS_TERMINATION:
        def termination():
            # `>=` stops after exactly nb_steps steps.
            return self.step - start_step >= nb_steps
    elif termination_criterion == EPISODES_TERMINATION:
        def termination():
            # Stop on the episode boundary, before another episode is started.
            return self.done and self.episode - start_episode >= nb_episodes

    if self.training:
        self._on_train_begin()
    else:
        self._on_test_begin()

    callbacks.on_train_begin()

    # Setup: `done` starts as True so that the first iteration opens an episode.
    self.done = True
    did_abort = False
    # Define these for clarification, not mandatory:
    # Where observation_0: Observation before the step
    #       observation_1: Observation after the step
    observation_0 = None
    observation_1 = None
    self.step_summaries = None
    try:
        # Run steps (and episodes) until the termination criterion is met
        while not termination():
            # Init episode
            # If we are at the beginning of a new episode, execute a startup sequence
            if self.done:
                self.episode += 1
                self.episode_reward = 0.
                self.episode_step = 0
                callbacks.on_episode_begin(self.episode)

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation_0 = deepcopy(env.reset())
                assert observation_0 is not None

                # Perform random steps at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                if nb_max_start_steps != 0:
                    observation_0 = self._perform_random_steps(
                        nb_max_start_steps, start_step_policy, env, observation_0, callbacks)
            else:
                # We are in the middle of an episode
                # Update the observation
                observation_0 = observation_1

            # FIXME: Use only one of the two variables
            self.observation = observation_0

            # Increment the current step in both cases
            self.step += 1
            self.episode_step += 1
            self.reward = 0.
            accumulated_info = {}

            # Run a single step.
            callbacks.on_step_begin(self.episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).

            # state_0 -- (forward) --> action
            action = self.forward(observation_0)
            # Process the action
            action = self.processor.process_action(action)

            # action -- (step) --> (reward, state_1, terminal)
            # Apply the action, with repetition if necessary
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation_1, r, self.done, info = env.step(action)
                observation_1, r, self.done, info = self.processor.process_step(
                    observation_1, r, self.done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)

                self.reward += r

                # Set episode as finished if the environment has terminated
                if self.done:
                    break

            # Scale the reward
            self.reward = self.reward * reward_scaling
            self.episode_reward += self.reward

            # End of the step
            # Stop episode if reached the step limit
            if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps:
                # Force a terminal state.
                self.done = True

            # Post step: training, callbacks and hooks
            # Train the algorithm
            metrics, self.step_summaries = self.backward(
                observation_0, action, self.reward, observation_1, terminal=self.done)

            # Hooks
            hooks()

            # Callbacks
            # Collect statistics
            step_logs = {
                'action': action,
                'observation': observation_1,
                'reward': self.reward,
                'metrics': metrics,
                'episode': self.episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(self.episode_step, step_logs)

            # Episodic callbacks
            if self.done:
                # Collect statistics. `np.float64` is used because the
                # `np.float_` alias no longer exists in NumPy 2.
                episode_logs = {
                    'episode_reward': np.float64(self.episode_reward),
                    'nb_episode_steps': np.float64(self.episode_step),
                    'nb_steps': np.float64(self.step),
                }
                callbacks.on_episode_end(self.episode, logs=episode_logs)

    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that the run can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    # NOTE(review): `_on_train_end` is also called for test runs, although those
    # start with `_on_test_begin` -- confirm whether a test-end hook should be used.
    self._on_train_end()

    return history
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
    """Runs the agent on the given environment for a number of episodes.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with.
        nb_episodes (integer): Number of episodes to perform.
        action_repetition (integer): Number of times the agent repeats the same action
            without observing the environment again. Must be >= 1.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during the run. The list is copied.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_episode_steps (integer): Number of steps per episode after which the
            episode is forced to end. Set to `None` for no limit.
        nb_max_start_steps (integer): If non-zero, `np.random.randint(nb_max_start_steps)`
            start steps are performed at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy to follow for the
            start steps. If set to `None`, actions are sampled from `env.action_space`.
        verbose (integer): A `TestLogger` callback is added when this is >= 1.

    # Returns
        The `History` callback instance registered for this run.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError('Your tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]

    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset())
        # Guard on the object that is actually called, matching every other
        # processor check in this method.
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(observation, r, done, info)
            callbacks.on_action_end(action)
            if done:
                warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, d, info = self.processor.process_step(observation, r, d, info)
                callbacks.on_action_end(action)
                reward += r
                # Sum up every real-valued info entry over the repeated actions.
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                if d:
                    done = True
                    break
            # Force a terminal state on the last allowed step of the episode.
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.backward(reward, observation, info, env, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # No extra forward/backward call is made after the terminal step
        # (that call is disabled in this implementation).

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
def test(
    self,
    env,
    nb_episodes=1,
    action_repetition=1,
    callbacks=None,
    visualize=True,
    nb_max_episode_steps=None,
    nb_max_start_steps=0,
    start_step_policy=None,
):
    """Runs the agent on `env` for `nb_episodes` episodes with `self.training` set to `False`.

    # Arguments
        env: Environment to interact with. It must provide `reset()`, `step(action)` and
            `action_space.sample()`.
        nb_episodes (integer): Number of episodes to run.
        action_repetition (integer): Number of times each chosen action is sent to the
            environment before the agent picks a new one. Must be >= 1.
        callbacks (list): Extra callbacks. The list is copied, never modified in place.
        visualize (boolean): If `True`, a `Visualizer` callback is attached.
        nb_max_episode_steps (integer): Upper bound on the number of agent steps per episode.
            `None` (or 0) lets the episode run until the environment reports `done`.
        nb_max_start_steps (integer): Exclusive upper bound of the number of warm-up steps
            performed at the start of each episode; the count is drawn with
            `np.random.randint(nb_max_start_steps)`.
        start_step_policy (`lambda observation: action`): Policy used for the warm-up steps.
            If `None`, actions are sampled from `env.action_space`.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            "Your tried to test your agent but it hasn't been compiled yet. Please call `compile()` before `test()`."
        )
    if action_repetition < 1:
        raise ValueError("action_repetition must be >= 1, is {}".format(action_repetition))

    self.training = False

    # Work on a private copy: appending to the argument would leak loggers into the caller's
    # list and, with a shared default list, into every subsequent call.
    callbacks = list(callbacks) if callbacks else []
    callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    callbacks = CallbackList(callbacks)
    callbacks._set_model(self)
    callbacks._set_env(env)
    callbacks._set_params({"nb_episodes": nb_episodes})

    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.0
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = env.reset()
        assert observation is not None

        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            callbacks.on_action_begin(action)
            observation, _, done, _ = env.step(action)
            callbacks.on_action_end(action)
            if done:
                warnings.warn(
                    "Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.".format(
                        nb_random_start_steps
                    )
                )
                observation = env.reset()
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            reward = 0.0
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, _ = env.step(action)
                callbacks.on_action_end(action)
                reward += r
                if d:
                    done = True
                    break
            self.backward(reward, terminal=done)
            episode_reward += reward

            callbacks.on_step_end(episode_step)
            episode_step += 1
            # `episode_step` now equals the number of steps taken, so `>=` stops after exactly
            # `nb_max_episode_steps` steps (a strict `>` would allow one step too many).
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps:
                done = True

        episode_logs = {"episode_reward": episode_reward, "nb_steps": episode_step}
        callbacks.on_episode_end(episode, episode_logs)
class Agent(object):
    """Abstract base class for all implemented agents.

    Each agent interacts with the environment (as defined by the `Env` class) by first observing the
    state of the environment. Based on this observation the agent changes the environment by performing
    an action.

    Do not use this abstract base class directly but instead use one of the concrete agents implemented.
    Each agent realizes a reinforcement learning algorithm. Since all agents conform to the same
    interface, you can use them interchangeably.

    To implement your own agent, you have to implement the following methods:

    - `forward`
    - `backward`
    - `compile`
    - `load_weights`
    - `save_weights`
    - `layers`

    # Arguments
        processor (`Processor` instance): See [Processor](#processor) for details.
    """

    def __init__(self, processor=None):
        self.processor = processor
        self.training = False
        self.step = 0

    def get_config(self):
        """Configuration of the agent for serialization.
        """
        return {}

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Returns `numerator / denominator` as a float, or NaN when the denominator is zero."""
        if not denominator:
            return float('nan')
        return float(numerator) / denominator

    def init_fit_parallel(self,
                          nb_steps=10000,
                          sampler_update_interval=500,
                          action_repetition=1,
                          callbacks=None,
                          verbose=1,
                          visualize=False,
                          nb_max_start_steps=0,
                          start_step_policy=None,
                          log_interval=10000,
                          nb_max_episode_steps=None):
        """Prepares the agent for a sequence of `fit_parallel` calls.

        Builds the training callback list, fires the train-begin hooks and resets the
        per-"episode" bookkeeping counters stored on the agent.

        # Arguments
            nb_steps (integer): Stored in the callback params as `nb_steps`.
            action_repetition (integer): Only validated here; must be >= 1.
            callbacks (list): Extra callbacks to attach; the list itself is not modified.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`),
                larger than 1 for episode logging.
            visualize (boolean): If `True`, a `Visualizer` callback is attached.
            log_interval (integer): Interval handed to `TrainIntervalLogger` if `verbose` = 1.
            sampler_update_interval, nb_max_start_steps, start_step_policy, nb_max_episode_steps:
                Accepted but not used by this method.

        # Raises
            RuntimeError: If the agent has not been compiled.
            ValueError: If `action_repetition` is smaller than 1.
        """
        if not self.compiled:
            raise RuntimeError(
                'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True

        # Always start from a fresh list so callbacks of an earlier run are not reused.
        self.training_callbacks = []
        if callbacks:
            self.training_callbacks += callbacks

        if verbose == 1:
            self.training_callbacks += [
                TrainIntervalLogger(interval=log_interval)
            ]
        elif verbose > 1:
            self.training_callbacks += [TrainEpisodeLogger()]
        if visualize:
            self.training_callbacks += [Visualizer()]
        self.training_history = History()
        self.training_callbacks += [self.training_history]
        self.training_callbacks = CallbackList(self.training_callbacks)
        # Support both the public and the underscore-prefixed callback-list API.
        if hasattr(self.training_callbacks, 'set_model'):
            self.training_callbacks.set_model(self)
        else:
            self.training_callbacks._set_model(self)
        # No environment is handed to the callbacks here (unlike `fit`).
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(self.training_callbacks, 'set_params'):
            self.training_callbacks.set_params(params)
        else:
            self.training_callbacks._set_params(params)
        self._on_train_begin()
        self.training_callbacks.on_train_begin()

        self.episode = 0
        self.episode_step = 0
        self.episode_reward = 0.
        self.step = 0
        self.episode_fit_calls = 0
        self.episode_backward_time = dt.timedelta()
        self.episode_n_backward_calls = 0

    def fit_parallel(self,
                     experience_list,
                     sampler_update_interval=500,
                     n_backward_calls=1):
        """Consumes a batch of externally collected experiences and updates the agent.

        `backward` is called once with `experience_list` and then `n_backward_calls - 1` more
        times with an empty list. Every experience is reported to the training callbacks as one
        step. Once at least `sampler_update_interval` experiences have been consumed since the
        last report, an "episode" is closed: `on_episode_end` fires, a summary line is printed
        and the per-episode counters are reset.

        `init_fit_parallel` must have been called before.

        # Arguments
            experience_list (list): Experiences exposing `action`, `state1`, `reward` and
                `terminal1`; the optional attributes `cumulativereward`, `workerid`, `epnum`
                and `seed` are forwarded to the step logs when present.
            sampler_update_interval (integer): Number of experiences that make up one "episode".
            n_backward_calls (integer): Total number of `backward` calls to perform.

        # Returns
            Always `None`.
        """
        if self.episode_step == 0 and len(
                experience_list) > 0:  # start of a new "episode"
            self.training_callbacks.on_episode_begin(self.episode)

        try:
            self.training_callbacks.on_step_begin(self.episode_step)

            start_time = dt.datetime.now()
            metrics = self.backward(experience_list)
            for _ in range(n_backward_calls - 1):
                metrics = self.backward([])
            self.episode_backward_time += dt.datetime.now() - start_time
            self.episode_n_backward_calls += n_backward_calls

            for e in experience_list:
                step_logs = {
                    'action': e.action,
                    'observation': e.state1,
                    'reward': e.reward,
                    'metrics': metrics,
                    'episode': self.episode,
                    'info': {},
                    'done': e.terminal1,
                }
                # Optional bookkeeping attributes are passed through only if present.
                for attr in ('cumulativereward', 'workerid', 'epnum', 'seed'):
                    if hasattr(e, attr):
                        step_logs[attr] = getattr(e, attr)
                self.training_callbacks.on_step_end(self.episode_step,
                                                    step_logs)
                self.episode_step += 1
                self.step += 1
                self.episode_reward += e.reward
            self.episode_fit_calls += 1

            if self.episode_step >= sampler_update_interval:
                # This "episode" is finished, report and reset. Unlike `fit`, no extra
                # forward/backward pass is performed here.
                episode_logs = {
                    'episode_reward': self.episode_reward,
                    'nb_episode_steps': self.episode_step,
                    'nb_steps': self.step,
                }
                self.training_callbacks.on_episode_end(self.episode,
                                                       episode_logs)
                # Both denominators can legitimately be zero (no experiences yet, or a
                # backward pass faster than the clock resolution), hence `_safe_ratio`.
                print(
                    "Main Thread: episode: %d, # new experiences: %d, backward/experience: %.2f, backward/sec: %.2f, backward/fit: %d, backward/ep: %d"
                    % (self.episode + 1, self.episode_step,
                       self._safe_ratio(self.episode_n_backward_calls,
                                        self.episode_step),
                       self._safe_ratio(
                           self.episode_n_backward_calls,
                           self.episode_backward_time.total_seconds()),
                       n_backward_calls, self.episode_n_backward_calls))
                self.episode += 1
                self.episode_step = 0
                self.episode_reward = 0.
                self.episode_backward_time = dt.timedelta()
                self.episode_n_backward_calls = 0
                self.episode_fit_calls = 0
        except KeyboardInterrupt:
            # The interrupt is absorbed and no train-end hook is fired from here.
            # NOTE(review): the caller has no way to learn that an abort happened - confirm
            # that this is intended.
            pass
        return None

    def fit(self,
            env,
            nb_steps,
            action_repetition=1,
            callbacks=None,
            verbose=1,
            visualize=False,
            nb_max_start_steps=0,
            start_step_policy=None,
            log_interval=10000,
            nb_max_episode_steps=None):
        """Trains the agent on the given environment.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an exclusive upper limit:
                the exact number of steps is drawn with `np.random.randint(nb_max_start_steps)`, i.e.
                from [0, nb_max_start_steps), at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.

        # Raises
            RuntimeError: If the agent has not been compiled.
            ValueError: If `action_repetition` is smaller than 1.
        """
        if not self.compiled:
            raise RuntimeError(
                'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        # Support both the public and the underscore-prefixed callback-list API.
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self._on_train_begin()
        callbacks.on_train_begin()

        episode = 0
        self.step = 0
        observation = None
        episode_reward = None
        episode_step = None
        did_abort = False
        try:
            while self.step < nb_steps:
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                        nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.processor is not None:
                            action = self.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(
                                observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                                .format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(
                                    observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is were all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r, done, info)
                    # Sum up the real-valued `info` entries over the repeated actions.
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                # NOTE(review): this call omits the `nextobservation` argument declared by
                # `Agent.backward` (and passed by `test`) - confirm against the concrete agents.
                metrics = self.backward(reward, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.forward(observation)
                    self.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self._on_train_end()

        return history

    def test(self,
             env,
             nb_episodes=1,
             action_repetition=1,
             callbacks=None,
             visualize=True,
             nb_max_episode_steps=None,
             nb_max_start_steps=0,
             start_step_policy=None,
             verbose=1):
        """Runs the agent on the given environment with `self.training` set to `False`.

        # Arguments
            env: (`Env` instance): Environment that the agent interacts with.
            nb_episodes (integer): Number of episodes to perform.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Must be >= 1.
            callbacks (list): List of callbacks to apply during the run. The list is copied.
            visualize (boolean): If `True`, a `Visualizer` callback is attached.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                the episode is forced to end. Set to `None` if each episode should run until the
                environment signals a terminal state.
            nb_max_start_steps (integer): Exclusive upper limit of the number of steps performed at
                the beginning of each episode using `start_step_policy`; the exact number is drawn
                with `np.random.randint(nb_max_start_steps)`.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            verbose (integer): If >= 1, a `TestLogger` callback is attached.

        # Returns
            The `History` callback instance that was attached for this run.

        # Raises
            RuntimeError: If the agent has not been compiled.
            ValueError: If `action_repetition` is smaller than 1.
        """
        if not self.compiled:
            raise RuntimeError(
                'Your tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.'
            )
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(
                action_repetition))

        self.training = False
        self.step = 0

        callbacks = [] if not callbacks else callbacks[:]

        if verbose >= 1:
            callbacks += [TestLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        # Support both the public and the underscore-prefixed callback-list API.
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_episodes': nb_episodes,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)

        self._on_test_begin()
        # The train-begin/-end callback hooks are fired for test runs as well.
        callbacks.on_train_begin()
        for episode in range(nb_episodes):
            callbacks.on_episode_begin(episode)
            episode_reward = 0.
            episode_step = 0

            # Obtain the initial observation by resetting the environment.
            self.reset_states()
            observation = deepcopy(env.reset())
            if self.processor is not None:
                observation = self.processor.process_observation(observation)
            assert observation is not None

            # Perform random starts at beginning of episode and do not record them into the experience.
            # This slightly changes the start position between games.
            nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                nb_max_start_steps)
            for _ in range(nb_random_start_steps):
                if start_step_policy is None:
                    action = env.action_space.sample()
                else:
                    action = start_step_policy(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)
                callbacks.on_action_end(action)
                if done:
                    warnings.warn(
                        'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                        .format(nb_random_start_steps))
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(
                            observation)
                    break

            # Run the episode until we're done.
            done = False
            while not done:
                callbacks.on_step_begin(episode_step)

                action = self.forward(observation)
                if self.processor is not None:
                    action = self.processor.process_action(action)
                reward = 0.
                accumulated_info = {}
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, d, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, d, info = self.processor.process_step(
                            observation, r, d, info)
                    callbacks.on_action_end(action)
                    reward += r
                    # Sum up the real-valued `info` entries over the repeated actions.
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    if d:
                        done = True
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                self.backward(reward, observation, terminal=done)
                episode_reward += reward

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

            # Unlike `fit`, no additional forward/backward pass is made on the terminal state.

            # Report end of episode.
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_steps': episode_step,
            }
            callbacks.on_episode_end(episode, episode_logs)
        callbacks.on_train_end()
        self._on_test_end()

        return history

    def reset_states(self):
        """Resets all internally kept states after an episode is completed.
        """
        pass

    def forward(self, observation):
        """Takes the an observation from the environment and returns the action to be taken next.
        If the policy is implemented by a neural network, this corresponds to a forward (inference) pass.

        # Argument
            observation (object): The current observation from the environment.

        # Returns
            The next action to be executed in the environment.
        """
        raise NotImplementedError()

    def backward(self, reward, nextobservation, terminal):
        """Updates the agent after having executed the action returned by `forward`.
        If the policy is implemented by a neural network, this corresponds to a weight update using back-prop.

        # Argument
            reward (float): The observed reward after executing the action returned by `forward`.
            nextobservation (object): Presumably the observation that resulted from the action;
                `test` passes the latest observation here. NOTE(review): `fit` and
                `fit_parallel` call `backward` with different argument lists - confirm the
                signature expected by the concrete agents.
            terminal (boolean): `True` if the new state of the environment is terminal.
        """
        raise NotImplementedError()

    def compile(self, optimizer, metrics=[]):
        """Compiles an agent and the underlaying models to be used for training and testing.

        # Arguments
            optimizer (`keras.optimizers.Optimizer` instance): The optimizer to be used during training.
            metrics (list of functions `lambda y_true, y_pred: metric`): The metrics to run during training.
        """
        raise NotImplementedError()

    def load_weights(self, filepath):
        """Loads the weights of an agent from an HDF5 file.

        # Arguments
            filepath (str): The path to the HDF5 file.
        """
        raise NotImplementedError()

    def save_weights(self, filepath, overwrite=False):
        """Saves the weights of an agent as an HDF5 file.

        # Arguments
            filepath (str): The path to where the weights should be saved.
            overwrite (boolean): If `False` and `filepath` already exists, raises an error.
        """
        raise NotImplementedError()

    @property
    def layers(self):
        """Returns all layers of the underlying model(s).

        If the concrete implementation uses multiple internal models,
        this method returns them in a concatenated list.
        """
        raise NotImplementedError()

    @property
    def metrics_names(self):
        """The human-readable names of the agent's metrics. Must return as many names as there
        are metrics (see also `compile`).
        """
        return []

    def _on_train_begin(self):
        """Callback that is called before training begins.
        """
        pass

    def _on_train_end(self):
        """Callback that is called after training ends.
        """
        pass

    def _on_test_begin(self):
        """Callback that is called before testing begins.
        """
        pass

    def _on_test_end(self):
        """Callback that is called after testing ends.
        """
        pass
def fit(
    self,
    env,
    nb_steps,
    action_repetition=1,
    callbacks=None,
    verbose=1,
    visualize=False,
    nb_max_start_steps=0,
    start_step_policy=None,
    log_interval=10000,
    nb_max_episode_steps=None,
):
    """Trains the agent on `env` for `nb_steps` agent steps.

    # Arguments
        env: Environment to interact with. It must provide `reset()`, `step(action)` and
            `action_space.sample()`.
        nb_steps (integer): Number of training steps to perform.
        action_repetition (integer): Number of times each chosen action is sent to the
            environment before the agent picks a new one. Must be >= 1.
        callbacks (list): Extra callbacks. The list is copied, never modified in place.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`),
            larger than 1 for episode logging.
        visualize (boolean): If `True`, a `Visualizer` callback is attached.
        nb_max_start_steps (integer): Exclusive upper bound of the number of warm-up steps
            performed at the start of each episode; the count is drawn with
            `np.random.randint(nb_max_start_steps)`.
        start_step_policy (`lambda observation: action`): Policy used for the warm-up steps.
            If `None`, actions are sampled from `env.action_space`.
        log_interval (integer): Interval handed to `TrainIntervalLogger` if `verbose` = 1.
        nb_max_episode_steps (integer): Upper bound on the number of agent steps per episode.
            `None` (or 0) lets the episode run until the environment reports `done`.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            "Your tried to fit your agent but it hasn't been compiled yet. Please call `compile()` before `fit()`."
        )
    if action_repetition < 1:
        raise ValueError("action_repetition must be >= 1, is {}".format(action_repetition))

    self.training = True

    # Work on a private copy: appending to the argument would leak loggers into the caller's
    # list and, with a shared default list, into every subsequent call.
    callbacks = list(callbacks) if callbacks else []
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    callbacks = CallbackList(callbacks)
    callbacks._set_model(self)
    callbacks._set_env(env)
    callbacks._set_params({"nb_steps": nb_steps})
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = 0.0

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = env.reset()
                assert observation is not None

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    callbacks.on_action_begin(action)
                    observation, _, done, _ = env.step(action)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn(
                            "Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.".format(
                                nb_random_start_steps
                            )
                        )
                        observation = env.reset()
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is were all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            reward = 0.0
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, _ = env.step(action)
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                "action": action,
                "observation": observation,
                "reward": reward,
                "metrics": metrics,
                "episode": episode,
            }
            callbacks.on_step_end(episode_step, step_logs)

            episode_step += 1
            self.step += 1

            # `episode_step` now equals the number of steps taken, so `>=` ends the episode
            # after exactly `nb_max_episode_steps` steps (a strict `>` allowed one too many).
            if done or (nb_max_episode_steps and episode_step >= nb_max_episode_steps):
                # This episode is finished, report and reset.
                episode_logs = {
                    "episode_reward": episode_reward,
                    "nb_episode_steps": episode_step,
                    "nb_steps": self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={"did_abort": did_abort})
def fit(self,
        env,
        nb_steps,
        action_repetition=1,
        callbacks=None,
        verbose=1,
        visualize=False,
        nb_max_start_steps=0,
        start_step_policy=None,
        log_interval=10000,
        nb_max_episode_steps=None,
        starting_checkpoints=None,
        avarage_q=None):
    """Trains the agent on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
        visualize (boolean): If `True`, the environment is visualized during training. However,
            this is likely going to slow down training significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
            of each episode using `start_step_policy`. Notice that this is an exclusive upper limit:
            the exact number of steps is drawn with `np.random.randint(nb_max_start_steps)`, i.e.
            from [0, nb_max_start_steps), at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy
            to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
            automatically resetting the environment. Set to `None` if each episode should run
            (potentially indefinitely) until the environment signals a terminal state.
        starting_checkpoints ([string]): Starting checkpoint file names. Whenever the environment is
            reset, one checkpoint is drawn at random from the list and the environment is reset
            with `checkpoint='checkpoints/<name>'`. You can create the checkpoints using
            interactive_env.py. `None` or an empty list resets the environment normally.
        avarage_q (dictionary): Options for measuring the average Q after the end of each episode;
            the value is handed unchanged to `self.collect_avarage_q_checkpoints`. The metric will be
            added to the log as described in "Playing Atari with Deep Reinforcement Learning".
            The start of the training may be delayed as it takes some time to choose the evaluation
            states. You can either provide the two following options or a `True` boolean for
            using the defaults:
                n_evaluations (integer): number of checkpoints to be evaluated and averaged (default: 10).
                bernoulli (float): Bernoulli parameter. On success, the step is chosen as a
                    checkpoint. The smaller this number, the longer it takes to select the
                    checkpoints (default: 0.1).

    # Returns
        A `keras.callbacks.History` instance that recorded the entire training process.

    # Raises
        RuntimeError: If the agent has not been compiled.
        ValueError: If `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(
            action_repetition))

    # A fresh list per call instead of one default list shared by every call.
    if starting_checkpoints is None:
        starting_checkpoints = []

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    # Support both the public and the underscore-prefixed callback-list API.
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    def reset_env(checkpoint_name):
        # Reset `env`, from the given checkpoint when one was drawn for this episode.
        if checkpoint_name is None:
            return deepcopy(env.reset())
        return deepcopy(
            env.reset(checkpoint='checkpoints/{}'.format(checkpoint_name)))

    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    checkpoint = None
    did_abort = False
    try:
        self.collect_avarage_q_checkpoints(env, avarage_q,
                                           starting_checkpoints)
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = 0.

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                if starting_checkpoints:
                    checkpoint = np.random.choice(starting_checkpoints)
                else:
                    checkpoint = None
                observation = reset_env(checkpoint)
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                assert observation is not None

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                    nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(
                            observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn(
                            'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                            .format(nb_random_start_steps))
                        # Restart from the same checkpoint so that the episode really
                        # begins at the checkpoint reported in the episode logs.
                        observation = reset_env(checkpoint)
                        if self.processor is not None:
                            observation = self.processor.process_observation(
                                observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is were all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            # Per-step `info` values are deliberately not accumulated in this variant, so
            # the logged 'info' entry stays empty.
            accumulated_info = {}
            done = False
            # NOTE(EZE): action repetition adds needless complexity here; frame skipping
            # is implemented in the emulator.
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                # The environment's `info` dict must carry a "global_score" entry.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                    'global_score': info["global_score"]
                }
                if self.memory.is_prioritized():
                    episode_logs['max_error_PER'] = self.memory.maximum
                    episode_logs['average_error_PER'] = self.memory.average
                    self.memory.reset_metrics()
                if starting_checkpoints:
                    episode_logs['checkpoint'] = checkpoint
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=2000000, nb_max_episode_steps=None, nb_episodes=10000):
    """Train the agent on `env`, performing one environment step per iteration.

    Unlike the generic training loop, the observation fed to `forward` is
    read from `env.env.getState()` before every step, and `env` itself is
    passed to `forward` as a second argument.

    # Arguments
        env: Environment wrapper. Must provide `step`, `reset` and an inner
            `env` attribute exposing `getState()`.
        nb_steps (integer): Total number of training steps to perform.
        callbacks (list): Extra callbacks; the list is copied, not mutated.
        verbose (integer): 0 for no logging, 1 for interval logging
            (see `log_interval`), >1 for episode logging.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        log_interval (integer): Interval passed to `TrainIntervalLogger`
            when `verbose` is 1.
        action_repetition, nb_max_start_steps, start_step_policy,
        nb_max_episode_steps, nb_episodes: accepted for interface
            compatibility; this implementation does not read them.

    # Returns
        The `History` callback instance that recorded the run.
    """
    self.training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    episode_reward = 0
    episode_step = 0
    did_abort = False
    # True while an episode is running, so `on_episode_begin` fires once per
    # episode rather than once per step.
    episode_started = False

    # NOTE(review): `load_weight` is not a parameter or local of this method;
    # presumably a module-level flag -- confirm it is defined before use.
    if load_weight:
        self.load_weights(file_path="")
    # `self.training` was set to True above, so the training epsilon applies.
    self.epsilon = self.startE

    try:
        while self.step < nb_steps:
            if not episode_started:
                callbacks.on_episode_begin(episode)
                episode_started = True

            # Read the current state straight from the wrapped emulator.
            observation = env.env.getState()
            if self.processor is not None:
                observation = self.processor.process_observation(observation)
            assert observation is not None

            callbacks.on_step_begin(episode_step)
            # Forward step: pick an action; backward step: learn from reward.
            action = self.forward(observation, env)
            reward = 0.
            # No per-key info accumulation is done here; the step log always
            # carries an empty dict under 'info'.
            accumulated_info = {}
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(
                    observation, r, done, info)
            callbacks.on_action_end(action)
            reward += r

            metrics = self.backward(reward, terminal=done)
            episode_reward += reward
            # Parenthesised form prints identically on Python 2 and Python 3.
            print('reward: ' + str(reward))

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)
                episode_step = 0
                episode_reward = 0
                episode += 1
                episode_started = False
                env.reset()
                # Checkpoint the weights every 10 finished episodes.
                if np.mod(episode, 10) == 0 and self.training:
                    self.save_weights(file_path="", overwrite=True)
    except KeyboardInterrupt:
        # Catch keyboard interrupts so training can be aborted safely while
        # still running the `on_train_end` hooks below.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def _wrap_callbacks(self, env, callbacks, params):
    """Append a `History`, wrap everything in a `CallbackList` and wire it up.

    Returns the tuple `(callback_list, history)`.
    """
    history = History()
    callbacks = callbacks + [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    return callbacks, history


def _reset_env(self, env):
    """Reset `env` and return a processed deep copy of its observations."""
    observations = deepcopy(env.reset())
    if self.processor is not None:
        observations = [self.processor.process_observation(observation)
                        for observation in observations]
    return observations


def _forward_all_agents(self, observations):
    """Run both agents' forward pass and return the joint action list.

    Agent 0 acts on observations 2..5 and agent 1 on observations 0..1, so
    the returned list holds agent 0's four actions followed by agent 1's two.
    NOTE(review): presumably agent 0 is the hider team and agent 1 the seeker
    team (matching the reward order used below) -- confirm against the env.
    """
    actions = [self.dqagents[0].forward(observations[i]) for i in range(2, 6)]
    actions += [self.dqagents[1].forward(observations[i]) for i in range(0, 2)]
    return actions


def _perform_random_starts(self, env, callbacks, observations,
                           nb_max_start_steps, start_step_policy):
    """Take a random number of unrecorded warm-up steps; return observations.

    The step count is drawn with `np.random.randint(nb_max_start_steps)`
    (upper bound exclusive) and is 0 when `nb_max_start_steps` is 0.
    """
    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
        nb_max_start_steps)
    for _ in range(nb_random_start_steps):
        if start_step_policy is None:
            actions = env.action_space.sample()
        else:
            actions = start_step_policy(observations)
        if self.processor is not None:
            # NOTE(review): the sampled joint action is processed as a whole
            # here, whereas regular steps process each action separately.
            actions = self.processor.process_action(actions)
        callbacks.on_action_begin(actions)
        observations, rewards, done, info = env.step(actions)
        observations = deepcopy(observations)
        if self.processor is not None:
            observations, rewards, done, info = self.processor.process_step(
                observations, rewards, done, info)
        callbacks.on_action_end(actions)
        if done:
            warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(
                nb_random_start_steps))
            observations = self._reset_env(env)
            break
    return observations


def _step_with_repetition(self, env, callbacks, actions, action_repetition):
    """Apply `actions` up to `action_repetition` times, stopping early on done.

    Returns `(observations, hider_reward, seeker_reward, rewards,
    accumulated_info, done)` where `rewards` is a float32 array holding the
    summed per-team rewards and the two scalars are its components.
    """
    rewards = np.zeros(2, dtype=np.float32)
    hider_reward = 0.
    seeker_reward = 0.
    accumulated_info = {}
    done = False
    for _ in range(action_repetition):
        callbacks.on_action_begin(actions)
        # Expect rs[0] to be the aggregate hider reward and rs[1] the
        # aggregate seeker reward.
        observations, rs, done, info = env.step(actions)
        observations = deepcopy(observations)
        if self.processor is not None:
            observations, rs, done, info = self.processor.process_step(
                observations, rs, done, info)
        for key, value in info.items():
            if not np.isreal(value):
                continue
            if key not in accumulated_info:
                accumulated_info[key] = np.zeros_like(value)
            accumulated_info[key] += value
        callbacks.on_action_end(actions)
        hider_reward += rs[0]
        seeker_reward += rs[1]
        rewards += np.asarray(rs, dtype=np.float32)
        if done:
            break
    return observations, hider_reward, seeker_reward, rewards, accumulated_info, done


def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=10000, nb_max_episode_steps=None):
    """Train the two agents in `self.dqagents` jointly on `env`.

    # Arguments
        env: Multi-agent environment; `reset` returns a sequence of
            observations and `step` takes the joint action list.
        nb_steps (integer): Number of training steps to perform.
        action_repetition (integer): How many times each joint action is
            repeated before the agents observe again. Must be >= 1.
        callbacks (list): Extra callbacks; the list is copied, not mutated.
        verbose (integer): 0 for no logging, 1 for interval logging
            (see `log_interval`), >1 for episode logging.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_start_steps (integer): Exclusive upper bound on the number of
            unrecorded warm-up steps taken at the start of each episode.
        start_step_policy (callable): Maps observations to warm-up actions;
            if `None`, `env.action_space.sample()` is used.
        log_interval (integer): Interval for `TrainIntervalLogger`.
        nb_max_episode_steps (integer): If set, episodes are forced to
            terminate after this many steps.

    # Returns
        The `History` callback instance that recorded the run.

    # Raises
        RuntimeError: if any agent has not been compiled.
        ValueError: if `action_repetition` is smaller than 1.
    """
    for dqagent in self.dqagents:
        if not dqagent.compiled:
            raise RuntimeError(
                'Your tried to fit your agents but one hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError(
            'action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    callbacks, history = self._wrap_callbacks(env, callbacks, {'nb_steps': nb_steps})
    self._on_train_begin()
    callbacks.on_train_begin()

    # Plain Python ints: fixed-width NumPy counters such as int16 wrap around
    # after 32767 increments under NumPy 2 promotion rules.
    episode = 0
    self.step = 0
    observations = None  # None marks "no episode in progress"
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observations is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = np.zeros(2, dtype=np.float32)

                # Obtain the initial observations by resetting the environment.
                self.dqagents[0].reset_states()
                self.dqagents[1].reset_states()
                observations = self._reset_env(env)
                assert observations is not None and len(observations) > 0

                # Random starts are not recorded into the experience; they only
                # vary the start position between games.
                observations = self._perform_random_starts(
                    env, callbacks, observations, nb_max_start_steps,
                    start_step_policy)

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observations is not None and len(observations) > 0

            # Run a single step: forward pass for every agent, then step the
            # environment, then let each agent learn from its own reward.
            callbacks.on_step_begin(episode_step)
            actions = self._forward_all_agents(observations)
            if self.processor is not None:
                actions = [self.processor.process_action(action)
                           for action in actions]

            (observations, hider_reward, seeker_reward, rewards,
             accumulated_info, done) = self._step_with_repetition(
                env, callbacks, actions, action_repetition)
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True

            hider_metrics = self.dqagents[0].backward(hider_reward, terminal=done)
            seeker_metrics = self.dqagents[1].backward(seeker_reward, terminal=done)
            episode_reward += rewards

            # NOTE(review): these keys differ from the single-agent step log
            # ('reward', 'metrics'); confirm the loggers in use expect them.
            step_logs = {
                'actions': actions,
                'observations': observations,
                'hider_reward': hider_reward,
                'hider_metrics': hider_metrics,
                'seeker_reward': seeker_reward,
                'seeker_metrics': seeker_metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # The agents have not yet seen the terminal state. Perform one
                # more forward-backward call and ignore the actions. Pass
                # `terminal=False` because the *next* state (the freshly reset
                # environment) is non-terminal by convention.
                self._forward_all_agents(observations)
                self.dqagents[0].backward(0., terminal=False)
                self.dqagents[1].backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observations = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # Catch keyboard interrupts so training can be aborted safely while
        # still running the `on_train_end` hooks below.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history


def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None,
         visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0,
         start_step_policy=None, verbose=1):
    """Run the two agents in `self.dqagents` on `env` for whole episodes.

    # Arguments
        env: Multi-agent environment; `reset` returns a sequence of
            observations and `step` takes the joint action list.
        nb_episodes (integer): Number of episodes to perform.
        action_repetition (integer): How many times each joint action is
            repeated before the agents observe again. Must be >= 1.
        callbacks (list): Extra callbacks; the list is copied, not mutated.
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_episode_steps (integer): If set, episodes are forced to
            terminate after this many steps.
        nb_max_start_steps (integer): Exclusive upper bound on the number of
            warm-up steps taken at the start of each episode.
        start_step_policy (callable): Maps observations to warm-up actions;
            if `None`, `env.action_space.sample()` is used.
        verbose (integer): 0 for no logging, >= 1 adds a `TestLogger`.

    # Returns
        The `History` callback instance that recorded the run.

    # Raises
        RuntimeError: if any agent has not been compiled.
        ValueError: if `action_repetition` is smaller than 1.
    """
    for dqagent in self.dqagents:
        if not dqagent.compiled:
            raise RuntimeError(
                'Your tried to test your agents but one hasn\'t been compiled yet. Please call `compile()` before `test()`.')
    if action_repetition < 1:
        raise ValueError(
            'action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]
    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    callbacks, history = self._wrap_callbacks(
        env, callbacks, {'nb_episodes': nb_episodes})

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        # Arrays, not lists: `+=` must add element-wise instead of extending.
        episode_reward = np.zeros(2, dtype=np.float32)
        episode_step = 0

        # Obtain the initial observations by resetting the environment.
        self.dqagents[0].reset_states()
        self.dqagents[1].reset_states()
        observations = self._reset_env(env)
        assert observations is not None and len(observations) > 0

        # Random starts only vary the start position between games; with the
        # default arguments no warm-up step is taken.
        observations = self._perform_random_starts(
            env, callbacks, observations, nb_max_start_steps, start_step_policy)

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            actions = self._forward_all_agents(observations)
            if self.processor is not None:
                actions = [self.processor.process_action(action)
                           for action in actions]

            (observations, hider_reward, seeker_reward, rewards,
             accumulated_info, done) = self._step_with_repetition(
                env, callbacks, actions, action_repetition)
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True

            self.dqagents[0].backward(hider_reward, terminal=done)
            self.dqagents[1].backward(seeker_reward, terminal=done)
            episode_reward += rewards

            step_logs = {
                'action': actions,
                'observation': observations,
                'rewards': rewards,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # The agents have not yet seen the terminal state. Perform one more
        # forward-backward call and ignore the actions. Pass `terminal=False`
        # because the *next* state (the freshly reset environment) is
        # non-terminal by convention.
        self._forward_all_agents(observations)
        self.dqagents[0].backward(0., terminal=False)
        self.dqagents[1].backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history


def _on_train_begin(self):
    """Hook that is called before training begins."""
    pass


def _on_train_end(self):
    """Hook that is called after training ends."""
    pass


def _on_test_begin(self):
    """Hook that is called before testing begins."""
    pass


def _on_test_end(self):
    """Hook that is called after testing ends."""
    pass


class MultiProcessor(object):
    """Abstract base class for implementing multi-agent processors.

    A processor acts as a coupling mechanism between an agent and its
    environment. This can be necessary if the agent has different
    requirements with respect to the form of the observations, actions, and
    rewards of the environment. By implementing a custom processor, you can
    translate between the two without changing the underlying implementation
    of the agent or environment.

    Every method here is an identity transform; override the ones you need.
    """

    def process_step(self, observations, rewards, done, info):
        """Process an entire step: each observation, each reward, and the info.

        # Arguments
            observations (iterable): Observations as obtained by the environment.
            rewards (iterable): Rewards as obtained by the environment.
            done (boolean): `True` if the environment is in a terminal state,
                `False` otherwise. Passed through unchanged.
            info (dict): The debug info dictionary as obtained by the environment.

        # Returns
            The tuple (observations, rewards, done, info) with observations
            and rewards returned as lists of processed elements.
        """
        observations = [self.process_observation(observation)
                        for observation in observations]
        rewards = [self.process_reward(reward) for reward in rewards]
        info = self.process_info(info)
        return observations, rewards, done, info

    def process_observation(self, observation):
        """Process a single observation from the environment and return it.

        # Arguments
            observation (object): An observation as obtained by the environment

        # Returns
            The processed observation
        """
        return observation

    def process_reward(self, reward):
        """Process a single reward from the environment and return it.

        # Arguments
            reward (float): A reward as obtained by the environment

        # Returns
            The processed reward
        """
        return reward

    def process_info(self, info):
        """Process the info dictionary from the environment and return it.

        # Arguments
            info (dict): An info as obtained by the environment

        # Returns
            The processed info
        """
        return info

    def process_action(self, action):
        """Process an action predicted by an agent before it is executed.

        # Arguments
            action (int): Action given to the environment

        # Returns
            The processed action given to the environment
        """
        return action

    def process_state_batch(self, batch):
        """Process an entire batch of states and return it.

        # Arguments
            batch (list): List of states

        # Returns
            The processed list of states
        """
        return batch

    @property
    def metrics(self):
        """The metrics of the processor, which will be reported during training.

        # Returns
            List of `lambda y_true, y_pred: metric` functions.
        """
        return []

    @property
    def metrics_names(self):
        """The human-readable names of the processor's metrics.

        Must return as many names as there are metrics.
        """
        return []
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=10000, nb_max_episode_steps=None,
        episode_averaging_length=10, success_threshold=None,
        stopping_patience=None, min_nb_steps=500, single_cycle=True):
    """Train the agent on `env` with built-in early stopping.

    # Arguments
        env: Environment the agent interacts with. If it has a
            `legal_actions` attribute, that list is passed to `forward`.
            When `single_cycle` is false it must also expose `lifetime`.
        nb_steps (integer): Maximum number of training steps.
        action_repetition (integer): How many times each action is repeated
            before the agent observes again. Must be >= 1.
        callbacks (list): Extra callbacks; the list is copied, not mutated.
            If one is a `FileLogger`, the best weights are saved as
            `dqn_weights.h5f` next to its log file.
        verbose (integer): 0 for no logging, 1 for interval logging,
            >1 for episode logging (both use `log_interval`).
        visualize (boolean): If `True`, a `Visualizer` callback is added.
        nb_max_start_steps (integer): Exclusive upper bound on the number of
            unrecorded warm-up steps taken at the start of each episode.
        start_step_policy (callable): Maps an observation to a warm-up
            action; if `None`, `env.action_space.sample()` is used.
        log_interval (integer): Interval passed to the logger callbacks.
        nb_max_episode_steps (integer): If set, episodes are forced to
            terminate after this many steps.
        episode_averaging_length (integer): Window size of the rolling
            average (lifetimes) or rolling win fraction.
        success_threshold (number): Training stops once the rolling value
            exceeds this. `None` disables the check.
        stopping_patience (integer): Training stops once more than
            `min_nb_steps` steps were taken and the rolling value has not
            improved for more than this many episodes. `None` disables it.
        min_nb_steps (integer): Steps required before patience-based stopping
            and weight saving take effect.
        single_cycle (boolean): If `True`, track the fraction of recent
            episodes whose total reward equals 1; otherwise track the rolling
            average of `env.lifetime`.

    # Returns
        The `History` callback instance that recorded the run.

    # Raises
        RuntimeError: if the agent has not been compiled.
        ValueError: if `action_repetition` is smaller than 1.
    """
    if not self.compiled:
        raise RuntimeError(
            'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.'
        )
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(
            action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]

    # Derive the weights path from a FileLogger callback, if one was passed.
    # It stays None otherwise, in which case no weights are saved.
    weights_file = None
    for cb in callbacks:
        if isinstance(cb, FileLogger):
            save_path = cb.filepath
            folder_index = save_path.find("training_history.json")
            if folder_index >= 0:
                folder = save_path[:folder_index]
            else:
                # Log file has a different name: fall back to its directory.
                folder = os.path.dirname(save_path)
            weights_file = os.path.join(folder, "dqn_weights.h5f")

    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger(interval=log_interval)]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    # Plain Python ints: fixed-width NumPy counters such as int16 wrap around
    # after 32767 increments under NumPy 2 promotion rules.
    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False

    # ------ Early stopping and reporting averages ------------------
    # Early stopping lives in this method rather than in a callback because
    # the training loop needs the stop flag directly.
    best_rolling_avg = 0
    best_episode = 0
    time_since_best = 0
    if not single_cycle:
        recent_episode_lifetimes = deque([], episode_averaging_length)
        episode_lifetimes_rolling_avg = 0
    else:
        recent_episode_wins = deque([], episode_averaging_length)
        rolling_win_fraction = 0

    stop_training = False
    has_succeeded = False
    stopped_improving = False

    try:
        while self.step < nb_steps and not stop_training:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = np.float32(0)

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(
                        observation)
                assert observation is not None

                # Random starts are not recorded into the experience; they only
                # vary the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(
                    nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(
                            observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn(
                            'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'
                            .format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(
                                observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step: forward pass picks the action, backward pass
            # learns from the resulting reward.
            callbacks.on_step_begin(episode_step)
            if hasattr(env, "legal_actions"):
                legal_actions = list(env.legal_actions)
                action = self.forward(observation, legal_actions)
            else:
                action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(
                        observation, r, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # The agent has not yet seen the terminal state. Perform one
                # more forward-backward call and ignore the action. Pass
                # `terminal=False` because the *next* state (the freshly reset
                # environment) is non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # Update the rolling statistic that drives early stopping.
                if not single_cycle:
                    recent_episode_lifetimes.append(env.lifetime)
                    episode_lifetimes_rolling_avg = np.mean(
                        recent_episode_lifetimes)
                    rolling_value = episode_lifetimes_rolling_avg
                else:
                    recent_episode_wins.append(1 if episode_reward == 1 else 0)
                    # Divide by the full window length (not the number of
                    # episodes seen so far); float() avoids integer division.
                    rolling_win_fraction = (
                        sum(recent_episode_wins) / float(episode_averaging_length))
                    rolling_value = rolling_win_fraction

                if rolling_value > best_rolling_avg:
                    best_rolling_avg = rolling_value
                    best_episode = episode
                    time_since_best = 0
                    # Only the win-fraction mode checkpoints the best network,
                    # and only when a weights path could be derived.
                    if (single_cycle and self.step > min_nb_steps
                            and weights_file is not None):
                        self.save_weights(weights_file, overwrite=True)
                else:
                    time_since_best = episode - best_episode

                # A threshold or patience of None disables that stop criterion.
                if success_threshold is not None and rolling_value > success_threshold:
                    stop_training = True
                    has_succeeded = True
                if (stopping_patience is not None
                        and self.step > min_nb_steps
                        and time_since_best > stopping_patience):
                    stop_training = True
                    stopped_improving = True

                # This episode is finished, report and reset.
                if not single_cycle:
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                        'episode_lifetimes_rolling_avg': episode_lifetimes_rolling_avg,
                        'best_rolling_avg': best_rolling_avg,
                        'best_episode': best_episode,
                        'time_since_best': time_since_best,
                        'has_succeeded': has_succeeded,
                        'stopped_improving': stopped_improving
                    }
                else:
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                        'rolling_win_fraction': rolling_win_fraction,
                        'best_rolling_fraction': best_rolling_avg,
                        'best_episode': best_episode,
                        'time_since_best': time_since_best,
                        'has_succeeded': has_succeeded,
                        'stopped_improving': stopped_improving
                    }
                callbacks.on_episode_end(episode, episode_logs, single_cycle)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None

    except KeyboardInterrupt:
        # Catch keyboard interrupts so training can be aborted safely while
        # still running the `on_train_end` hooks below.
        did_abort = True

    train_end_logs = {
        'did_abort': did_abort,
        'has_succeeded': has_succeeded,
        'stopped_improving': stopped_improving,
    }
    if not single_cycle:
        train_end_logs['episode_lifetimes_rolling_avg'] = episode_lifetimes_rolling_avg
    else:
        train_end_logs['rolling_win_fraction'] = rolling_win_fraction
    train_end_logs['step'] = self.step
    callbacks.on_train_end(logs=train_end_logs, single_cycle=single_cycle)
    self._on_train_end()

    return history