def init_fit_parallel(self, nb_steps=10000, sampler_update_interval=500,
                      action_repetition=1, callbacks=None, verbose=1, visualize=False,
                      nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
                      nb_max_episode_steps=None):
    if not self.compiled:
        raise RuntimeError(
            'You tried to fit your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    self.training_callbacks = []  # if not self.training_callbacks else self.training_callbacks[:]
    if callbacks:
        self.training_callbacks += callbacks
    if verbose == 1:
        self.training_callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        self.training_callbacks += [TrainEpisodeLogger()]
    if visualize:
        self.training_callbacks += [Visualizer()]
    self.training_history = History()
    self.training_callbacks += [self.training_history]
    self.training_callbacks = CallbackList(self.training_callbacks)
    if hasattr(self.training_callbacks, 'set_model'):
        self.training_callbacks.set_model(self)
    else:
        self.training_callbacks._set_model(self)
    # self.training_callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(self.training_callbacks, 'set_params'):
        self.training_callbacks.set_params(params)
    else:
        self.training_callbacks._set_params(params)

    self._on_train_begin()
    self.training_callbacks.on_train_begin()

    self.episode = 0
    self.episode_step = 0
    self.episode_reward = 0.
    self.step = 0
    self.episode_fit_calls = 0
    self.episode_backward_time = dt.timedelta()
    self.episode_n_backward_calls = 0
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=2000000, nb_max_episode_steps=None, nb_episodes=10000):
    self.training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    episode_reward = 0
    episode_step = 0
    did_abort = False

    # Note: `load_weight` is not defined in this function; it is expected to exist
    # in the surrounding scope (e.g. as an attribute or module-level flag).
    if load_weight:
        self.load_weights(file_path="")

    if self.training:
        self.epsilon = self.startE
    else:
        self.epsilon = self.evaluateE

    try:
        while self.step < nb_steps:
            callbacks.on_episode_begin(episode)
            # Obtain the initial observation by resetting the environment.
            observation = env.env.getState()
            if self.processor is not None:
                observation = self.processor.process_observation(observation)
            assert observation is not None
            assert episode_reward is not None
            assert episode_step is not None

            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation, env)
            reward = 0.
            accumulated_info = {}
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(observation, r, done, info)
            callbacks.on_action_end(action)
            reward += r

            metrics = self.backward(reward, terminal=done)
            episode_reward += reward
            print('reward: ' + str(reward))

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode_step = 0
                episode_reward = 0
                episode += 1
                env.reset()

                if np.mod(episode, 10) == 0 and self.training:
                    self.save_weights(file_path="", overwrite=True)
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=10000, nb_max_episode_steps=None):
    for dqagent in self.dqagents:
        if not dqagent.compiled:
            raise RuntimeError(
                'You tried to fit your agents but one hasn\'t been compiled yet. '
                'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.dqagents[0].training = True
    self.dqagents[1].training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    # callbacks.on_train_begin()

    episode = np.int16(0)
    self.step = np.int16(0)
    observations = []
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            # check if observations is empty
            if observations == []:
                # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = np.int16(0)
                episode_reward = np.float32([0, 0])

                # Obtain the initial observation by resetting the environment.
                self.dqagents[0].reset_states()
                self.dqagents[1].reset_states()
                observations = deepcopy(env.reset())
                if self.processor is not None:
                    # process all observations
                    observations = [self.processor.process_observation(observation)
                                    for observation in observations]
                assert observations != []

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                # can remove this bit, not gonna use any random starts
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        actions = env.action_space.sample()
                    else:
                        actions = start_step_policy(observations)
                    if self.processor is not None:
                        actions = self.processor.process_action(actions)
                    callbacks.on_action_begin(actions)
                    observations, rewards, done, info = env.step(actions)
                    observations = deepcopy(observations)
                    if self.processor is not None:
                        observations, rewards, done, info = self.processor.process_step(
                            observations, rewards, done, info)
                    callbacks.on_action_end(actions)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. '
                                      'You should probably lower the `nb_max_start_steps` parameter.'
                                      .format(nb_random_start_steps))
                        observations = deepcopy(env.reset())
                        if self.processor is not None:
                            observations = [self.processor.process_observation(observation)
                                            for observation in observations]
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observations != []

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            # given indices [0,3] are hider indices and [4,5] are seeker indices
            actions = []
            for i in range(2, 6):
                actions.append(self.dqagents[0].forward(observations[i]))
            for i in range(0, 2):
                actions.append(self.dqagents[1].forward(observations[i]))
            # process all actions
            if self.processor is not None:
                actions = [self.processor.process_action(action) for action in actions]
            rewards = np.float32([0, 0])
            hider_reward = np.float32(0)
            seeker_reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(actions)
                # expect rs[0] to be aggregate hider reward, rs[1] aggregate seeker reward
                observations, rs, done, info = env.step(actions)
                observations = deepcopy(observations)
                if self.processor is not None:
                    observations, rs, done, info = self.processor.process_step(
                        observations, rs, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(actions)
                hider_reward += rs[0]
                seeker_reward += rs[1]
                rewards += rs
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            # run backward step wrt each agent's respective aggregate reward
            hider_metrics = self.dqagents[0].backward(hider_reward, terminal=done)
            seeker_metrics = self.dqagents[1].backward(seeker_reward, terminal=done)
            episode_reward += rewards

            step_logs = {
                'actions': actions,
                'observations': observations,
                'hider_reward': hider_reward,
                'hider_metrics': hider_metrics,
                'seeker_reward': seeker_reward,
                'metrics': seeker_metrics,
                'reward': seeker_reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1
            self.dqagents[0].step += 1
            self.dqagents[1].step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                for i in range(2, 6):
                    self.dqagents[0].forward(observations[i])
                for i in range(0, 2):
                    self.dqagents[1].forward(observations[i])
                self.dqagents[0].backward(0., terminal=False)
                self.dqagents[1].backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                    'epoch': 1,
                }
                # callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observations = []
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    # callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def main(model_name, options): # Initialize maze environments. env = gym.make('Pong-v0') #env = gym.make('Taxi-v2') envs = [env] # Setting hyperparameters. nb_actions = env.action_space.n maze_dim = (6400, 1) h_size = 64 # For DQN e_t_size = 64 #For MQN / RMQN context_size = 64 nb_steps_warmup = int(1e5) nb_steps = int(4e5) buffer_size = 8e4 learning_rate = 0.003 target_model_update = 0.999 clipnorm = 10. switch_rate = 50 window_length = 12 memory_size = None # Callbacks log = TrainEpisodeLogger() #tensorboard = TensorBoard(log_dir="./logs/{}".format(model_name)) rl_tensorboard = RLTensorBoard(log_dir="./logs/{}".format(model_name), histogram_freq=100) callbacks = [log, rl_tensorboard] ### Models ### model = None target_model = None # MQN model. if "MQN" in options: memory_size = 12 model = MQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim) target_model = MQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim) # RMQN model. if "RMQN" in options: memory_size = 12 model = RMQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim) target_model = RMQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim) # Distributional MQN model. nb_atoms = 51 v_min = -2. v_max = 2. #model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions) #target_model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions) # DQN model if "DQN" in options: model = DQNmodel(nb_actions, window_length, h_size, maze_dim) target_model = DQNmodel(nb_actions, window_length, h_size, maze_dim) # Initialize our target model with the same weights as our model. target_model.set_weights(model.get_weights()) # Initialize memory buffer for DQN algorithm. experience = [ SequentialMemory(limit=int(buffer_size / len(envs)), window_length=window_length) for i in range(len(envs)) ] # Learning policy where we initially begin training our agent by making random moves # with a probability of 1, and linearly decrease that probability down to 0.1 over the # course of some arbitrary number of steps. (nb_steps) policy = LinearAnnealedPolicy(inner_policy=EpsGreedyQPolicy(), attr="eps", value_max=1.0, value_min=0.1, value_test=0., nb_steps=1e5) # Optional processor. processor = PongProcessor() # processor = MazeProcessor() # Initialize and compile the DQN agent. dqn = DQNAgent(model=model, target_model=target_model, nb_actions=nb_actions, memory=experience, nb_steps_warmup=nb_steps_warmup, target_model_update=target_model_update, policy=policy, processor=processor, batch_size=8) #Initialize experimental Distributional DQN Agent ''' dqn = DistributionalDQNAgent( model=model, target_model=target_model, num_atoms=nb_atoms, v_min=v_min, v_max=v_max, nb_actions=nb_actions, memory=experience, nb_steps_warmup=nb_steps_warmup, target_model_update=target_model_update, policy=policy, #processor=processor, batch_size=32 ) ''' # Compile the agent to check for validity, build tensorflow graph, etc. dqn.compile(RMSprop(lr=learning_rate, clipnorm=clipnorm), metrics=["mae"]) # Weights will be loaded if weight file exists. if os.path.exists("data/{}/{}".format(model_name, model_name + ".h5")): dqn.load_weights("data/{}/{}".format(model_name, model_name + ".h5")) # Train DQN in environment. 
if "train" in options: dqn.fit(env, nb_steps=nb_steps, verbose=0, callbacks=callbacks) # Visualization / Logging Tools logmetrics(log, model_name) logHyperparameters(model_name, e_t_size=e_t_size, context_size=context_size, h_size=h_size, memory_size=memory_size, learning_rate=learning_rate, target_model_update=target_model_update, clipnorm=clipnorm, window_length=window_length, nb_atoms=nb_atoms, v_min=v_min, v_max=v_max) # Save weights. dqn.save_weights("data/{}/{}".format(model_name, model_name + ".h5")) # Test DQN in environment. if "test" in options: dqn.test(env, nb_episodes=100, visualize=True) #Debugging if "debug" in options: observation = env.reset() outputLayer(dqn.model, np.array(experience[0].sample(32)[0].state0)) #visualizeLayer(dqn.model, dqn.layers[1], observation) return
               nb_actions=nb_actions, policy=policy, memory=memory, processor=processor,
               enable_double_dqn=True, enable_dueling_network=True,
               nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1., n_step=3,
               custom_model_objects={"NoisyNetDense": NoisyNetDense})
dqn.compile(Adam(lr=.00025 / 4), metrics=['mae'])

folder_path = 'models/NoisyNSteps/'
weights_filename = folder_path + 'final_noisynet_nstep_pdd_dqn_MsPacmanDeterministic-v4_weights.h5f'
checkpoint_weights_filename = folder_path + 'final_noisynet_nstep_dqn_MsPacmanDeterministic-v4_weights_{step}.h5f'
log_filename = folder_path + 'final_noisynet_nstep_dqn_MsPacmanDeterministic-v4_REWARD_DATA.txt'
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=5000000)]
callbacks += [TrainEpisodeLogger(log_filename)]
dqn.fit(env, callbacks=callbacks, nb_steps=30000000, verbose=0, nb_max_episode_steps=20000)
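# After a long fit() run like the one above, it is common to persist the final weights and run a
# short evaluation. The two lines below are an assumed follow-up sketch, not part of the original
# script; they only reuse `dqn`, `env`, and `weights_filename` from above together with the
# standard keras-rl `save_weights()` / `test()` calls.
dqn.save_weights(weights_filename, overwrite=True)
dqn.test(env, nb_episodes=10, visualize=False, nb_max_episode_steps=20000)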
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=10000, nb_max_episode_steps=None):
    for dqagent in self.dqagents:
        if not dqagent.compiled:
            raise RuntimeError(
                'You tried to fit your agents but one hasn\'t been compiled yet. '
                'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = np.int16(0)
    self.step = np.int16(0)
    observations = []
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            # check if observations is empty
            if observations == []:
                # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = np.int16(0)
                episode_reward = np.float32([0, 0])

                # Obtain the initial observation by resetting the environment.
                self.dqagents[0].reset_states()
                self.dqagents[1].reset_states()
                observations = deepcopy(env.reset())
                if self.processor is not None:
                    # process all observations
                    observations = [self.processor.process_observation(observation)
                                    for observation in observations]
                assert observations != []

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                # can remove this bit, not gonna use any random starts
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        actions = env.action_space.sample()
                    else:
                        actions = start_step_policy(observations)
                    if self.processor is not None:
                        actions = self.processor.process_action(actions)
                    callbacks.on_action_begin(actions)
                    observations, rewards, done, info = env.step(actions)
                    observations = deepcopy(observations)
                    if self.processor is not None:
                        observations, rewards, done, info = self.processor.process_step(
                            observations, rewards, done, info)
                    callbacks.on_action_end(actions)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. '
                                      'You should probably lower the `nb_max_start_steps` parameter.'
                                      .format(nb_random_start_steps))
                        observations = deepcopy(env.reset())
                        if self.processor is not None:
                            observations = [self.processor.process_observation(observation)
                                            for observation in observations]
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observations != []

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            # given indices [0,3] are hider indices and [4,5] are seeker indices
            actions = []
            for i in range(2, 6):
                actions.append(self.dqagents[0].forward(observations[i]))
            for i in range(0, 2):
                actions.append(self.dqagents[1].forward(observations[i]))
            # process all actions
            if self.processor is not None:
                actions = [self.processor.process_action(action) for action in actions]
            rewards = np.float32([0, 0])
            hider_reward = np.float32(0)
            seeker_reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(actions)
                # expect rs[0] to be aggregate hider reward, rs[1] aggregate seeker reward
                observations, rs, done, info = env.step(actions)
                observations = deepcopy(observations)
                if self.processor is not None:
                    observations, rs, done, info = self.processor.process_step(
                        observations, rs, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(actions)
                hider_reward += rs[0]
                seeker_reward += rs[1]
                rewards += rs
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            # run backward step wrt each agent's respective aggregate reward
            hider_metrics = self.dqagents[0].backward(hider_reward, terminal=done)
            seeker_metrics = self.dqagents[1].backward(seeker_reward, terminal=done)
            episode_reward += rewards

            step_logs = {
                'actions': actions,
                'observations': observations,
                'hider_reward': hider_reward,
                'hider_metrics': hider_metrics,
                'seeker_reward': seeker_reward,
                'seeker_metrics': seeker_metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                for i in range(2, 6):
                    self.dqagents[0].forward(observations[i])
                for i in range(0, 2):
                    self.dqagents[1].forward(observations[i])
                self.dqagents[0].backward(0., terminal=False)
                self.dqagents[1].backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observations = []
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history

def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
    """Tests the agents on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_episodes (integer): Number of episodes to perform.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
        visualize (boolean): If `True`, the environment is visualized during training. However,
            this is likely going to slow down training significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
            of each episode using `start_step_policy`. Notice that this is an upper limit since
            the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
            at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy
            to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
            automatically resetting the environment. Set to `None` if each episode should run
            (potentially indefinitely) until the environment signals a terminal state.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire training process.
    """
    for dqagent in self.dqagents:
        if not dqagent.compiled:
            raise RuntimeError(
                'You tried to fit your agents but one hasn\'t been compiled yet. '
                'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]
    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = np.float32([0, 0])
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.dqagents[0].reset_states()
        self.dqagents[1].reset_states()
        observations = deepcopy(env.reset())
        if self.processor is not None:
            observations = [self.processor.process_observation(observation)
                            for observation in observations]
        assert observations != []

        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        # this is never executed with default args.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observations)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observations, rs, done, info = env.step(action)
            observations = deepcopy(observations)
            if self.processor is not None:
                observations, rs, done, info = self.processor.process_step(
                    observations, rs, done, info)
            callbacks.on_action_end(action)
            if done:
                warnings.warn('Env ended before {} random steps could be performed at the start. '
                              'You should probably lower the `nb_max_start_steps` parameter.'
                              .format(nb_random_start_steps))
                observations = deepcopy(env.reset())
                if self.processor is not None:
                    observations = [self.processor.process_observation(observation)
                                    for observation in observations]
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            actions = []
            for i in range(2, 6):
                actions.append(self.dqagents[0].forward(observations[i]))
            for i in range(0, 2):
                actions.append(self.dqagents[1].forward(observations[i]))
            if self.processor is not None:
                actions = [self.processor.process_action(action) for action in actions]
            rewards = np.float32([0., 0.])
            hider_reward = np.float32(0)
            seeker_reward = np.float32(0)
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(actions)
                observations, rs, d, info = env.step(actions)
                observations = deepcopy(observations)
                if self.processor is not None:
                    observations, rs, d, info = self.processor.process_step(
                        observations, rs, d, info)
                callbacks.on_action_end(actions)
                hider_reward += rs[0]
                seeker_reward += rs[1]
                rewards += rs

                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                if d:
                    done = True
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.dqagents[0].backward(hider_reward, terminal=done)
            self.dqagents[1].backward(seeker_reward, terminal=done)
            episode_reward += rewards

            step_logs = {
                'action': actions,
                'observation': observations,
                'rewards': rewards,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # We are in a terminal state but the agent hasn't yet seen it. We therefore
        # perform one more forward-backward call and simply ignore the action before
        # resetting the environment. We need to pass in `terminal=False` here since
        # the *next* state, that is the state of the newly reset environment, is
        # always non-terminal by convention.
        for i in range(2, 6):
            self.dqagents[0].forward(observations[i])
        for i in range(0, 2):
            self.dqagents[1].forward(observations[i])
        self.dqagents[0].backward(0., terminal=False)
        self.dqagents[1].backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history

def _on_train_begin(self):
    """Callback that is called before training begins."""
    pass

def _on_train_end(self):
    """Callback that is called after training ends."""
    pass

def _on_test_begin(self):
    """Callback that is called before testing begins."""
    pass

def _on_test_end(self):
    """Callback that is called after testing ends."""
    pass


class MultiProcessor(object):
    """Abstract base class for implementing processors.

    A processor acts as a coupling mechanism between an `Agent` and its `Env`. This can
    be necessary if your agent has different requirements with respect to the form of the
    observations, actions, and rewards of the environment. By implementing a custom processor,
    you can effectively translate between the two without having to change the underlying
    implementation of the agent or environment.

    Do not use this abstract base class directly but instead use one of the concrete implementations
    or write your own.
    """

    def process_step(self, observations, rewards, done, info):
        """Processes an entire step by applying the processor to the observation, reward, and info arguments.

        # Arguments
            observation (object): An observation as obtained by the environment.
            reward (float): A reward as obtained by the environment.
            done (boolean): `True` if the environment is in a terminal state, `False` otherwise.
            info (dict): The debug info dictionary as obtained by the environment.

        # Returns
            The tuple (observations, rewards, done, info) with all elements after being processed.
        """
        observations = [self.process_observation(observation) for observation in observations]
        rewards = [self.process_reward(reward) for reward in rewards]
        info = self.process_info(info)
        return observations, rewards, done, info

    def process_observation(self, observation):
        """Processes the observation as obtained from the environment for use in an agent and returns it.

        # Arguments
            observation (object): An observation as obtained by the environment

        # Returns
            Observation obtained by the environment processed
        """
        return observation

    def process_reward(self, reward):
        """Processes the reward as obtained from the environment for use in an agent and returns it.

        # Arguments
            reward (float): A reward as obtained by the environment

        # Returns
            Reward obtained by the environment processed
        """
        return reward

    def process_info(self, info):
        """Processes the info as obtained from the environment for use in an agent and returns it.

        # Arguments
            info (dict): An info as obtained by the environment

        # Returns
            Info obtained by the environment processed
        """
        return info

    def process_action(self, action):
        """Processes an action predicted by an agent but before execution in an environment.

        # Arguments
            action (int): Action given to the environment

        # Returns
            Processed action given to the environment
        """
        return action

    def process_state_batch(self, batch):
        """Processes an entire batch of states and returns it.

        # Arguments
            batch (list): List of states

        # Returns
            Processed list of states
        """
        return batch

    @property
    def metrics(self):
        """The metrics of the processor, which will be reported during training.

        # Returns
            List of `lambda y_true, y_pred: metric` functions.
        """
        return []

    @property
    def metrics_names(self):
        """The human-readable names of the agent's metrics. Must return as many names as there
        are metrics (see also `compile`).
        """
        return []
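# Usage illustration (not part of the original code): a concrete processor subclasses
# MultiProcessor and overrides only the hooks it needs. The class below is hypothetical; it casts
# each agent's observation to float32 and clips each agent's reward to [-1, 1], relying on
# MultiProcessor.process_step() above to apply both hooks to every element of the per-agent lists.
class ClippedMultiProcessor(MultiProcessor):
    def process_observation(self, observation):
        # Called once per agent observation by process_step().
        return np.asarray(observation, dtype=np.float32)

    def process_reward(self, reward):
        # Keep per-agent rewards on a comparable scale.
        return float(np.clip(reward, -1., 1.))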
def main(weights_file, options): #Initialize maze environments. env = gym.make('IMaze3-v0') #env = gym.make('Taxi-v2') envs = [env] #Setting hyperparameters. nb_actions = env.action_space.n obs_dimensions = env.observation_space.n e_t_size = 48 context_size = 48 nb_steps_warmup = int(5e4) nb_steps = int(4e6) buffer_size = 5e4 learning_rate = 1e-3 target_model_update = 0.999 clipnorm = 10. switch_rate = 50 window_length = 12 #Callbacks log = TrainEpisodeLogger() callbacks = [log] #MQN model. #model = MQNmodel(e_t_size, context_size, window_length, nb_actions) #target_model = MQNmodel(e_t_size, context_size, window_length, nb_actions) #Distributional MQN model. nb_atoms = 51 v_min = -2. v_max = 2. model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions) target_model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions) #DQN model #model = DQNmodel(nb_actions, window_length, input_shape=env.maze.matrix.shape) #target_model = DQNmodel(nb_actions, window_length, input_shape=env.maze.matrix.shape) #Initialize our target model with the same weights as our model. target_model.set_weights(model.get_weights()) #Initialize memory buffer for DQN algorithm. experience = [ SequentialMemory(limit=int(buffer_size / len(envs)), window_length=window_length) for i in range(len(envs)) ] #Learning policy where we initially begin training our agent by making random moves #with a probability of 1, and linearly decrease that probability down to 0.1 over the #course of some arbitrary number of steps. (nb_steps) policy = LinearAnnealedPolicy(inner_policy=EpsGreedyQPolicy(), attr="eps", value_max=1.0, value_min=0.1, value_test=0., nb_steps=7e4) #Optional processor. #processor = TaxiProcessor() #Initialize and compile the DQN agent. ''' dqn = DQNAgent(model=model, target_model=target_model, nb_actions=nb_actions, memory=experience, nb_steps_warmup=nb_steps_warmup, target_model_update=target_model_update, policy=policy, batch_size=32) ''' #Initialize experimental Distributional DQN Agent dqn = DistributionalDQNAgent( model=model, target_model=target_model, num_atoms=nb_atoms, v_min=v_min, v_max=v_max, nb_actions=nb_actions, memory=experience, nb_steps_warmup=nb_steps_warmup, target_model_update=target_model_update, policy=policy, #processor=processor, batch_size=32) #Compile the agent to check for validity, build tensorflow graph, etc. dqn.compile(Adam(lr=learning_rate, clipnorm=clipnorm), metrics=["mae"]) #Extract model name from weights file. model_name = weights_file.split(".")[0] #Weights will be loaded if weight file exists. if os.path.exists("data/{}/{}".format(model_name, weights_file)): dqn.load_weights("data/{}/{}".format(model_name, weights_file)) #Train DQN in environment. if "train" in options: dqn.fit(env, nb_steps=nb_steps, verbose=0, callbacks=callbacks) #Visualization / Logging Tools logmetrics(log, model_name) logHyperparameters(model_name, e_t_size=e_t_size, context_size=context_size, learning_rate=learning_rate, target_model_update=target_model_update, clipnorm=clipnorm, window_length=window_length, nb_atoms=nb_atoms, v_min=v_min, v_max=v_max) #Save weights. dqn.save_weights("data/{}/{}".format(model_name, weights_file)) #Test DQN in environment. 
if "test" in options: dqn.test(env, nb_episodes=100, visualize=True, dump_output=True) #Debugging if "debug" in options: observation = env.reset() outputLayer(dqn.model, np.array(experience[0].sample(32)[0].state0)) #visualizeLayer(dqn.model, dqn.layers[1], observation) return
def fit(self, env, env_1, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
        file_interval=200, nb_max_episode_steps=None, save_data_path='temp.json',
        dynamic_actor_exploration=False, update_exploration_interval=5000):
    """Trains the agent on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
        visualize (boolean): If `True`, the environment is visualized during training. However,
            this is likely going to slow down training significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
            of each episode using `start_step_policy`. Notice that this is an upper limit since
            the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
            at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy
            to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
            automatically resetting the environment. Set to `None` if each episode should run
            (potentially indefinitely) until the environment signals a terminal state.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire training process.
    """
    if not self.compiled:
        raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. '
                           'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.episode_goal = None  # (resets every episode)
    self.training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    callbacks += [OjasFileLogger(save_data_path, interval=file_interval)]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    observation1 = None
    observation2 = None
    episode_reward1 = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation1 is None or observation2 is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward1 = 0.

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation1 = deepcopy(env.reset())
                observation2 = deepcopy(env_1.reset())
                if self.actor_processor is not None:
                    # observation1 = self.learner_processor.process_observation(observation1)
                    observation1 = self.actor_processor.process_observation(observation1)
                    observation2 = self.actor_processor.process_observation(observation2)
                assert observation1 is not None
                assert observation2 is not None

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action1 = env.action_space.sample()
                        action2 = env_1.action_space.sample()
                    else:
                        action1 = start_step_policy(observation1)
                        action2 = start_step_policy(observation2)
                    if self.actor_processor is not None:
                        # action1 = self.learner_processor.process_action(action1)
                        action1 = self.actor_processor.process_action(action1)
                        action2 = self.actor_processor.process_action(action2)
                    callbacks.on_action_begin(action1)
                    observation1, reward1, done1, info1 = env.step(action1)
                    observation2, reward2, done2, info2 = env_1.step(action2)
                    observation1 = deepcopy(observation1)
                    observation2 = deepcopy(observation2)
                    if self.actor_processor is not None:
                        # observation1, reward1, done1, info1 = self.learner_processor.process_step(observation1, reward1, done1, info1)
                        observation1, reward1, done1, info1 = self.actor_processor.process_step(
                            observation1, reward1, done1, info1)
                        observation2, reward2, done2, info2 = self.actor_processor.process_step(
                            observation2, reward2, done2, info2)
                    callbacks.on_action_end(action1)
                    if done1:
                        warnings.warn('Env ended before {} random steps could be performed at the start. '
                                      'You should probably lower the `nb_max_start_steps` parameter.'
                                      .format(nb_random_start_steps))
                        observation1 = deepcopy(env.reset())
                        # if self.learner_processor is not None:
                        if self.actor_processor is not None:
                            # observation1 = self.learner_processor.process_observation(observation1)
                            observation1 = self.actor_processor.process_observation(observation1)
                        break
                    if done2:
                        warnings.warn('Env ended before {} random steps could be performed at the start. '
                                      'You should probably lower the `nb_max_start_steps` parameter.'
                                      .format(nb_random_start_steps))
                        observation2 = deepcopy(env_1.reset())
                        if self.actor_processor is not None:
                            observation2 = self.actor_processor.process_observation(observation2)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward1 is not None
            assert episode_step is not None
            assert observation1 is not None
            assert observation2 is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)  # (Prints here if verbose = 1)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action1, action2 = self.forward(observation1, observation2)
            if self.actor_processor is not None:
                # action1 = self.learner_processor.process_action(action1)
                action1 = self.actor_processor.process_action(action1)
                action2 = self.actor_processor.process_action(action2)
            reward1 = 0.
            reward2 = 0.
            accumulated_info = {}
            done1 = False
            done2 = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action1)
                observation1, r1, done1, info1 = env.step(action1)
                observation1 = deepcopy(observation1)
                # if self.learner_processor is not None:
                #     observation1, r1, done1, info1 = self.learner_processor.process_step(observation1, r1, done1, info1)
                if self.actor_processor is not None:
                    observation1, r1, done1, info1 = self.actor_processor.process_step(
                        observation1, r1, done1, info1)
                for key, value in info1.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action1)
                reward1 += r1
                if done1:
                    break
            for _ in range(action_repetition):
                observation2, r2, done2, info2 = env_1.step(action2)
                observation2 = deepcopy(observation2)
                if self.actor_processor is not None:
                    observation2, r2, done2, info2 = self.actor_processor.process_step(
                        observation2, r2, done2, info2)
                reward2 += r2
                if done2:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state. (both agents take every step in parallel)
                done1 = True
                done2 = True
            self.backward_actor(reward1, observation1, info1, reward2, observation2, info2, env,
                                terminal1=done1, terminal2=done2)
            metrics = self.backward_learner()
            episode_reward1 += reward1

            step_logs = {
                'action': action1,
                'observation': observation1,
                'reward': reward1,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)  # stores the current step info

            if self.step % update_exploration_interval == 0 and dynamic_actor_exploration:
                self.update_actor_exploration()

            episode_step += 1
            self.step += 1

            if done1:
                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward1,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)
                # print("Episode: {}, Rewards: {}, Steps: {}".format(episode, episode_logs['episode_reward'], episode_logs['nb_episode_steps']))

                episode += 1  # CHECK!
                observation1 = None
                episode_step = None
                episode_reward1 = None
            if done2:
                observation2 = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=10000, nb_max_episode_steps=None):
    if not self.compiled:
        raise RuntimeError(
            'You tried to fit your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True
    self.nb_steps = nb_steps

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    callbacks._set_model(self)
    callbacks._set_env(env)
    callbacks._set_params({
        'nb_steps': nb_steps,
    })
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False

    t = Thread(target=self.backward, args=[0, False])
    t.start()

    try:
        # make sure forward and backward are in the same graph
        with self.sess.graph.as_default():
            # while self.step < nb_steps:
            while self.back_step < nb_steps:
                if self.backward_start_flag:
                    print("start")
                    continue
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = 0
                    episode_reward = 0.

                    # Obtain the initial observation by resetting the environment.
                    self.reset_states()
                    observation = deepcopy(env.reset())
                    if self.processor is not None:
                        observation = self.processor.process_observation(observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.processor is not None:
                            observation, reward, done, info = self.processor.process_step(
                                observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn('Env ended before {} random steps could be performed at the start. '
                                          'You should probably lower the `nb_max_start_steps` parameter.'
                                          .format(nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.processor is not None:
                                observation = self.processor.process_observation(observation)
                            print("observation shape: ", observation.shape)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                K.manual_variable_initialization(True)
                action = self.forward(observation)
                # print "forward step show: ", self.step
                # print "forward weights: ", self.sim_forward_actor.get_weights()[0]
                # time.sleep(0.01)
                K.manual_variable_initialization(False)
                reward = 0.
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, r, done, info = self.processor.process_step(
                            observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward += r
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    done = True

                # Store most recent experience in memory.
                if self.step % self.memory_interval == 0:
                    self.memory.append(self.recent_observation, self.recent_action, reward, done,
                                       training=self.training)

                episode_reward += reward
                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': self.metrics,
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.forward(observation)
                    # self.backward(0., terminal=False)
                    if self.step % self.memory_interval == 0:
                        self.memory.append(self.recent_observation, self.recent_action, 0, False,
                                           training=self.training)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.step,
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=10000, nb_max_episode_steps=None):
    if not (self.agent1.compiled and self.agent2.compiled):
        raise RuntimeError(
            'You tried to fit your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.agent1.training = True
    self.agent2.training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self.agent1._on_train_begin()
    self.agent2._on_train_begin()
    callbacks.on_train_begin()

    episode = np.int16(0)
    self.agent1.step = np.int16(0)
    self.agent2.step = np.int16(0)
    observation = None
    episode_reward1 = None
    episode_reward2 = None
    episode_step = None
    did_abort = False
    try:
        while self.agent1.step < nb_steps:  # not individual for now
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = np.int16(0)
                episode_reward1 = np.float32(0)
                episode_reward2 = np.float32(0)

                # Obtain the initial observation by resetting the environment.
                self.agent1.reset_states()
                self.agent2.reset_states()
                observation = deepcopy(env.reset())
                if self.agent1.processor is not None:  # not individual for now
                    observation = self.agent1.processor.process_observation(observation)
                assert observation is not None

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.agent1.processor is not None:  # not individual for now. action is not from agent anyway
                        action = self.agent1.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.agent1.processor is not None:
                        observation, reward, done, info = self.agent1.processor.process_step(
                            observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn(
                            'Env ended before {} random steps could be performed at the start. '
                            'You should probably lower the `nb_max_start_steps` parameter.'.format(
                                nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.agent1.processor is not None:
                            observation = self.agent1.processor.process_observation(observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward1 is not None
            assert episode_reward2 is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action1 = self.agent1.forward(observation)
            action2 = self.agent2.forward(observation)
            if self.agent1.processor is not None:
                action1 = self.agent1.processor.process_action(action1)
            if self.agent2.processor is not None:
                action2 = self.agent2.processor.process_action(action2)
            action = (np.ndarray.item(action1), np.ndarray.item(action2))
            reward1 = np.float32(0)
            reward2 = np.float32(0)
            reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                # Use only one of the actions? added actions?
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.agent1.processor is not None:
                    observation, r, done, info = self.agent1.processor.process_step(observation, r, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward1 += info["r1"]
                reward2 += info["r2"]
                reward += info["r1"] + info["r2"]
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics1 = self.agent1.backward(reward1, terminal=done)
            metrics2 = self.agent2.backward(reward2, terminal=done)
            episode_reward1 += reward1
            episode_reward2 += reward2

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics1,  # not individual for now
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.agent1.step += 1
            self.agent2.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.agent1.forward(observation)
                self.agent2.forward(observation)
                self.agent1.backward(0., terminal=False)
                self.agent2.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward1 + episode_reward2,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.agent1.step,  # not individual for now
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward1 = None
                episode_reward2 = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self.agent1._on_train_end()
    self.agent2._on_train_end()

    return history
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None): """Trains the agent on the given environment. # Arguments env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. nb_steps (integer): Number of training steps to be performed. action_repetition (integer): Number of times the agent repeats the same action without observing the environment again. Setting this to a value > 1 can be useful if a single action only has a very small effect on the environment. callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): List of callbacks to apply during training. See [callbacks](/callbacks) for details. verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging visualize (boolean): If `True`, the environment is visualized during training. However, this is likely going to slow down training significantly and is thus intended to be a debugging instrument. nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning of each episode using `start_step_policy`. Notice that this is an upper limit since the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] at the beginning of each episode. start_step_policy (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. nb_max_episode_steps (integer): Number of steps per episode that the agent performs before automatically resetting the environment. Set to `None` if each episode should run (potentially indefinitely) until the environment signals a terminal state. # Returns A `keras.callbacks.History` instance that recorded the entire training process. """ if not (self.agent1.compiled and self.agent2.compiled): raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.') if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition)) assert self.processor is None # Removed processors here for simplification. Not needed anyway assert nb_max_start_steps == 0 # Removed here for simplification. Not needed anyway assert action_repetition == 1 # Removed here for simplification. 
Not needed anyway self.agent1.training = True self.agent2.training = True experience_for_plotting = deque() callbacks = [] if not callbacks else callbacks[:] if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] history = History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) self.agent1._on_train_begin() self.agent2._on_train_begin() callbacks.on_train_begin() episode = np.int16(0) self.agent1.step = np.int16(0) self.agent2.step = np.int16(0) observation1 = observation2 = None episode_reward1 = None episode_reward2 = None episode_step = None did_abort = False try: while self.agent1.step < nb_steps: # not individual for now if observation1 is None or observation2 is None: # start of a new episode callbacks.on_episode_begin(episode) episode_step = np.int16(0) episode_reward1 = np.float32(0) episode_reward2 = np.float32(0) # Obtain the initial observation by resetting the environment. self.agent1.reset_states() self.agent2.reset_states() obs = env.reset() observation1 = deepcopy(obs) + (0.,) observation2 = deepcopy(obs) + (0.,) # At this point, we expect to be fully initialized. assert episode_reward1 is not None assert episode_reward2 is not None assert episode_step is not None assert observation1 is not None assert observation2 is not None # Run a single step. callbacks.on_step_begin(episode_step) # This is were all of the work happens. We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). action1 = np.ndarray.item(self.agent1.forward(observation1)) action2 = np.ndarray.item(self.agent2.forward(observation2)) action = (action1, action2) reward1 = np.float32(0) reward2 = np.float32(0) accumulated_info = {} done = False callbacks.on_action_begin(action) # Use only one of the actions? added actions? obs, r, done, info = env.step(action) if done: raise AttributeError # The episode was reset unexpectedly # (see https://stackoverflow.com/questions/42787924/) observation1 = deepcopy(obs) + (info["u2_clipped"],) # Add action other to the observation observation2 = deepcopy(obs) + (info["u1_clipped"],) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(action) reward1 += info["r1"] reward2 += info["r2"] if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: # Force a terminal state. done = True metrics1 = self.agent1.backward(reward1, terminal=done) metrics2 = self.agent2.backward(reward2, terminal=done) episode_reward1 += reward1 episode_reward2 += reward2 step_logs = { 'action': action[0] + action[1], 'observation': observation1, 'reward': reward1 + reward2, 'metrics': metrics1, # not individual for now 'episode': episode, 'info': accumulated_info, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 self.agent1.step += 1 self.agent2.step += 1 if len(obs) == 2: experience_for_plotting.append((info["t"], obs, (info["u1_clipped"], info["u2_clipped"]), (0., 0.), r, (info["r1"], info["r2"]))) if done: # We are in a terminal state but the agent hasn't yet seen it. 
We therefore # perform one more forward-backward call and simply ignore the action before # resetting the environment. We need to pass in `terminal=False` here since # the *next* state, that is the state of the newly reset environment, is # always non-terminal by convention. self.agent1.forward(observation1) self.agent2.forward(observation2) self.agent1.backward(0., terminal=False) self.agent2.backward(0., terminal=False) # This episode is finished, report and reset. episode_logs = { 'episode_reward': episode_reward1 + episode_reward2, 'nb_episode_steps': episode_step, 'nb_steps': self.agent1.step, # not individual for now } callbacks.on_episode_end(episode, episode_logs) episode += 1 observation1 = None observation2 = None episode_step = None episode_reward1 = None episode_reward2 = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) self.agent1._on_train_end() self.agent2._on_train_end() return experience_for_plotting
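# A small, illustrative sketch of how the `experience_for_plotting` deque returned above
# could be unpacked: each entry is appended as (t, obs, clipped actions, a placeholder
# pair, reward, per-agent rewards), so index 0 is the time stamp and index 5 holds (r1, r2).
import matplotlib.pyplot as plt

def plot_experience(experience_for_plotting):
    ts = [entry[0] for entry in experience_for_plotting]
    r1 = [entry[5][0] for entry in experience_for_plotting]
    r2 = [entry[5][1] for entry in experience_for_plotting]
    plt.plot(ts, r1, label='r1')
    plt.plot(ts, r2, label='r2')
    plt.xlabel('t')
    plt.ylabel('reward')
    plt.legend(loc='best')
    plt.show()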
def train(self, nb_steps=30000, verbose=1, visualize=False, log_interval=3000):
    if self.__istrained:
        raise RuntimeError('This model has already been trained.')
    print('Training is starting, please wait.')
    # Run the training
    # Okay, now it's time to learn something!
    # We visualize the training here for show, but this slows down training quite a lot.
    # You can always safely abort the training prematurely using Ctrl + C.
    callbacks = []
    if verbose == 1:
        self.train_interval_logger = TrainIntervalLogger2(interval=log_interval)
        callbacks.append(self.train_interval_logger)
        verbose = 0
    elif verbose > 1:
        callbacks.append(TrainEpisodeLogger())
        verbose = 0
    hist = self.dqn.fit(self.env, nb_steps=nb_steps, callbacks=callbacks,
                        verbose=verbose, visualize=visualize, log_interval=log_interval)
    self.__istrained = True
    if self.train_interval_logger is not None:
        # Visualize the training progress
        interval = self.train_interval_logger.records['interval']
        episode_reward = self.train_interval_logger.records['episode_reward']
        mean_q = self.train_interval_logger.records['mean_q']
        if len(interval) > len(mean_q):
            mean_q = np.pad(mean_q, [len(interval) - len(mean_q), 0], "constant")
        plt.figure()
        plt.plot(interval, episode_reward, marker='.', label='reward')
        plt.plot(interval, mean_q, marker='.', label='Q value')
        plt.legend(loc='best', fontsize=10)
        plt.grid()
        plt.xlabel('interval')
        plt.ylabel('score')
        plt.title('training progress')
        plt.xticks(np.arange(min(interval), max(interval) + 1,
                             (max(interval) - min(interval)) // 7))
        plt.show()
    # Save the weights
    if not exists(self.__class__.weightdir):
        try:
            mkdir(self.__class__.weightdir)
        except:
            print('An error occurred while creating the weight directory.')
            print('Unexpected error:', exc_info()[0])
            raise
    try:
        # After training is done, we save the final weights.
        self.dqn.save_weights(self.weightfile, overwrite=True)
    except:
        print('An error occurred while saving the weights.')
        print('Unexpected error:', exc_info()[0])
        raise
    return hist
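# An alternative, hedged sketch: the `hist` object returned by `self.dqn.fit` above is a
# `keras.callbacks.History`, and in keras-rl it typically collects the per-episode logs,
# so the reward curve can also be drawn from it directly without the interval logger.
# The key names are the ones the fit loops in this file emit.
import matplotlib.pyplot as plt

def plot_history(hist):
    rewards = hist.history.get('episode_reward', [])
    plt.plot(range(len(rewards)), rewards, marker='.', label='episode reward')
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.legend(loc='best')
    plt.grid()
    plt.show()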
def fit_hrl(self, env, nb_steps, random_start_step_policy, callbacks=None, verbose=1, visualize=False, pre_warm_steps=0, log_interval=100, save_interval=1, nb_max_episode_steps=None): if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been' ' compiled yet. Please call `compile()` before `fit()`.') self.training = True self.turn_left_agent.training = True self.go_straight_agent.training = True self.turn_right_agent.training = True callbacks = [] if not callbacks else callbacks[:] if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] parent_dir = os.path.dirname(os.path.dirname(__file__)) callbacks += [FileLogger(filepath=parent_dir + os.sep + 'log.json')] callbacks += [ ModelIntervalCheckpoint(filepath=parent_dir + '/checkpoints/model_step{step}.h5f', interval=save_interval, verbose=1) ] history = History() callbacks += [history] callbacks = CallbackList(callbacks) callbacks.set_model(self) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } callbacks.set_params(params) self._on_train_begin() callbacks.on_train_begin() episode = np.int16(0) self.step = np.int16(0) self.turn_left_agent.step = np.int16(0) self.go_straight_agent.step = np.int16(0) self.turn_right_agent.step = np.int16(0) observation = env.encoded_obs episode_reward = None episode_step = None did_abort = False # warm steps print('pre warming up:') for _ in range(pre_warm_steps): normed_action = random_start_step_policy() recent_action = normed_action recent_observation = observation # put in normed action and unprocessed observation action = self.processor.process_action( recent_action) # [0/1/2, goal_delta_x, acc] callbacks.on_action_begin(action) observation, reward, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) callbacks.on_action_end(action) self.memory.append(recent_observation, recent_action[0], reward, done, training=self.training) if recent_action[0] == 0: left_obs = np.column_stack( (recent_observation[:, :30], recent_observation[:, -8:], np.tile( np.array([1, 0, 0]), (recent_observation.shape[0], 1)))) # 30 + 8 + 3 = 41 lower_action = recent_action[1:] self.turn_left_agent.memory.append(left_obs, lower_action, reward, 1, training=self.training) elif recent_action[0] == 1: straight_obs = np.column_stack( (deepcopy(recent_observation), np.tile(np.array([0, 1, 0]), (recent_observation.shape[0], 1)))) # 56 + 3 = 59 lower_action = recent_action[1:] self.go_straight_agent.memory.append(straight_obs, lower_action, reward, 1, training=self.training) else: right_obs = np.column_stack( (recent_observation[:, 18:], np.tile( np.array([0, 0, 1]), (recent_observation.shape[0], 1)))) # 56- 18 + 3 = 41 lower_action = recent_action[1:] self.turn_right_agent.memory.append(right_obs, lower_action, reward, 1, training=self.training) print('————————————————————————————————————————') print({ 'upper_memory_len: ': self.memory.nb_entries, 'left_memory_len: ': self.turn_left_agent.memory.nb_entries, 'straight_memory_len: ': self.go_straight_agent.memory.nb_entries, 'right_memory_len: ': self.turn_right_agent.memory.nb_entries }) print('————————————————————————————————————————') # TODO: always has a point is not done, but there would be only one bad point in the buffer if done: def random_init_state(flag=True): init_state = [-800, -150 - 3.75 
* 5 / 2, 5, 0] if flag: x = np.random.random() * 1000 - 800 lane = np.random.choice([0, 1, 2, 3]) y_fn = lambda lane: \ [-150 - 3.75 * 7 / 2, -150 - 3.75 * 5 / 2, -150 - 3.75 * 3 / 2, -150 - 3.75 * 1 / 2][lane] y = y_fn(lane) v = np.random.random() * 25 heading = 0 init_state = [x, y, v, heading] return init_state observation = deepcopy( env.reset(init_state=random_init_state(flag=True))) if self.processor is not None: observation = self.processor.process_observation( observation) observation = None try: while self.step < nb_steps: if observation is None: # start of a new episode callbacks.on_episode_begin(episode) episode_step = np.int16(0) episode_reward = np.float32(0) # Obtain the initial observation by resetting the environment. self.reset_states() def random_init_state(flag=True): init_state = [-800, -150 - 3.75 * 5 / 2, 5, 0] if flag: x = np.random.uniform(0, 1) * 1000 - 800 lane = np.random.choice([0, 1, 2, 3]) y_fn = lambda lane: [ -150 - 3.75 * 7 / 2, -150 - 3.75 * 5 / 2, -150 - 3.75 * 3 / 2, -150 - 3.75 * 1 / 2 ][lane] y = y_fn(lane) v = np.random.uniform(0, 1) * 25 heading = 0 init_state = [x, y, v, heading] return init_state observation = deepcopy( env.reset(init_state=random_init_state())) if self.processor is not None: observation = self.processor.process_observation( observation) assert observation is not None # At this point, we expect to be fully initialized. assert episode_reward is not None assert episode_step is not None assert observation is not None # Run a single step. callbacks.on_step_begin(episode_step) # This is were all of the work happens. We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). action = self.forward(observation) # this is normed action action = self.processor.process_action( action) # this is processed action for env done = False callbacks.on_action_begin(action) observation, reward, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) callbacks.on_action_end(action) if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: # Force a terminal state. done = True metrics = self.backward(reward, terminal=done) episode_reward += reward step_logs = { 'action': action, # processed action 'observation': observation, # true obs 'reward': reward, 'metrics': metrics, 'episode': episode # 'info': info, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 self.step += 1 self.turn_left_agent.step += 1 self.go_straight_agent.step += 1 self.turn_right_agent.step += 1 memory_len = [ self.turn_left_agent.memory.nb_entries, self.go_straight_agent.memory.nb_entries, self.turn_right_agent.memory.nb_entries ] if done: episode_logs = { 'episode_reward': episode_reward, 'nb_episode_steps': episode_step, 'nb_steps': self.step, 'memory_len': memory_len } callbacks.on_episode_end(episode, episode_logs) episode += 1 observation = None episode_step = None episode_reward = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) self._on_train_end() return history
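# A hedged helper mirroring the per-option observation slicing used when the pre-warm
# loop above fills the three sub-agent memories (30 + 8 + 3 = 41 features for left/right,
# 56 + 3 = 59 for straight); the function name is illustrative, the slices and one-hot
# tags are taken from the code above.
import numpy as np

def slice_observation_for_option(obs, option):
    """obs: array of shape (n, 56); option: 0=left, 1=straight, 2=right."""
    n = obs.shape[0]
    if option == 0:
        return np.column_stack((obs[:, :30], obs[:, -8:],
                                np.tile(np.array([1, 0, 0]), (n, 1))))
    if option == 1:
        return np.column_stack((obs, np.tile(np.array([0, 1, 0]), (n, 1))))
    return np.column_stack((obs[:, 18:],
                            np.tile(np.array([0, 0, 1]), (n, 1))))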
def fit(self, agt1, agt2, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, save_interval=5000, nb_max_episode_steps=None): agt1.training = True agt2.training = True callbacks = [] if not callbacks else callbacks[:] if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] history = keras.callbacks.History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(agt1) callbacks.set_model(agt2) else: callbacks._set_model(agt1) callbacks._set_model(agt2) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) agt1._on_train_begin() agt2._on_train_begin() callbacks.on_train_begin() np = numpy episode = np.int16(0) agt1.step = np.int16(0) agt2.step = np.int16(0) observation = None episode_reward1 = None episode_reward2 = None episode_step = None did_abort = False try: while agt1.step < nb_steps: if observation is None: # start of a new episode callbacks.on_episode_begin(episode) episode_step = np.int16(0) episode_reward1 = np.float32(0) episode_reward2 = np.float32(0) agt1.reset_states() agt2.reset_states() observation = copy.deepcopy(env.reset()) assert observation is not None if nb_max_start_steps == 0: nb_random_start_steps = 0 else: nms = nb_max_start_steps nb_random_start_steps = np.random.randint(nms) for _ in range(nb_random_start_steps): if start_step_policy is None: action1 = env.action_space.sample() action2 = env.action_space.sample() else: action1 = start_step_policy(observation) action2 = start_step_policy(observation) callbacks.on_action_begin(action1) observation, reward, done, info = env.step( action1, action2) observation = copy.deepcopy(observation) callbacks.on_action_end(action1) if done: observation = copy.deepcopy(env.reset()) break # At this point, we expect to be fully initialized. assert episode_reward1 is not None assert episode_reward2 is not None assert episode_step is not None assert observation is not None callbacks.on_step_begin(episode_step) action1 = agt1.forward(observation) action2 = agt2.forward(observation) reward = np.float32(0) accumulated_info = {} done = False for _ in range(action_repetition): callbacks.on_action_begin(action1) observation, r, done, info = env.step(action1, action2) observation = copy.deepcopy(observation) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(action1) reward += r if done: break if nb_max_episode_steps: if episode_step >= nb_max_episode_steps - 1: # Force a terminal state. 
done = True metrics1 = agt1.backward(reward, terminal=done) metrics2 = agt2.backward(-reward, terminal=done) episode_reward1 += reward episode_reward2 -= reward step_logs = { 'action1': action1, 'action2': action2, 'observation': observation, 'reward': reward, 'metrics': [metrics1[i] + metrics2[i] for i in range(len(metrics1))], 'episode': episode, 'info': accumulated_info, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 agt1.step += 1 agt2.step += 1 if agt1.step % save_interval == 0: self.smod1.save_weights(MODEL_SAVE_PATH % 1) self.smod2.save_weights(MODEL_SAVE_PATH % 2) if done: agt1.forward(observation) agt2.forward(observation) agt1.backward(0., terminal=False) agt2.backward(0., terminal=False) # This episode is finished, report and reset. episode_logs = { 'episode_reward': episode_reward1, 'episode_reward1': episode_reward1, 'episode_reward2': episode_reward2, 'nb_episode_steps': episode_step, 'nb_steps': agt1.step, } callbacks.on_episode_end(episode, episode_logs) episode += 1 observation = None episode_step = None episode_reward1 = None episode_reward2 = None except KeyboardInterrupt: did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) agt1._on_train_end() agt2._on_train_end() return history
env = halite_env.Env()
env.configure(socket_path=f"/dev/shm/{time.time_ns()}", replay=False, bot_name=bot_name)
env = MetricsEnv(env)
env = CommandEnv(env)
nb_actions = env.action_space.n
model = LordTateKanti.make_model(env)
model.summary()

# parameters
nb_steps = 20_000
nb_steps_warmup = int(nb_steps * 0.01)
memory = SequentialMemory(limit=10_000, window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.0,
                              value_min=0.01, value_test=0.05,
                              nb_steps=int(nb_steps * 0.66))
agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                 nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                 policy=policy, gamma=0.7)
agent.compile(Adam(lr=1e-3), metrics=['mae'])

callbacks = [
    ModelIntervalCheckpoint('dqn_PlanetCaptureBot_weights_{step}.h5f', interval=100000),
    TrainEpisodeLogger(),
    TensorBoard()
]
agent.fit(env, nb_steps=nb_steps, visualize=False, verbose=1, callbacks=callbacks)
agent.save_weights('dqn_PlanetCaptureBot_weights_final.h5f', overwrite=True)
# agent.test(env, nb_episodes=1, visualize=False)
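# A hedged sketch of the epsilon schedule that LinearAnnealedPolicy applies above during
# training: eps decays linearly from value_max to value_min over the first ~66% of the
# steps and then stays at value_min (value_test is used only at test time). Purely
# illustrative of the schedule, not the library code itself.
def linear_eps(step, value_max=1.0, value_min=0.01, nb_anneal_steps=int(20_000 * 0.66)):
    slope = (value_min - value_max) / float(nb_anneal_steps)
    return max(value_min, value_max + slope * step)

# e.g. linear_eps(0) == 1.0 and linear_eps(13_200) == 0.01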
def fit(self, env, nb_steps, action_repetition=1, callbacks=[], verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None): if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.' ) if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format( action_repetition)) self.training = True if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] callbacks = CallbackList(callbacks) callbacks._set_model(self) callbacks._set_env(env) callbacks._set_params({ 'nb_steps': nb_steps, }) callbacks.on_train_begin() episode = 0 self.step = 0 observation = None episode_reward = None episode_step = None did_abort = False try: while self.step < nb_steps: if observation is None: # start of a new episode callbacks.on_episode_begin(episode) episode_step = 0 episode_reward = 0. # Obtain the initial observation by resetting the environment. self.reset_states() observation = env.reset() assert observation is not None # Perform random starts at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint( nb_max_start_steps) for _ in range(nb_random_start_steps): if start_step_policy is None: action = env.action_space.sample() else: action = start_step_policy(observation) callbacks.on_action_begin(action) observation, _, done, _ = env.step(action) callbacks.on_action_end(action) if done: warnings.warn( 'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.' .format(nb_random_start_steps)) observation = env.reset() break # At this point, we expect to be fully initialized. assert episode_reward is not None assert episode_step is not None assert observation is not None # Run a single step. callbacks.on_step_begin(episode_step) # This is were all of the work happens. We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). action = self.forward(observation) reward = 0. done = False for _ in range(action_repetition): callbacks.on_action_begin(action) observation, r, done, _ = env.step(action) callbacks.on_action_end(action) reward += r if done: break metrics = self.backward(reward, terminal=done) episode_reward += reward step_logs = { 'action': action, 'observation': observation, 'reward': reward, 'metrics': metrics, 'episode': episode, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 self.step += 1 if done or (nb_max_episode_steps and episode_step > nb_max_episode_steps): # This episode is finished, report and reset. episode_logs = { 'episode_reward': episode_reward, 'nb_episode_steps': episode_step, 'nb_steps': self.step, } callbacks.on_episode_end(episode, episode_logs) episode += 1 observation = None episode_step = None episode_reward = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort})
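# A minimal sketch (illustrative only) of the action_repetition semantics used by the
# loops in this file: the chosen action is applied up to `action_repetition` times,
# rewards are summed, and repetition stops early once the environment terminates.
def repeat_action(env, action, action_repetition=1):
    total_reward, done = 0., False
    for _ in range(action_repetition):
        observation, r, done, _ = env.step(action)
        total_reward += r
        if done:
            break
    return observation, total_reward, done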
def _run(self, env, nb_steps=None, nb_episodes=None, train=True, exploration=True, action_repetition=1, callbacks=None, verbose=1, render=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None, reward_scaling=1., plots=False, tensorboard=False, **kwargs): """ Run steps until termination. This method shouldn't be called directly, but instead called in :func:`fit` and :func:`test` Termination can be either: * Maximal number of steps * Maximal number of episodes :param nb_steps: Number of steps before termination. :param nb_episodes: Number of episodes before termination. :param bool training: Whether to train or test the agent. Not available for the :func:`fit` and :func:`test` methods. :param int action_repetition: Number of times the action is repeated for each step. :param callbacks: :param int verbose: 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging :param bool visualize: Render the environment in realtime. This slows down by a big factor (up to 100) the function. :param nb_max_start_steps: :param start_step_policy: (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. :param log_interval: :param reward_scaling: :param plots: Plot metrics during training. :param tensorboard: Export metrics to tensorboard. """ if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `train()`.' ) if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format( action_repetition)) # Process the different cases when either nb_steps or nb_episodes are specified if (nb_steps is None and nb_episodes is None): raise (ValueError( "Please specify one (and only one) of nb_steps or nb_episodes") ) elif (nb_steps is not None and nb_episodes is None): termination_criterion = STEPS_TERMINATION elif (nb_steps is None and nb_episodes is not None): termination_criterion = EPISODES_TERMINATION elif (nb_steps is not None and nb_episodes is not None): raise (ValueError( "Please specify one (and only one) of nb_steps or nb_episodes") ) self.training = train # We explore only if the flag is selected and we are in train mode self.exploration = (train and exploration) # Initialize callbacks if callbacks is None: callbacks = [] if self.training: if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] else: if verbose >= 1: callbacks += [TestLogger()] callbacks = [] if not callbacks else callbacks[:] if render: callbacks += [Visualizer()] history = History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) if termination_criterion == STEPS_TERMINATION: params = { 'nb_steps': nb_steps, } elif termination_criterion == EPISODES_TERMINATION: params = { 'nb_episodes': nb_episodes, 'nb_steps': 1, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) # Add run hooks if tensorboard: from rl.hooks.tensorboard import TensorboardHook self.hooks.append(TensorboardHook(agent_id=self.id)) if plots: from rl.hooks.plot import PortraitHook, TrajectoryHook self.hooks.append(PortraitHook(agent_id=self.id)) self.hooks.append(TrajectoryHook(agent_id=self.id)) # Define the termination criterion # Step and episode at which we satrt the 
function start_step = self.step start_episode = self.episode if termination_criterion == STEPS_TERMINATION: def termination(): return (self.step - start_step >= nb_steps) elif termination_criterion == EPISODES_TERMINATION: def termination(): return ((self.episode - start_episode >= nb_episodes and self.done)) if self.training: self._on_train_begin() else: self._on_test_begin() callbacks.on_train_begin() # Setup self.run_number += 1 self.run_done = False self.done = True did_abort = False # Define these for clarification, not mandatory: # Where observation: Observation before the step # observation_1: Observation after the step self.observation = None self.observation_1 = None self.action = None self.step_summaries = None # Run_init hooks self.hooks.run_init() # Run steps (and episodes) until the termination criterion is met while not (self.run_done): # Init episode # If we are at the beginning of a new episode, execute a startup sequence if self.done: self.episode += 1 if self.training: self.training_episode += 1 self.episode_reward = 0. self.episode_step = 0 callbacks.on_episode_begin(self.episode) # Obtain the initial observation by resetting the environment. self.reset_states() observation_0 = deepcopy(env.reset()) assert observation_0 is not None # Perform random steps at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. if nb_max_start_steps != 0: observation_0 = self._perform_random_steps( nb_max_start_steps, start_step_policy, env, observation_0, callbacks) else: # We are in the middle of an episode # Update the observation observation_0 = self.observation_1 # Increment the episode step # FIXME: Use only one of the two variables self.observation = observation_0 # Increment the current step in both cases self.step += 1 if self.training: self.training_step += 1 self.episode_step += 1 self.reward = 0. self.step_summaries = [] accumulated_info = {} # Run a single step. callbacks.on_step_begin(self.episode_step) # This is were all of the work happens. We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). # state_0 -- (foward) --> action self.action = self.forward(self.observation) # action -- (step) --> (reward, state_1, terminal) # Apply the action # With repetition, if necesarry for _ in range(action_repetition): callbacks.on_action_begin(self.action) self.observation_1, r, self.done, info = env.step(self.action) # observation_1 = deepcopy(observation_1) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(self.action) self.reward += r # Set episode as finished if the environment has terminated if self.done: break # Scale the reward self.reward = self.reward * reward_scaling self.episode_reward += self.reward # End of the step # Stop episode if reached the step limit if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps: # Force a terminal state. 
self.done = True # Post step: training, callbacks and hooks # Train the algorithm self.backward() # step_end Hooks self.hooks() # Callbacks # Collect statistics step_logs = { 'action': self.action, 'observation': self.observation_1, 'reward': self.reward, # For legacy callbacks support 'metrics': [], 'episode': self.episode, 'info': accumulated_info, } callbacks.on_step_end(self.episode_step, step_logs) # Episodic callbacks if self.done: # Collect statistics episode_logs = { 'episode_reward': np.float_(self.episode_reward), 'nb_episode_steps': np.float_(self.episode_step), 'nb_steps': np.float_(self.step), } callbacks.on_episode_end(self.episode, logs=episode_logs) self.hooks.episode_end() # Stop run if termination criterion met if termination(): self.run_done = True callbacks.on_train_end(logs={'did_abort': did_abort}) self._on_train_end() self.hooks.run_end() return (history)
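# A hedged sketch of how thin `fit`/`test` wrappers could delegate to the `_run` method
# above, as its docstring suggests; `AgentRunMixin` is a hypothetical host class used
# only for illustration, and only the delegation pattern is implied by the source.
class AgentRunMixin(object):
    def fit(self, env, nb_steps=None, **kwargs):
        # Training run: terminates on nb_steps (or nb_episodes via kwargs).
        return self._run(env, nb_steps=nb_steps, train=True, **kwargs)

    def test(self, env, nb_episodes=None, **kwargs):
        # Evaluation run: no training updates and no exploration.
        return self._run(env, nb_episodes=nb_episodes, train=False,
                         exploration=False, **kwargs)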
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None, version=None, custom_env=False): """Trains the agent on the given environment. # Arguments env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. nb_steps (integer): Number of training steps to be performed. action_repetition (integer): Number of times the agent repeats the same action without observing the environment again. Setting this to a value > 1 can be useful if a single action only has a very small effect on the environment. callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): List of callbacks to apply during training. See [callbacks](/callbacks) for details. verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging visualize (boolean): If `True`, the environment is visualized during training. However, this is likely going to slow down training significantly and is thus intended to be a debugging instrument. nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning of each episode using `start_step_policy`. Notice that this is an upper limit since the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] at the beginning of each episode. start_step_policy (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. nb_max_episode_steps (integer): Number of steps per episode that the agent performs before automatically resetting the environment. Set to `None` if each episode should run (potentially indefinitely) until the environment signals a terminal state. # Returns A `keras.callbacks.History` instance that recorded the entire training process. """ if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.' 
) if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format( action_repetition)) self.training = True #self.stop_training = False # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8 callbacks = [] if not callbacks else callbacks[:] if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] history = History() # get the history class callbacks += [history] # Assign history to callback callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) self._on_train_begin() callbacks.on_train_begin() episode = np.int16(0) self.step = np.int16(0) observation = None episode_reward = None episode_step = None #self.episode_step = None # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8 did_abort = False # open workbook to store result workbook = xlwt.Workbook() sheet = workbook.add_sheet('DQN') sheet_step = workbook.add_sheet('step') try: while self.step < nb_steps: if observation is None: # start of a new episode callbacks.on_episode_begin(episode) episode_step = np.int16(0) #self.episode_step = np.int16(0) # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8 episode_reward = np.float32(0) # Obtain the initial observation by resetting the environment. self.reset_states() observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) assert observation is not None # Perform random starts at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint( nb_max_start_steps) for _ in range(nb_random_start_steps): if start_step_policy is None: action = env.action_space.sample() else: action = start_step_policy(observation) if self.processor is not None: action = self.processor.process_action(action) callbacks.on_action_begin(action) observation, reward, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) callbacks.on_action_end(action) if done: warnings.warn( 'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.' .format(nb_random_start_steps)) observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) break # At this point, we expect to be fully initialized. assert episode_reward is not None assert episode_step is not None #assert self.episode_step is not None # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8 assert observation is not None # Run a single step. callbacks.on_step_begin(episode_step) #callbacks.on_step_begin(callbacks.on_step_begin(self.episode_step)) # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8 # This is were all of the work happens. 
We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). action = self.forward(observation) if self.processor is not None: action = self.processor.process_action(action) reward = np.float32(0) accumulated_info = {} done = False for _ in range(action_repetition): callbacks.on_action_begin(action) observation, r, done, info = env.step(action) # print(observation, r, done, info) observation = deepcopy(observation) if self.processor is not None: observation, r, done, info = self.processor.process_step( observation, r, done, info) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(action) reward += r if done: break if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: #if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps - 1: # Force a terminal state. done = True if (custom_env): metrics = self.backward(reward[0], terminal=done) # tran's version else: metrics = self.backward( reward, terminal=done) # for testing with dqn_cartpole episode_reward += reward if (custom_env): step_logs = { 'action': action, 'observation': observation, 'reward': reward[0], # tran's version 'metrics': metrics, 'episode': episode, 'info': accumulated_info, 'throughput': reward[1], } else: step_logs = { 'action': action, 'observation': observation, 'reward': reward, # for testing with dqn_cartpole 'metrics': metrics, 'episode': episode, 'info': accumulated_info, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 #callbacks.on_step_end(self.episode_step, step_logs) # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8 #self.episode_step += 1 # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8 self.step += 1 if done: # We are in a terminal state but the agent hasn't yet seen it. We therefore # perform one more forward-backward call and simply ignore the action before # resetting the environment. We need to pass in `terminal=False` here since # the *next* state, that is the state of the newly reset environment, is # always non-terminal by convention. self.forward(observation) self.backward(0., terminal=False) # This episode is finished, report and reset. 
if (custom_env): episode_logs = { 'episode_reward': episode_reward[0], # Only return the first value 'throughput': episode_reward[1], #'nb_episode_steps': episode_step, #'nb_steps': self.step, #'loss': history['loss'], } else: episode_logs = { 'episode_reward': episode_reward, # seems to return an array 'nb_episode_steps': episode_step, 'nb_steps': self.step, } print("Episode Number: ", episode) print("Episode Rewards: ", episode_reward) #print("Episode Logs", episode_logs) #print("Episode metrics", metrics) print(history.history.keys()) # print("History Loss", hist.history['loss']) # print("History Loss", hist.history['acc']) # print("History Loss", hist.history['val_loss']) # print("History Loss", hist.history['val_acc']) callbacks.on_episode_end(episode, episode_logs) #print("Episode Reward size is: ", len(episode_reward)) #print("Reward array size is: ", episode_reward) sheet.write(episode + 1, 0, str(episode)) sheet.write(episode + 1, 1, str(episode_reward[0])) sheet.write(episode + 1, 2, str(episode_reward[1])) #sheet.write(episode + 1, 3, str(episode_reward[2])) # for 2 #sheet.write(episode + 1, 4, str(episode_reward[3])) # for 3 #sheet.write(episode + 1, 5, str(episode_reward[4])) # for 4 episode += 1 observation = None #episode_step = None self.episode_step = None # https://github.com/keras-rl/keras-rl/pull/170/commits/73c073d773ade1ccf70bdcd8f96237473b206ed8 episode_reward = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) self._on_train_end() file_name = 'result_v' + version + '.xls' # if (self.enable_double_dqn): # file_name = 'DDQN_' + file_name # if (self.enable_dueling_network): # file_name = 'Dueling_' + file_name workbook.save('../results/' + file_name) return history
gamma=.99, target_model_update=10000, train_interval=1, delta_clip=1., pretraining_steps=15000, n_step=10, large_margin=.8, lam_2=1) lr = .00025 dqfd.compile(Adam(lr), metrics=['mae']) weights_filename = model_saves + filename_append + "_" + datestr + "_" + 'student_' + environment_name + '15k_weights.h5f' checkpoint_weights_filename = model_saves + filename_append + "_" + datestr + "_" + 'student_' + environment_name + '15k_weights{step}.h5f' log_filename = model_saves + filename_append + "_" + datestr + "_" + 'student_' + environment_name + '15k_REWARD_DATA.txt' callbacks = [ TrainEpisodeLogger(log_filename), ModelIntervalCheckpoint(checkpoint_weights_filename, interval=1000000) ] if args.mode == 'train': dqfd.fit(env, callbacks=callbacks, nb_steps=4250000, verbose=0, nb_max_episode_steps=1500) dqfd.save_weights(weights_filename, overwrite=True) if args.mode == 'test': dqfd.load_weights(model_saves + 'student_' + environment_name + '15k_weights.h5f') dqfd.test(env, nb_episodes=12,
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None, starting_checkpoints=[], avarage_q=None): """Trains the agent on the given environment. # Arguments env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. nb_steps (integer): Number of training steps to be performed. action_repetition (integer): Number of times the agent repeats the same action without observing the environment again. Setting this to a value > 1 can be useful if a single action only has a very small effect on the environment. callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): List of callbacks to apply during training. See [callbacks](/callbacks) for details. verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging visualize (boolean): If `True`, the environment is visualized during training. However, this is likely going to slow down training significantly and is thus intended to be a debugging instrument. nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning of each episode using `start_step_policy`. Notice that this is an upper limit since the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] at the beginning of each episode. start_step_policy (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. nb_max_episode_steps (integer): Number of steps per episode that the agent performs before automatically resetting the environment. Set to `None` if each episode should run (potentially indefinitely) until the environment signals a terminal state. starting_checkpoints ([string]): starting checkpoints file names. When the enviroment is reset one checkpoint from the list will be drawn at random and enviroment will start from that exact checkpoint. You can create the checkpoints using interactive_env.py. nb_max_episode_steps (dictionary): provide the options in order to messure avarage Q after the end of each episode. The metric will be added to the log as described at Playing Atari with Deep Reinforcement Learning. The start of the training may be delay as it takes some time to choose the evaluationg states. You can either provide the two following options or a True boolean for using the defaults: n_evaluations (integer): number of checkpoints to be evaluated and avaraged (default: 10). bernoulli (float): bernoulli parameter. If succeed, the step will be chosen as a checkpoint. The smaller this number the longer will take to select the checkpoints (default: 0.1). # Returns A `keras.callbacks.History` instance that recorded the entire training process. """ if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.' 
) if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format( action_repetition)) self.training = True callbacks = [] if not callbacks else callbacks[:] if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] history = History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) self._on_train_begin() callbacks.on_train_begin() episode = 0 self.step = 0 observation = None episode_reward = None episode_step = None did_abort = False episode_beginning = True try: self.collect_avarage_q_checkpoints(env, avarage_q, starting_checkpoints) while self.step < nb_steps: if observation is None: # start of a new episode episode_beginning = True callbacks.on_episode_begin(episode) episode_step = 0 episode_reward = 0. # Obtain the initial observation by resetting the environment. self.reset_states() if starting_checkpoints: checkpoint = np.random.choice(starting_checkpoints) observation = deepcopy( env.reset(checkpoint='checkpoints/{}'.format( checkpoint))) else: observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) assert observation is not None # Perform random starts at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint( nb_max_start_steps) for _ in xrange(nb_random_start_steps): if start_step_policy is None: action = env.action_space.sample() else: action = start_step_policy(observation) if self.processor is not None: action = self.processor.process_action(action) callbacks.on_action_begin(action) observation, reward, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) callbacks.on_action_end(action) if done: warnings.warn( 'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.' .format(nb_random_start_steps)) observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) break # At this point, we expect to be fully initialized. assert episode_reward is not None assert episode_step is not None assert observation is not None # Run a single step. callbacks.on_step_begin(episode_step) # This is were all of the work happens. We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). action = self.forward(observation) if self.processor is not None: action = self.processor.process_action(action) reward = 0. accumulated_info = {} done = False # NOTA-EZE: Esto agrega complejidad al pe*o. 
We implement the frameskip in the emulator for _ in xrange(action_repetition): callbacks.on_action_begin(action) observation, r, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, r, done, info = self.processor.process_step( observation, r, done, info) # for key, value in info.items(): # if not np.isreal(value): # continue # if key not in accumulated_info: # accumulated_info[key] = np.zeros_like(value) # accumulated_info[key] += value callbacks.on_action_end(action) reward += r if done: break if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: # Force a terminal state. done = True # if self.memory.__class__.__name__ == 'PrioritizedMemory': # self.memory.append_with_error(observation, action, reward, done, episode_beginning) metrics = self.backward(reward, terminal=done) episode_reward += reward step_logs = { 'action': action, 'observation': observation, 'reward': reward, 'metrics': metrics, 'episode': episode, 'info': accumulated_info, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 self.step += 1 if done: # We are in a terminal state but the agent hasn't yet seen it. We therefore # perform one more forward-backward call and simply ignore the action before # resetting the environment. We need to pass in `terminal=False` here since # the *next* state, that is the state of the newly reset environment, is # always non-terminal by convention. self.forward(observation) # if self.memory.__class__.__name__ == 'PrioritizedMemory': # self.memory.append_with_error(observation) # if self.memory.__class__.__name__ == 'EfficientPriorizatedMemory': # self.memory.append(observation) self.backward(0., terminal=False) # This episode is finished, report and reset. episode_logs = { 'episode_reward': episode_reward, 'nb_episode_steps': episode_step, 'nb_steps': self.step, 'global_score': info["global_score"] } if self.memory.is_prioritized(): episode_logs['max_error_PER'] = self.memory.maximum episode_logs['average_error_PER'] = self.memory.average self.memory.reset_metrics() if starting_checkpoints: episode_logs['checkpoint'] = checkpoint callbacks.on_episode_end(episode, episode_logs) episode += 1 observation = None episode_step = None episode_reward = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) self._on_train_end() return history
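# A hedged sketch of the average-Q metric that the `avarage_q` option above describes
# (cf. "Playing Atari with Deep Reinforcement Learning"): average, over a fixed set of
# held-out states, of max_a Q(s, a). It assumes a keras-rl DQN-style agent exposing
# `compute_q_values`; treat it as illustrative only.
import numpy as np

def average_max_q(agent, held_out_states):
    q_maxes = [np.max(agent.compute_q_values(state)) for state in held_out_states]
    return float(np.mean(q_maxes))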
gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.)
# Prioritized Memories typically use lower learning rates
dqn.compile(Adam(lr=.00025 / 4), metrics=['mae'])

folder_path = './'
mode = 'train'
if mode == 'train':
    weights_filename = folder_path + 'pdd_dqn_{}_weights.h5f'.format(env_name)
    checkpoint_weights_filename = folder_path + 'pdd_dqn_' + env_name + '_weights_{step}.h5f'
    log_filename = folder_path + 'pdd_dqn_' + env_name + '_REWARD_DATA.txt'
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=500000)]
    callbacks += [TrainEpisodeLogger()]
    dqn.fit(env, callbacks=callbacks, nb_steps=10000000, verbose=0,
            nb_max_episode_steps=20000)
elif mode == 'test':
    weights_filename = folder_path + 'pdd_dqn_MsPacmanDeterministic-v4_weights_10000000.h5f'
    dqn.load_weights(weights_filename)
    dqn.test(env, nb_episodes=10, visualize=True, nb_max_start_steps=80)
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, useShaping=False, learnAMDP=False, stateToBucket=None, vae=None, shapingFunction=None, nb_max_episode_steps=None, projectionModel=None, episodeToBegin=0, stepToBegin=0, extraWarmup=0, doTraining=True): """Trains the agent on the given environment. # Arguments env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. nb_steps (integer): Number of training steps to be performed. action_repetition (integer): Number of times the agent repeats the same action without observing the environment again. Setting this to a value > 1 can be useful if a single action only has a very small effect on the environment. callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): List of callbacks to apply during training. See [callbacks](/callbacks) for details. verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging visualize (boolean): If `True`, the environment is visualized during training. However, this is likely going to slow down training significantly and is thus intended to be a debugging instrument. nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning of each episode using `start_step_policy`. Notice that this is an upper limit since the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] at the beginning of each episode. start_step_policy (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. nb_max_episode_steps (integer): Number of steps per episode that the agent performs before automatically resetting the environment. Set to `None` if each episode should run (potentially indefinitely) until the environment signals a terminal state. # Returns A `keras.callbacks.History` instance that recorded the entire training process. """ fittingMode = "" if not useShaping and not learnAMDP: fittingMode = "NoShaping" elif learnAMDP and not useShaping: fittingMode = "learnAMDP" elif learnAMDP and useShaping: fittingMode = "learnAndUseAMDP" elif useShaping and not shapingFunction is None and projectionModel is None: fittingMode = "useShapingFunction" elif useShaping and not projectionModel is None and shapingFunction is None: fittingMode = "useProjectionModel" else: raise Exception("Invalid Combination of Options") print("Fitting Mode Is:") print(fittingMode) if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.' 
) if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format( action_repetition)) self.useShaping = useShaping self.training = doTraining self.stateToBucket = stateToBucket if not projectionModel is None: self.projectionModel = projectionModel[0] self.projectionGraph = projectionModel[1] self.projectionSession = projectionModel[2] if not shapingFunction is None: self.shapingModel = shapingFunction[0] self.shapingGraph = shapingFunction[1] self.shapingSession = shapingFunction[2] sess = vae[0] vaeNetwork = vae[1] self.printVae = False self.extraWarmup = extraWarmup callbacks = [] if not callbacks else callbacks[:] if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] history = History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) self._on_train_begin() callbacks.on_train_begin() self.stepToBegin = stepToBegin self.episode = episodeToBegin self.step = stepToBegin self.neg_reward_counter = np.int16(0) self.max_neg_rewards = np.int16(12) observation = None previousObservation = None episode_reward = None episode_step = None did_abort = False if fittingMode in ["learnAMDP", "learnAndUseAMDP"]: self.amdp = deepAMDP(numberOfActions=env.action_space.n) latentStatesVisited = [] episodeStateHistory = [] episodeColourStateHistory = [] try: while self.step < nb_steps: if observation is None: # start of a new episode callbacks.on_episode_begin(self.episode) previousObservation = None episode_step = np.int16(0) episode_reward = np.float32(0) self.accumulatedExtrinsicReward = 0 self.accumulatedReward = 0 self.accumulatedSteps = 0 episodeStateHistory = [] episodeColourStateHistory = [] # Obtain the initial observation by resetting the environment. self.reset_states() observation = deepcopy(env.reset()) colourObservation = observation if self.processor is not None: observation = self.processor.process_observation( observation) assert observation is not None episodeStateHistory.append(observation) episodeColourStateHistory.append(colourObservation) # Perform random starts at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint( nb_max_start_steps) for _ in range(nb_random_start_steps): if start_step_policy is None: action = env.action_space.sample() else: action = start_step_policy(observation) if self.processor is not None: action = self.processor.process_action(action) callbacks.on_action_begin(action) observation, reward, done, info = env.step(action) observation = deepcopy(observation) colourObservation = observation if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) episodeStateHistory.append(observation) episodeColourStateHistory.append(colourObservation) callbacks.on_action_end(action) if done: warnings.warn( 'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.' 
.format(nb_random_start_steps)) observation = deepcopy(env.reset()) colourObservation = observation if self.processor is not None: observation = self.processor.process_observation( observation) break # At this point, we expect to be fully initialized. assert episode_reward is not None assert episode_step is not None assert observation is not None # Run a single step. callbacks.on_step_begin(episode_step) # This is were all of the work happens. We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). action = self.forward(observation) if self.processor is not None: action = self.processor.process_action(action) reward = np.float32(0) accumulated_info = {} done = False self.accumulatedExtrinsicReward = 0 ### #print(action_repetition) for _ in range(action_repetition): callbacks.on_action_begin(action) previousObservation = observation previousColourObservation = colourObservation observation, r, done, info = env.step(action) if self.printVae: sess = vae[0] vaeNetwork = vae[1] #print(vae.encoder(tf.image.resize_images(observation.reshape(1,96,96,3), [64, 64]))) obs = sess.run(vaeNetwork.z, feed_dict={ vaeNetwork.image: observation[None, :, :, :] }) #print(obs) latentStatesVisited.append(obs) self.accumulatedReward += r self.accumulatedSteps += 1 colourObservation = observation #self.colourMemory.append(colourObservation,0,0,0) #print(observation.shape) if self.processor is not None: observation, r, done, info = self.processor.process_step( observation, r, done, info) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(action) reward += r if done: break if fittingMode in [ "useProjectionModel", "useShapingFunction", "learnAMDP" ]: if fittingMode in ["useProjectionModel"]: if len(episodeStateHistory) < 4: if len(episodeStateHistory) == 0: stackedObservations = np.array([ np.zeros(observation.shape), np.zeros(observation.shape), np.zeros(observation.shape), observation ]) previousStackedObservations = np.array([ np.zeros(observation.shape), np.zeros(observation.shape), np.zeros(observation.shape), np.zeros(observation.shape) ]) elif len(episodeStateHistory) == 1: stackedObservations = np.array([ np.zeros(observation.shape), np.zeros(observation.shape), episodeStateHistory[-1], observation ]) previousStackedObservations = np.array([ np.zeros(observation.shape), np.zeros(observation.shape), np.zeros(observation.shape), episodeStateHistory[-1] ]) elif len(episodeStateHistory) == 2: stackedObservations = np.array([ np.zeros(observation.shape), episodeStateHistory[-2], episodeStateHistory[-1], observation ]) previousStackedObservations = np.array([ np.zeros(observation.shape), np.zeros(observation.shape), episodeStateHistory[-2], episodeStateHistory[-1] ]) elif len(episodeStateHistory) == 3: stackedObservations = np.array([ episodeStateHistory[-3], episodeStateHistory[-2], episodeStateHistory[-1], observation ]) previousStackedObservations = np.array([ np.zeros(observation.shape), episodeStateHistory[-3], episodeStateHistory[-2], episodeStateHistory[-1] ]) else: stackedObservations = np.array([ episodeStateHistory[-3], episodeStateHistory[-2], episodeStateHistory[-1], observation ]) previousStackedObservations = np.array([ episodeStateHistory[-4], episodeStateHistory[-3], episodeStateHistory[-2], episodeStateHistory[-1] ]) with self.projectionGraph.as_default(): with self.projectionSession.as_default(): 
potentialCurrentState = max( self.projectionModel.predict( np.array([stackedObservations]))[0]) potentialPreviousState = max( self.projectionModel.predict( np.array([previousStackedObservations ]))[0]) discountedDifference = self.gamma * potentialCurrentState - potentialPreviousState #print(discountedDifference) elif fittingMode in ["useShapingFunction", "learnAMDP"]: if len(episodeColourStateHistory) < 4: if len(episodeColourStateHistory) == 0: stackedObservations = np.array([ np.zeros(colourObservation.shape), np.zeros(colourObservation.shape), np.zeros(colourObservation.shape), colourObservation ]) previousStackedObservations = np.array([ np.zeros(colourObservation.shape), np.zeros(colourObservation.shape), np.zeros(colourObservation.shape), np.zeros(colourObservation.shape) ]) elif len(episodeColourStateHistory) == 1: stackedObservations = np.array([ np.zeros(colourObservation.shape), np.zeros(colourObservation.shape), episodeColourStateHistory[-1], colourObservation ]) previousStackedObservations = np.array([ np.zeros(colourObservation.shape), np.zeros(colourObservation.shape), np.zeros(colourObservation.shape), episodeColourStateHistory[-1] ]) elif len(episodeColourStateHistory) == 2: stackedObservations = np.array([ np.zeros(colourObservation.shape), episodeColourStateHistory[-2], episodeColourStateHistory[-1], colourObservation ]) previousStackedObservations = np.array([ np.zeros(colourObservation.shape), np.zeros(colourObservation.shape), episodeColourStateHistory[-2], episodeColourStateHistory[-1] ]) elif len(episodeColourStateHistory) == 3: stackedObservations = np.array([ episodeColourStateHistory[-3], episodeColourStateHistory[-2], episodeColourStateHistory[-1], colourObservation ]) previousStackedObservations = np.array([ np.zeros(colourObservation.shape), episodeColourStateHistory[-3], episodeColourStateHistory[-2], episodeColourStateHistory[-1] ]) else: stackedObservations = np.array([ episodeColourStateHistory[-3], episodeColourStateHistory[-2], episodeColourStateHistory[-1], colourObservation ]) previousStackedObservations = np.array([ episodeColourStateHistory[-4], episodeColourStateHistory[-3], episodeColourStateHistory[-2], episodeColourStateHistory[-1] ]) latentCurrentState = [ sess.run(vaeNetwork.z, feed_dict={ vaeNetwork.image: obs[None, :, :, :] }).tolist()[0] for obs in stackedObservations ] latentPreviousState = [ sess.run(vaeNetwork.z, feed_dict={ vaeNetwork.image: obs[None, :, :, :] }).tolist()[0] for obs in previousStackedObservations ] #latentPreviousState = list(chain.from_iterable(latentPreviousState)) if fittingMode in ["useShapingFunction"]: with self.shapingGraph.as_default(): with self.shapingSession.as_default(): #print(np.array(latentCurrentState).shape) latentCurrentState = np.array( latentCurrentState) latentPreviousState = np.array( latentPreviousState) latentCurrentState = latentCurrentState.reshape( (-1, 4, 32)) latentPreviousState = latentPreviousState.reshape( (-1, 4, 32)) #print(np.array(latentCurrentState).shape) potentialCurrentLatentState = max( self.shapingModel.predict( latentCurrentState)[0]) potentialPreviousLatentState = max( self.shapingModel.predict( latentPreviousState)[0]) #print(potentialCurrentLatentState, potentialPreviousLatentState) discountedDifference = self.gamma * potentialCurrentLatentState - potentialPreviousLatentState #discountedDifference = np.clip(discountedDifference, -10000, 10000) #print(discountedDifference) if fittingMode in ["learnAMDP"]: #print(latentCurrentState) # print(np.array(latentCurrentState).shape) 
self.amdp.addExperience( np.array(latentCurrentState), action, reward, done) # discountedDifference = self.gamma*potentialCurrentState-potentialPreviousState discountedDifference = 0 self.accumulatedExtrinsicReward = discountedDifference #print(self.accumulatedExtrinsicReward) early_done, punishment = self.check_early_stop( reward, episode_reward) if early_done: reward += punishment done = done or early_done if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: # Force a terminal state. done = True #if not currentAbstractState == previousAbstractState: #print(self.accumulatedExtrinsicReward) episodeStateHistory.append(observation) episodeColourStateHistory.append(colourObservation) if fittingMode in [ "learnAndUseAMDP", "useShapingFunction", "useProjectionModel" ]: #print(omega*self.accumulatedExtrinsicReward) #print(self.accumulatedExtrinsicReward) #print(self.accumulatedExtrinsicReward) metrics = self.backward( reward, reward + self.currentOmega * self.accumulatedExtrinsicReward, terminal=done) elif fittingMode in ["learnAMDP"]: metrics = self.backward(reward, reward, terminal=done) if self.step > self.nb_steps_warmup: self.amdp.replay() else: metrics = self.backward(reward, reward, terminal=done) # episode_reward += reward step_logs = { 'action': action, 'observation': observation, 'reward': reward, 'metrics': metrics, 'episode': self.episode, 'info': accumulated_info, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 self.step += 1 if done: # We are in a terminal state but the agent hasn't yet seen it. We therefore # perform one more forward-backward call and simply ignore the action before # resetting the environment. We need to pass in `terminal=False` here since # the *next* state, that is the state of the newly reset environment, is # always non-terminal by convention. self.forward(observation) self.backward(0., 0., terminal=False) # This episode is finished, report and reset. episode_logs = { 'episode_reward': episode_reward, 'nb_episode_steps': episode_step, 'nb_steps': self.step, } callbacks.on_episode_end(self.episode, episode_logs) self.episode += 1 if self.omegaStart > 0: self.currentOmega = max( self.omegaStart + (self.episode / self.omegaEpisodes) * (self.omegaEnd - self.omegaStart), self.omegaEnd) #if episode > 500: # self.currentOmega = 0 # self.omegaStart = 0 # self.omegaEnd = 0 #print(self.currentOmega) observation = None episode_step = None episode_reward = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) self._on_train_end() with open('latentVisited2.pickle', 'wb') as handle: pickle.dump(latentStatesVisited, handle, protocol=pickle.HIGHEST_PROTOCOL) return history
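# The shaping branches above compute a potential-based bonus as the discounted
# difference between the potentials of the current and previous 4-frame stacks,
# zero-padding the stacks at the start of an episode, and then add
# currentOmega * bonus to the environment reward. A minimal sketch of that idea,
# with a hypothetical `potential` function standing in for the projection/shaping
# model used above (not the actual model), could look like this:

import numpy as np
from collections import deque


def stack_frames(history, current, depth=4):
    # Build a (depth, *frame_shape) stack from the most recent frames,
    # zero-padding when fewer than `depth` frames have been observed.
    frames = list(history)[-(depth - 1):] + [current]
    pad = [np.zeros_like(current)] * (depth - len(frames))
    return np.array(pad + frames)


def shaping_bonus(potential, prev_stack, curr_stack, gamma=0.99):
    # Potential-based shaping term F(s, s') = gamma * phi(s') - phi(s).
    return gamma * potential(curr_stack) - potential(prev_stack)


# Usage sketch with made-up shapes and a toy potential:
potential = lambda stack: float(stack.mean())
frame_history = deque(maxlen=4)
prev_obs, obs = np.zeros((84, 84)), np.ones((84, 84))
prev_stack = stack_frames(frame_history, prev_obs)
frame_history.append(prev_obs)
curr_stack = stack_frames(frame_history, obs)
shaped_reward = 1.0 + 0.5 * shaping_bonus(potential, prev_stack, curr_stack)  # reward + omega * bonus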
def _run(self, env, nb_steps=None, nb_episodes=None, training=True, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None, reward_scaling=1.): """Trains the agent on the given environment. # Arguments env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. nb_steps (integer): Number of training steps to be performed. nb_episodes (integer): Number of episodes to perform training (boolean): Whether to train or test the agent action_repetition (integer): Number of times the agent repeats the same action without observing the environment again. Setting this to a value > 1 can be useful if a single action only has a very small effect on the environment. callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): List of callbacks to apply during training. See [callbacks](/callbacks) for details. verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging visualize (boolean): If `True`, the environment is visualized during training. However, this is likely going to slow down training significantly and is thus intended to be a debugging instrument. nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning of each episode using `start_step_policy`. Notice that this is an upper limit since the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] at the beginning of each episode. start_step_policy (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. nb_max_episode_steps (integer): Number of steps per episode that the agent performs before automatically resetting the environment. Set to `None` if each episode should run (potentially indefinitely) until the environment signals a terminal state. reward_scaling (float): The amount with which the reward will be scaled # Returns A `keras.callbacks.History` instance that recorded the entire training process. """ if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.' 
) if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format( action_repetition)) # Process the different cases when either nb_steps or nb_episodes are specified if (nb_steps is None and nb_episodes is None): raise (ValueError( "Please specify one (and only one) of nb_steps and nb_episodes" )) elif (nb_steps is not None and nb_episodes is None): termination_criterion = STEPS_TERMINATION elif (nb_steps is None and nb_episodes is not None): termination_criterion = EPISODES_TERMINATION elif (nb_steps is not None and nb_episodes is not None): raise (ValueError( "Please specify one (and only one) of nb_steps and nb_episodes" )) self.training = training # Initialize callbacks if callbacks is None: callbacks = [] if self.training: if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] else: if verbose >= 1: callbacks += [TestLogger()] callbacks = [] if not callbacks else callbacks[:] if visualize: callbacks += [Visualizer()] history = History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) if termination_criterion == STEPS_TERMINATION: params = { 'nb_steps': nb_steps, } elif termination_criterion == EPISODES_TERMINATION: params = { 'nb_episodes': nb_episodes, 'nb_steps': 1, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) # Initialize the Hooks hooks = Hooks(self, [TensorboardHook(), PortraitHook(), TrajectoryHook()]) # Define the termination criterion # Step and episode at which we satrt the function start_step = self.step start_episode = self.episode if termination_criterion == STEPS_TERMINATION: def termination(): return (self.step - start_step > nb_steps) elif termination_criterion == EPISODES_TERMINATION: def termination(): return (self.episode - start_episode > nb_episodes) if self.training: self._on_train_begin() else: self._on_test_begin() callbacks.on_train_begin() # Setup self.done = True did_abort = False # Define these for clarification, not mandatory: # Where observation_0: Observation before the step # observation_1: Observation after the step observation_0 = None observation_1 = None self.step_summaries = None try: # Run steps (and episodes) until the termination criterion is met while not (termination()): # Init episode # If we are at the beginning of a new episode, execute a startup sequence if self.done: self.episode += 1 self.episode_reward = 0. self.episode_step = 0 callbacks.on_episode_begin(self.episode) # Obtain the initial observation by resetting the environment. self.reset_states() observation_0 = deepcopy(env.reset()) assert observation_0 is not None # Perform random steps at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. if nb_max_start_steps != 0: observation_0 = self._perform_random_steps( nb_max_start_steps, start_step_policy, env, observation_0, callbacks) else: # We are in the middle of an episode # Update the observation observation_0 = observation_1 # Increment the episode step # FIXME: Use only one of the two variables self.observation = observation_0 # Increment the current step in both cases self.step += 1 self.episode_step += 1 self.reward = 0. accumulated_info = {} # Run a single step. callbacks.on_step_begin(self.episode_step) # This is were all of the work happens. 
We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). # state_0 -- (foward) --> action action = self.forward(observation_0) # Process the action action = self.processor.process_action(action) # action -- (step) --> (reward, state_1, terminal) # Apply the action # With repetition, if necesarry for _ in range(action_repetition): callbacks.on_action_begin(action) observation_1, r, self.done, info = env.step(action) # observation_1 = deepcopy(observation_1) observation_1, r, self.done, info = self.processor.process_step( observation_1, r, self.done, info) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(action) self.reward += r # Set episode as finished if the environment has terminated if self.done: break # Scale the reward self.reward = self.reward * reward_scaling self.episode_reward += self.reward # End of the step # Stop episode if reached the step limit if nb_max_episode_steps and self.episode_step >= nb_max_episode_steps: # Force a terminal state. self.done = True # Post step: training, callbacks and hooks # Train the algorithm metrics, self.step_summaries = self.backward( observation_0, action, self.reward, observation_1, terminal=self.done) # Hooks hooks() # Callbacks # Collect statistics step_logs = { 'action': action, 'observation': observation_1, 'reward': self.reward, 'metrics': metrics, 'episode': self.episode, 'info': accumulated_info, } callbacks.on_step_end(self.episode_step, step_logs) # Episodic callbacks if self.done: # Collect statistics episode_logs = { 'episode_reward': np.float_(self.episode_reward), 'nb_episode_steps': np.float_(self.episode_step), 'nb_steps': np.float_(self.step), } callbacks.on_episode_end(self.episode, logs=episode_logs) except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) self._on_train_end() return (history)
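# `_run` accepts either nb_steps or nb_episodes (never both) and captures the
# starting counters in a closure so the same loop body serves both termination
# modes. A stripped-down sketch of that selection logic, using a stand-in agent
# object with hypothetical `step`/`episode` attributes:

def make_termination(agent, nb_steps=None, nb_episodes=None):
    # Return a zero-argument callable that reports when training should stop.
    if (nb_steps is None) == (nb_episodes is None):
        raise ValueError("Please specify one (and only one) of nb_steps and nb_episodes")
    start_step = agent.step
    start_episode = agent.episode
    if nb_steps is not None:
        return lambda: agent.step - start_step > nb_steps
    return lambda: agent.episode - start_episode > nb_episodes


class _ToyAgent(object):
    step = 0
    episode = 0


agent = _ToyAgent()
termination = make_termination(agent, nb_steps=100)
while not termination():
    agent.step += 1  # the real loop would step the environment and train here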
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None, stepper=False): if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.' ) if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format( action_repetition)) self.training = True self.stepper = stepper callbacks = [] if not callbacks else callbacks[:] if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] history = History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) self._on_train_begin() callbacks.on_train_begin() episode = 0 self.step = 0 observation = None episode_reward = None episode_step = None did_abort = False try: while self.step < nb_steps: penalty = 0 if observation is None: # start of a new episode callbacks.on_episode_begin(episode) episode_step = 0 episode_reward = 0. # Obtain the initial observation by resetting the environment. self.reset_states() observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) assert observation is not None # Perform random starts at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint( nb_max_start_steps) for _ in range(nb_random_start_steps): if self.manual: action = int(raw_input("action?\n")) elif start_step_policy is None: action = env.action_space.sample() else: action = start_step_policy(observation) if self.shield is not None: if self.maze: inp = get_input_maze(observation) else: inp = get_input(observation) action_bin = to_bin(action) if not self.huge_neg: action = self.shield( inp[0], inp[1], inp[2], action_bin[0], action_bin[1], action_bin[2]) elif self.huge_neg: if to_int( self.shield( inp[0], inp[1], inp[2], action_bin[0], action_bin[1], action_bin[2])) != action: penalty = -10 if self.processor is not None: action = self.processor.process_action(action) callbacks.on_action_begin(action) if self.stepper: action = int(raw_input("action?\n")) observation, reward, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) callbacks.on_action_end(action) if done: warnings.warn( 'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.' .format(nb_random_start_steps)) observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) break # At this point, we expect to be fully initialized. assert episode_reward is not None assert episode_step is not None assert observation is not None # Run a single step. callbacks.on_step_begin(episode_step) # This is were all of the work happens. 
We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). #print observation if self.manual: oldaction = self.forward(observation, manual=True) elif self.preemptive: banned_actions = [] inp = get_input(observation) for an_action in range(0, 8): an_action_bin = to_bin(an_action) action = to_int( self.shield.move(inp[0], inp[1], inp[2], inp[3], an_action_bin[0], an_action_bin[1], an_action_bin[2])) if action != an_action: banned_actions.append(an_action) oldaction = self.forward(observation, manual=False, banned_actions=banned_actions) else: oldaction = self.forward(observation, manual=False) # print oldaction if self.shield is not None: if self.maze: inp = get_input_maze(observation) else: inp = get_input(observation) action_bin = to_bin(oldaction) #sleep(0.01) pass if self.preemptive: action = oldaction elif not self.huge_neg: action = to_int( self.shield.move(inp[0], inp[1], inp[2], inp[3], action_bin[0], action_bin[1], action_bin[2])) elif self.huge_neg: if to_int( self.shield(inp[0], inp[1], inp[2], action_bin[0], action_bin[1], action_bin[2])) != action: penalty = -10 action = oldaction else: action = oldaction #print action, oldaction if self.processor is not None: action = self.processor.process_action(action) reward = 0. accumulated_info = {} done = False for _ in range(action_repetition): callbacks.on_action_begin(action) observation, r, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, r, done, info = self.processor.process_step( observation, r + penalty, done, info) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(action) reward += r + penalty if done: break if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: # Force a terminal state. done = True metrics = self.backward(reward, terminal=done) episode_reward += reward step_logs = { 'action': action, 'observation': observation, 'reward': reward, 'metrics': metrics, 'episode': episode, 'info': accumulated_info, } oldstep_logs = { 'action': oldaction, 'observation': observation, 'reward': -1, 'metrics': metrics, 'episode': episode, 'info': accumulated_info, } # if correction: # callbacks.on_step_end(episode_step, oldstep_logs) # episode_step += 1 # self.step += 1 callbacks.on_step_end(episode_step, step_logs) episode_step += 1 self.step += 1 if done: # We are in a terminal state but the agent hasn't yet seen it. We therefore # perform one more forward-backward call and simply ignore the action before # resetting the environment. We need to pass in `terminal=False` here since # the *next* state, that is the state of the newly reset environment, is # always non-terminal by convention. self.forward(observation) self.backward(0., terminal=False) # This episode is finished, report and reset. episode_logs = { 'episode_reward': episode_reward, 'nb_episode_steps': episode_step, 'nb_steps': self.step, } callbacks.on_episode_end(episode, episode_logs) episode += 1 observation = None episode_step = None episode_reward = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) self._on_train_end() return history
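# In the shielded fit above, the shield either overrides the policy's action with
# a safe one, or (in huge_neg mode) keeps the policy's action and adds a fixed
# penalty whenever it disagrees with the shield. A minimal sketch of those two
# modes, with a toy safety rule standing in for the synthesized shield:

def apply_shield(proposed_action, safe_action, huge_neg=False, penalty=-10.0):
    # Returns (action_to_execute, extra_reward). Override mode executes the
    # shield's action; huge_neg mode keeps the proposal but penalises disagreement.
    if huge_neg:
        extra = penalty if proposed_action != safe_action else 0.0
        return proposed_action, extra
    return safe_action, 0.0


def toy_shield(action):
    # Hypothetical shield: action 3 is considered unsafe and remapped to 0.
    return 0 if action == 3 else action


proposed = 3
print(apply_shield(proposed, toy_shield(proposed)))                 # -> (0, 0.0)
print(apply_shield(proposed, toy_shield(proposed), huge_neg=True))  # -> (3, -10.0)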
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000,
               gamma=.9, enable_dueling_network=False, dueling_type='avg',
               target_model_update=1e-2, policy=policy)
# dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
#                enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=.001, decay=.001), metrics=['mae'])

rewards = []
callback = [TrainEpisodeLogger(), History()]
hist = dqn.fit(env, nb_steps=10000, visualize=False, verbose=2, callbacks=callback)
rewards.extend(hist.history.get('episode_reward', []))
plt.plot(rewards)

dqn.test(env, nb_episodes=5, visualize=True)

state = env.reset()
action = env.action_space.sample()
print(action)
state_list = []
for i in range(300):
    state_list.append(state)
    # action = np.argmax(dqn.model.predict(np.expand_dims(np.expand_dims(state, 0), 0))[0])
    state, reward, done, _ = env.step(2)  # fixed action for this rollout
    env.render()
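# Per-episode returns from fit() are noisy, so a rolling mean of
# hist.history['episode_reward'] is usually easier to read than the raw curve.
# A small sketch, assuming `hist` is the History returned by dqn.fit(...) above
# and that numpy/matplotlib are available:

import numpy as np
import matplotlib.pyplot as plt


def moving_average(values, window=10):
    # Trailing moving average; windows are truncated at the start of the series.
    values = np.asarray(values, dtype=float)
    return np.array([values[max(0, i - window + 1):i + 1].mean()
                     for i in range(len(values))])


episode_rewards = hist.history.get('episode_reward', [])
if episode_rewards:
    plt.plot(episode_rewards, alpha=0.3, label='episode reward')
    plt.plot(moving_average(episode_rewards, window=10), label='10-episode mean')
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.legend()
    plt.show()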
def fit_new(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None, arr=None): print 'FIT CHANGED ... Yayyyyy!!!!' """Trains the agent on the given environment. # Arguments env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. nb_steps (integer): Number of training steps to be performed. action_repetition (integer): Number of times the agent repeats the same action without observing the environment again. Setting this to a value > 1 can be useful if a single action only has a very small effect on the environment. callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): List of callbacks to apply during training. See [callbacks](/callbacks) for details. verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging visualize (boolean): If `True`, the environment is visualized during training. However, this is likely going to slow down training significantly and is thus intended to be a debugging instrument. nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning of each episode using `start_step_policy`. Notice that this is an upper limit since the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] at the beginning of each episode. start_step_policy (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. nb_max_episode_steps (integer): Number of steps per episode that the agent performs before automatically resetting the environment. Set to `None` if each episode should run (potentially indefinitely) until the environment signals a terminal state. # Returns A `keras.callbacks.History` instance that recorded the entire training process. """ if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.' ) if action_repetition < 1: raise ValueError( 'action_repetition must be >= 1, is {}'.format(action_repetition)) self.training = True callbacks = [] if not callbacks else callbacks[:] if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger()] if visualize: callbacks += [Visualizer()] history = History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) self._on_train_begin() callbacks.on_train_begin() episode = 0 self.step = 0 observation = None episode_reward = None episode_step = None did_abort = False try: while self.step < nb_steps: if observation is None: # start of a new episode callbacks.on_episode_begin(episode) episode_step = 0 episode_reward = 0. # Obtain the initial observation by resetting the environment. 
self.reset_states() observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) assert observation is not None ############### HERE ################## for ac in arr[:]: # print type(ac), ac if self.processor is not None: ac = self.processor.process_action(ac) callbacks.on_action_begin(ac) observation, reward, done, info = env.step(ac) observation = deepcopy(observation) if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) callbacks.on_action_end(ac) if done: #warnings.warn('Env ended before the deterministic non-neural steps could end.') observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) break ############# # Perform random starts at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint( nb_max_start_steps) for _ in range(nb_random_start_steps): if start_step_policy is None: action = env.action_space.sample() else: action = start_step_policy(observation) if self.processor is not None: action = self.processor.process_action(action) callbacks.on_action_begin(action) observation, reward, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) callbacks.on_action_end(action) if done: warnings.warn( 'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.' .format(nb_random_start_steps)) observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) break # At this point, we expect to be fully initialized. assert episode_reward is not None assert episode_step is not None assert observation is not None # Run a single step. callbacks.on_step_begin(episode_step) # This is were all of the work happens. We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). action = self.forward(observation) if self.processor is not None: action = self.processor.process_action(action) reward = 0. accumulated_info = {} done = False for _ in range(action_repetition): callbacks.on_action_begin(action) observation, r, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, r, done, info = self.processor.process_step( observation, r, done, info) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(action) reward += r if done: break if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: # Force a terminal state. done = True metrics = self.backward(reward, terminal=done) episode_reward += reward step_logs = { 'action': action, 'observation': observation, 'reward': reward, 'metrics': metrics, 'episode': episode, 'info': accumulated_info, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 self.step += 1 if done: # We are in a terminal state but the agent hasn't yet seen it. We therefore # perform one more forward-backward call and simply ignore the action before # resetting the environment. 
We need to pass in `terminal=False` here since # the *next* state, that is the state of the newly reset environment, is # always non-terminal by convention. self.forward(observation) self.backward(0., terminal=False) # This episode is finished, report and reset. episode_logs = { 'episode_reward': episode_reward, 'nb_episode_steps': episode_step, 'nb_steps': self.step, } callbacks.on_episode_end(episode, episode_logs) episode += 1 observation = None episode_step = None episode_reward = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True callbacks.on_train_end(logs={'did_abort': did_abort}) self._on_train_end() return history
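# fit_new replays a fixed action list (`arr`) at the start of every episode
# before handing control to the agent, without recording those transitions.
# The same effect can be obtained without modifying fit() by wrapping the
# environment so the prefix is applied on every reset. A sketch, assuming the
# 4-tuple (obs, reward, done, info) step API used throughout this file:

class ActionPrefixWrapper(object):
    def __init__(self, env, prefix_actions):
        self.env = env
        self.prefix_actions = list(prefix_actions)

    def reset(self):
        observation = self.env.reset()
        for action in self.prefix_actions:
            observation, _, done, _ = self.env.step(action)
            if done:
                # Mirror fit_new: if the prefix ends the episode, start over.
                observation = self.env.reset()
                break
        return observation

    def step(self, action):
        return self.env.step(action)

    def __getattr__(self, name):
        # Delegate everything else (action_space, render, ...) to the wrapped env.
        return getattr(self.env, name)

# Usage sketch: agent.fit(ActionPrefixWrapper(env, arr), nb_steps=...) would then
# work with the unmodified fit() instead of fit_new().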
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None, episode_averaging_length=10, success_threshold=None, stopping_patience=None, min_nb_steps=500, single_cycle=True): """Trains the agent on the given environment. # Arguments env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. nb_steps (integer): Number of training steps to be performed. action_repetition (integer): Number of times the agent repeats the same action without observing the environment again. Setting this to a value > 1 can be useful if a single action only has a very small effect on the environment. callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): List of callbacks to apply during training. See [callbacks](/callbacks) for details. verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging visualize (boolean): If `True`, the environment is visualized during training. However, this is likely going to slow down training significantly and is thus intended to be a debugging instrument. nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning of each episode using `start_step_policy`. Notice that this is an upper limit since the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] at the beginning of each episode. start_step_policy (`lambda observation: action`): The policy to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. nb_max_episode_steps (integer): Number of steps per episode that the agent performs before automatically resetting the environment. Set to `None` if each episode should run (potentially indefinitely) until the environment signals a terminal state. # Returns A `keras.callbacks.History` instance that recorded the entire training process. """ if not self.compiled: raise RuntimeError( 'Your tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.' ) if action_repetition < 1: raise ValueError('action_repetition must be >= 1, is {}'.format( action_repetition)) self.training = True callbacks = [] if not callbacks else callbacks[:] for cb in callbacks: if isinstance(cb, FileLogger): save_path = cb.filepath folder_index = save_path.index("training_history.json") weights_file = os.path.join(save_path[:folder_index], "dqn_weights.h5f") if verbose == 1: callbacks += [TrainIntervalLogger(interval=log_interval)] elif verbose > 1: callbacks += [TrainEpisodeLogger(interval=log_interval)] if visualize: callbacks += [Visualizer()] history = History() callbacks += [history] callbacks = CallbackList(callbacks) if hasattr(callbacks, 'set_model'): callbacks.set_model(self) else: callbacks._set_model(self) callbacks._set_env(env) params = { 'nb_steps': nb_steps, } if hasattr(callbacks, 'set_params'): callbacks.set_params(params) else: callbacks._set_params(params) self._on_train_begin() callbacks.on_train_begin() episode = np.int16(0) self.step = np.int16(0) observation = None episode_reward = None episode_step = None episode_num_errors = None did_abort = False # ------ Early stopping and reporting averages ------------------ # # It would be ideal to do this via a callback, but returning flags from callbacks seems tricky. Eish! 
# So, we automatically include early stopping here in the fit method. # NB: We have hardcoded in something which is probably not ideal to hard code, but I just want it # to work, and can fix things and make them nicer/more flexible at a later stage! # # -------------------------------------------------------------- if not single_cycle: recent_episode_lifetimes = deque([], episode_averaging_length) episode_lifetimes_rolling_avg = 0 best_rolling_avg = 0 best_episode = 0 time_since_best = 0 elif single_cycle: recent_episode_wins = deque([], episode_averaging_length) best_rolling_avg = 0 best_episode = 0 time_since_best = 0 rolling_win_fraction = 0 stop_training = False has_succeeded = False stopped_improving = False try: while self.step < nb_steps and not stop_training: if observation is None: # start of a new episode callbacks.on_episode_begin(episode) episode_step = np.int16(0) episode_reward = np.float32(0) # Obtain the initial observation by resetting the environment. self.reset_states() observation = deepcopy(env.reset()) # print("Episode Step:", episode_step) # print("hidden state: ") # print(env.hidden_state) # print("Board State: ") # print(observation) if self.processor is not None: observation = self.processor.process_observation( observation) assert observation is not None # Perform random starts at beginning of episode and do not record them into the experience. # This slightly changes the start position between games. nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint( nb_max_start_steps) for _ in range(nb_random_start_steps): if start_step_policy is None: action = env.action_space.sample() else: action = start_step_policy(observation) if self.processor is not None: action = self.processor.process_action(action) callbacks.on_action_begin(action) observation, reward, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, reward, done, info = self.processor.process_step( observation, reward, done, info) callbacks.on_action_end(action) if done: warnings.warn( 'Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.' .format(nb_random_start_steps)) observation = deepcopy(env.reset()) if self.processor is not None: observation = self.processor.process_observation( observation) break # At this point, we expect to be fully initialized. assert episode_reward is not None assert episode_step is not None assert observation is not None # print("Episode Step:", episode_step) # Run a single step. callbacks.on_step_begin(episode_step) # This is were all of the work happens. We first perceive and compute the action # (forward step) and then use the reward to improve (backward step). 
if hasattr(env, "legal_actions"): legal_actions = list(env.legal_actions) action = self.forward(observation, legal_actions) # print("legal actions: ", legal_actions) # print("chosen action: ", action) else: action = self.forward(observation) if self.processor is not None: action = self.processor.process_action(action) reward = np.float32(0) accumulated_info = {} done = False for _ in range(action_repetition): callbacks.on_action_begin(action) observation, r, done, info = env.step(action) observation = deepcopy(observation) if self.processor is not None: observation, r, done, info = self.processor.process_step( observation, r, done, info) for key, value in info.items(): if not np.isreal(value): continue if key not in accumulated_info: accumulated_info[key] = np.zeros_like(value) accumulated_info[key] += value callbacks.on_action_end(action) reward += r if done: break if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: # Force a terminal state. done = True metrics = self.backward(reward, terminal=done) episode_reward += reward # print("new hidden state: ") # print(env.hidden_state) # print("new board state: ") # print(observation) # print("reward: ", r, "episode reward: ", episode_reward) # print("done: ", done) step_logs = { 'action': action, 'observation': observation, 'reward': reward, 'metrics': metrics, 'episode': episode, 'info': accumulated_info, } callbacks.on_step_end(episode_step, step_logs) episode_step += 1 self.step += 1 if done: # We are in a terminal state but the agent hasn't yet seen it. We therefore # perform one more forward-backward call and simply ignore the action before # resetting the environment. We need to pass in `terminal=False` here since # the *next* state, that is the state of the newly reset environment, is # always non-terminal by convention. action = self.forward(observation) self.backward(0., terminal=False) # Now we want to work out the recent averages, this will go into early stopping if not single_cycle: recent_episode_lifetimes.append(env.lifetime) episode_lifetimes_rolling_avg = np.mean( recent_episode_lifetimes) if episode_lifetimes_rolling_avg > best_rolling_avg: best_rolling_avg = episode_lifetimes_rolling_avg best_episode = episode time_since_best = 0 else: time_since_best = episode - best_episode if episode_lifetimes_rolling_avg > success_threshold: stop_training = True has_succeeded = True if self.step > min_nb_steps and time_since_best > stopping_patience: stop_training = True stopped_improving = True else: if episode_reward == 1: recent_episode_wins.append(1) else: recent_episode_wins.append(0) num_wins = np.sum(recent_episode_wins) rolling_win_fraction = num_wins / episode_averaging_length if rolling_win_fraction > best_rolling_avg: best_rolling_avg = rolling_win_fraction best_episode = episode time_since_best = 0 # Here I need to add something to save the net - I'm worried this will make things really slow while its improving, because it will be saving every time # For a long time. Eish! if self.step > min_nb_steps: self.save_weights(weights_file, overwrite=True) else: time_since_best = episode - best_episode if rolling_win_fraction > success_threshold: stop_training = True has_succeeded = True if self.step > min_nb_steps and time_since_best > stopping_patience: stop_training = True stopped_improving = True # This episode is finished, report and reset. 
if not single_cycle: episode_logs = { 'episode_reward': episode_reward, 'nb_episode_steps': episode_step, 'nb_steps': self.step, 'episode_lifetimes_rolling_avg': episode_lifetimes_rolling_avg, 'best_rolling_avg': best_rolling_avg, 'best_episode': best_episode, 'time_since_best': time_since_best, 'has_succeeded': has_succeeded, 'stopped_improving': stopped_improving } else: episode_logs = { 'episode_reward': episode_reward, 'nb_episode_steps': episode_step, 'nb_steps': self.step, 'rolling_win_fraction': rolling_win_fraction, 'best_rolling_fraction': best_rolling_avg, 'best_episode': best_episode, 'time_since_best': time_since_best, 'has_succeeded': has_succeeded, 'stopped_improving': stopped_improving } callbacks.on_episode_end(episode, episode_logs, single_cycle) episode += 1 observation = None episode_step = None episode_reward = None except KeyboardInterrupt: # We catch keyboard interrupts here so that training can be safely aborted. # This is so common that we've built this right into this function, which ensures that # the `on_train_end` method is properly called. did_abort = True if not single_cycle: callbacks.on_train_end(logs={ 'did_abort': did_abort, 'has_succeeded': has_succeeded, 'stopped_improving': stopped_improving, 'episode_lifetimes_rolling_avg': episode_lifetimes_rolling_avg, 'step': self.step }, single_cycle=single_cycle) else: callbacks.on_train_end(logs={ 'did_abort': did_abort, 'has_succeeded': has_succeeded, 'stopped_improving': stopped_improving, 'rolling_win_fraction': rolling_win_fraction, 'step': self.step }, single_cycle=single_cycle) self._on_train_end() return history
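# The early-stopping bookkeeping above (rolling window of recent wins, best
# rolling fraction, patience counter) can be isolated into a small helper that
# is easier to test than flags threaded through the training loop. A sketch of
# the single_cycle ("win fraction") mode, with parameter names mirroring the
# arguments above but otherwise hypothetical:

from collections import deque
import numpy as np


class EarlyStopper(object):
    def __init__(self, window=10, success_threshold=0.9, patience=200, min_steps=500):
        self.recent_wins = deque([], maxlen=window)
        self.window = window
        self.success_threshold = success_threshold
        self.patience = patience
        self.min_steps = min_steps
        self.best_fraction = 0.0
        self.best_episode = 0

    def update(self, episode, step, won):
        # Record the episode outcome and return (stop_training, win_fraction).
        self.recent_wins.append(1 if won else 0)
        fraction = float(np.sum(self.recent_wins)) / self.window
        if fraction > self.best_fraction:
            self.best_fraction = fraction
            self.best_episode = episode
        time_since_best = episode - self.best_episode
        has_succeeded = fraction > self.success_threshold
        stopped_improving = step > self.min_steps and time_since_best > self.patience
        return has_succeeded or stopped_improving, fraction


# Usage sketch:
stopper = EarlyStopper(window=10, success_threshold=0.8, patience=50, min_steps=100)
stop, frac = stopper.update(episode=1, step=10, won=True)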