def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None,
         visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0,
         start_step_policy=None, verbose=1, plt=""):
    if not self.compiled:
        raise RuntimeError(
            'You tried to test your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `test()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]
    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset(test=True))
        nb_max_episode_steps = (env.num_steps - 1)
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done = self.processor.process_step(observation, r, done)
            callbacks.on_action_end(action)
            if done:
                warnings.warn('Env ended before {} random steps could be performed at the start. '
                              'You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                observation = deepcopy(env.reset())
                nb_max_episode_steps = (env.num_steps - 1)
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            # accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, d = self.processor.process_step(observation, r, d)
                callbacks.on_action_end(action)
                reward += r
                # for key, value in info.items():
                #     if not np.isreal(value):
                #         continue
                #     if key not in accumulated_info:
                #         accumulated_info[key] = np.zeros_like(value)
                #     accumulated_info[key] += value
                if d:
                    done = True
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.backward(reward, terminal=done)
            episode_reward += reward
            # print(episode_reward)

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'date': env.date,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
            'date': env.date,
        }
        callbacks.on_episode_end(episode, env, episode_logs)
        if (episode != nb_episodes - 1) and (plt != ""):
            env.graph(plt)
    if plt != "":
        env.graph(plt, end=True)
    callbacks.on_train_end()
    self._on_test_end()

    return history
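# Hedged usage sketch (not part of the original source): shows how the test()
# variant above might be invoked. `agent` is assumed to be an instance of the class
# defining test(), and `env` a stock environment exposing reset(test=True), a
# 3-tuple step(), num_steps, date, and graph(prefix, end=...), as used inside test().
def run_backtest(agent, env, plot_prefix="plots/eval_"):
    # One pass over a handful of evaluation episodes; plots are written via env.graph().
    history = agent.test(
        env,
        nb_episodes=5,
        action_repetition=1,
        visualize=False,     # skip the Visualizer callback
        plt=plot_prefix,     # per-episode plots, final call with end=True
        verbose=1,
    )
    return history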
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=10000, nb_max_episode_steps=None):
    """Trains the agent on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_steps (integer): Number of training steps to be performed.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during training. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`),
            2 for episode logging.
        visualize (boolean): If `True`, the environment is visualized during training. However,
            this is likely going to slow down training significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Number of maximum steps that the agent performs at the
            beginning of each episode using `start_step_policy`. Notice that this is an upper
            limit since the exact number of steps to be performed is sampled uniformly from
            [0, max_start_steps] at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy to follow if
            `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to
            be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs
            before automatically resetting the environment. Set to `None` if each episode
            should run (potentially indefinitely) until the environment signals a terminal state.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire training process.
    """
    if not self.compiled:
        raise RuntimeError(
            'You tried to fit your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = np.int16(0)
    self.step = np.int16(0)
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = np.int16(0)
                episode_reward = np.float32(0)

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                assert observation is not None

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done, info = self.processor.process_step(observation, reward, done, info)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. '
                                      'You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        if self.processor is not None:
                            observation = self.processor.process_observation(observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = np.float32(0)
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done, info = self.processor.process_step(observation, r, done, info)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
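# Hedged sketch (not part of the original source) of the processor hooks that
# fit()/test() call when `self.processor` is set. It assumes the keras-rl style
# `rl.core.Processor` base class, whose process_step() delegates to the hooks below;
# the class name and the clipping threshold are illustrative assumptions.
import numpy as np
from rl.core import Processor


class ClippingProcessor(Processor):
    def process_observation(self, observation):
        # Cast observations to float32 before forward()/backward() see them.
        return np.asarray(observation, dtype=np.float32)

    def process_reward(self, reward):
        # Clip rewards to [-1, 1] during training.
        return np.clip(reward, -1., 1.)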
    train_sample_size=50,
    train_rounds=40,
    trains_between_updates=1
)

def updateState(self, observation):
    return self.Env.validActions()


# Get the environment and extract the number of actions.
stock = Stock(100.0, 0.2, 0.0, 252)
env = Stock3d2uEnv(stock, 1000.0, 0.0, 252)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = create_model(env.observation_space.shape[-1], env.action_space.n)

qnet = QNet(model, soft_update=0.01)
qnet.compile(Adagrad(), ["mse"])

agent = PredictedAgent(env, qnet)

for i in range(1000):
    agent.fit(max_steps=1000, callbacks=[TrainCallback()])
    agent.TrainPolicy.Epsilon = [0.5, 0.1][i % 2]
    print("epsilon:", agent.TrainPolicy.Epsilon)
    agent.test(max_episodes=1, callbacks=[TestLogger(), ActionCallback()])

agent.test(max_episodes=10, callbacks=[Visualizer(), TestLogger(), ActionCallback()])
def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
        visualize=False, nb_max_start_steps=0, start_step_policy=None,
        log_interval=10000, nb_max_episode_steps=None):
    # env.close()
    if not self.compiled:
        raise RuntimeError(
            'You tried to fit your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `fit()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = True

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_steps': nb_steps,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    self._on_train_begin()
    callbacks.on_train_begin()

    episode = np.int16(0)
    self.step = np.int16(0)
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < nb_steps:
            # nb_max_episode_steps = env.num_steps - 1
            if observation is None:  # start of a new episode
                callbacks.on_episode_begin(episode)
                episode_step = np.int16(0)
                episode_reward = np.float32(0)

                # Obtain the initial observation by resetting the environment.
                self.reset_states()
                observation = deepcopy(env.reset())
                nb_max_episode_steps = (env.num_steps - 1)
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                assert observation is not None

                # Perform random starts at beginning of episode and do not record them into the experience.
                # This slightly changes the start position between games.
                nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                for _ in range(nb_random_start_steps):
                    if start_step_policy is None:
                        action = env.action_space.sample()
                    else:
                        action = start_step_policy(observation)
                    if self.processor is not None:
                        action = self.processor.process_action(action)
                    callbacks.on_action_begin(action)
                    observation, reward, done = env.step(action)
                    # print("action")
                    observation = deepcopy(observation)
                    if self.processor is not None:
                        observation, reward, done = self.processor.process_step(observation, reward, done)
                    callbacks.on_action_end(action)
                    if done:
                        warnings.warn('Env ended before {} random steps could be performed at the start. '
                                      'You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                        observation = deepcopy(env.reset())
                        nb_max_episode_steps = (env.num_steps - 1)
                        if self.processor is not None:
                            observation = self.processor.process_observation(observation)
                        break

            # At this point, we expect to be fully initialized.
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Run a single step.
            callbacks.on_step_begin(episode_step)
            # This is where all of the work happens. We first perceive and compute the action
            # (forward step) and then use the reward to improve (backward step).
            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = np.float32(0)
            # accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done = env.step(action)
                # print(observation, r, done)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, done = self.processor.process_step(observation, r, done)
                # for key, value in info.items():
                #     if not np.isreal(value):
                #         continue
                #     if key not in accumulated_info:
                #         accumulated_info[key] = np.zeros_like(value)
                #     accumulated_info[key] += value
                callbacks.on_action_end(action)
                # print(r)
                reward += r
                if done:
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                # Force a terminal state.
                done = True
            metrics = self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'date': env.date,
            }
            # print(episode_reward)
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1

            if done:
                # We are in a terminal state but the agent hasn't yet seen it. We therefore
                # perform one more forward-backward call and simply ignore the action before
                # resetting the environment. We need to pass in `terminal=False` here since
                # the *next* state, that is the state of the newly reset environment, is
                # always non-terminal by convention.
                self.forward(observation)
                self.backward(0., terminal=False)

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
                env.sim.report()
                env.sim.close()
            self.step += 1
    except KeyboardInterrupt:
        # We catch keyboard interrupts here so that training can be safely aborted.
        # This is so common that we've built this right into this function, which ensures that
        # the `on_train_end` method is properly called.
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()
    # env.graph("plots/test_plot_")

    return history
def evaluate(self, env, num_episodes, action_repetition=1, max_episode_length=None,
             num_burn_in=10, callbacks=None, verbose=1, visualize=False):
    """Test your agent with a provided environment.

    You shouldn't update your network parameters here. Also, if you have any layers
    that vary in behavior between train/test time (such as dropout or batch norm),
    you should set them to test mode.

    Basically, run your policy on the environment and collect stats like cumulative
    reward, average episode length, etc. You can also call the render function here
    if you want to visually inspect your policy.
    """
    if not self.compiled:
        raise RuntimeError(
            'You tried to test your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `test()`.')

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]
    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'num_episodes': num_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(num_episodes):
        # New episode.
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset())
        assert observation is not None

        # Use `num_burn_in` random actions to slightly change the starting position
        # at the beginning of the game.
        print('Performing random actions to change the starting position at the beginning')
        for i in range(num_burn_in):
            action = env.action_space.sample()
            callbacks.on_action_begin(action)
            observation, reward, done, info = env.step(action)
            observation = deepcopy(observation)
            callbacks.on_action_end(action)
            if done:
                warnings.warn('Env ended before {} random steps could be performed at the start. '
                              'You should probably lower the `num_burn_in` parameter.'.format(num_burn_in))
                observation = deepcopy(env.reset())
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            # Cache the observation before acting so it can be saved to memory.
            observation_tmp = self.preprocessor.Atari.process_state_for_memory(observation)
            action = self.select_action(observation_tmp)
            # print(env.get_action_meanings()[action])

            # Action repetition is for skipping frames by executing the same action multiple
            # times, but since we are using a -v0 environment there is no need to skip frames
            # manually because random skipping is performed by the environment itself.
            reward = 0.
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, info = env.step(action)
                observation = deepcopy(observation)
                callbacks.on_action_end(action)
                reward += r
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                if d:
                    done = True
                    break
            if max_episode_length and episode_step >= max_episode_length - 1:
                done = True
            episode_reward += reward  # reward is not clipped in the evaluation

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
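# Hedged usage sketch (not part of the original source): evaluate() as defined above
# expects an already-compiled agent and a Gym-style environment whose step() returns
# (observation, reward, done, info). The argument values below are illustrative assumptions.
def run_evaluation(agent, env, episodes=20):
    history = agent.evaluate(
        env,
        num_episodes=episodes,
        action_repetition=1,       # frame skipping is left to the -v0 environment itself
        num_burn_in=10,            # 10 random actions to randomize the start position
        max_episode_length=10000,  # hard cap per episode
        verbose=1,
        visualize=False,
    )
    return history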
def fit(self, env, callbacks, num_iterations, action_repetition=1, max_episode_length=None,
        log_interval=10000, verbose=1, visualize=False, validation_data=None):
    """Fit your model to the provided environment.

    It's a good idea to print out things like loss, average reward, Q-values, etc.
    to see if your agent is actually improving. You should probably also periodically
    save your network weights and any other useful info.

    This is where you should sample actions from your network, collect experience
    samples and add them to your replay memory, and update your network parameters.

    Parameters
    ----------
    env: gym.Env
        This is your Atari environment. You should wrap the environment using the
        wrap_atari_env function in utils.py.
    num_iterations: int
        How many samples/updates to perform.
    max_episode_length: int
        How long a single episode should last before the agent resets. Can help exploration.
    """
    if not self.compiled:
        raise RuntimeError(
            'You tried to fit your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `fit()`.')

    self.training = True
    self.validation_data = validation_data  # for the callback to record the log

    callbacks = [] if not callbacks else callbacks[:]
    if verbose == 1:
        callbacks += [TrainIntervalLogger(interval=log_interval)]
    elif verbose > 1:
        callbacks += [TrainEpisodeLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'num_iterations': num_iterations,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)
    callbacks.on_train_begin()

    # Start training.
    episode = 0
    self.step = 0
    observation = None
    episode_reward = None
    episode_step = None
    did_abort = False
    try:
        while self.step < num_iterations:
            if observation is None:
                # New episode.
                callbacks.on_episode_begin(episode)
                episode_step = 0
                episode_reward = 0.
                self.reset_states()
                observation = deepcopy(env.reset())
                assert observation is not None
            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None

            # Execute a new step.
            callbacks.on_step_begin(episode_step)
            # Cache the observation before acting so it can be saved to memory.
            observation_tmp = self.preprocessor.Atari.process_state_for_memory(observation)
            # Run the network forward to get an action.
            action = self.select_action(observation_tmp)

            # Action repetition is for skipping frames by executing the same action multiple
            # times, but since we are using a -v0 environment there is no need to skip frames
            # manually because random skipping is performed by the environment itself.
            reward = 0.
            accumulated_info = {}
            done = False
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, done, info = env.step(action)
                observation = deepcopy(observation)
                r = self.preprocessor.Atari.process_reward(r)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward += r
                if done:
                    break
            if max_episode_length and episode_step >= max_episode_length - 1:
                # Force a terminal state.
                done = True

            # Save the current tuple to memory.
            self.memory.append(observation_tmp, action, reward, done)
            metrics = self.update_policy()
            weights = self.q_network.get_weights()
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'metrics': metrics,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            callbacks.pass_loss(self.step, step_logs)
            episode_step += 1
            self.step += 1

            if done:
                # One more step to start a new episode: cache the terminal observation for
                # the memory and reset the frame history.
                observation_tmp = self.preprocessor.Atari.process_state_for_memory(observation)
                action = self.select_action(observation_tmp)
                self.update_policy()
                self.memory.append(observation_tmp, action, 0., False)
                self.preprocessor.History.reset()

                # This episode is finished, report and reset.
                episode_logs = {
                    'episode_reward': episode_reward,
                    'nb_episode_steps': episode_step,
                    'nb_steps': self.step,
                }
                callbacks.on_episode_end(episode, episode_logs)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
    except KeyboardInterrupt:
        did_abort = True
    callbacks.on_train_end(logs={'did_abort': did_abort})
    self._on_train_end()

    return history
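# Hedged usage sketch (not part of the original source): one possible way to drive
# the Atari fit() loop above. The environment id, the step budget, and the `agent`
# construction are assumptions; only the keyword arguments mirror the signature above.
import gym


def train_on_atari(agent, num_iterations=1000000):
    env = gym.make('Breakout-v0')   # a -v0 env applies random frame skipping itself
    history = agent.fit(
        env,
        callbacks=None,             # TrainIntervalLogger/History are added inside fit()
        num_iterations=num_iterations,
        max_episode_length=None,    # let the environment signal terminal states
        log_interval=10000,
        verbose=1,
    )
    return history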
def updateState(self, observation):
    return self.Env.validActions()


# Get the environment and extract the number of actions.
stock = Stock(100.0, 0.2, 0.0, 252)
env = PredictedEnv(stock, 1000.0, 0.0, 252)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = create_model(env.observation_space.shape[-1], env.action_space.n)

qnet = QNet(model)
qnet.compile(Adagrad(), ["mse"])

agent = PredictedAgent(env, qnet)

for _ in range(2000):
    agent.fit(max_steps=50000)
    agent.TrainPolicy.Epsilon = max(agent.TrainPolicy.Epsilon * 0.8, 0.1)
    print("epsilon:", agent.TrainPolicy.Epsilon)
    agent.test(max_episodes=1, callbacks=[TestLogger(), ActionCallback()])

agent.test(max_episodes=10, callbacks=[Visualizer(), TestLogger(), ActionCallback()])
def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True,
         nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1):
    """Tests the agent on the given environment.

    # Arguments
        env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
        nb_episodes (integer): Number of episodes to perform.
        action_repetition (integer): Number of times the agent repeats the same action without
            observing the environment again. Setting this to a value > 1 can be useful
            if a single action only has a very small effect on the environment.
        callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
            List of callbacks to apply during testing. See [callbacks](/callbacks) for details.
        verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`),
            2 for episode logging.
        visualize (boolean): If `True`, the environment is visualized during testing. However,
            this is likely going to slow down testing significantly and is thus intended to be
            a debugging instrument.
        nb_max_start_steps (integer): Number of maximum steps that the agent performs at the
            beginning of each episode using `start_step_policy`. Notice that this is an upper
            limit since the exact number of steps to be performed is sampled uniformly from
            [0, max_start_steps] at the beginning of each episode.
        start_step_policy (`lambda observation: action`): The policy to follow if
            `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
        log_interval (integer): If `verbose` = 1, the number of steps that are considered to
            be an interval.
        nb_max_episode_steps (integer): Number of steps per episode that the agent performs
            before automatically resetting the environment. Set to `None` if each episode
            should run (potentially indefinitely) until the environment signals a terminal state.

    # Returns
        A `keras.callbacks.History` instance that recorded the entire test process.
    """
    if not self.compiled:
        raise RuntimeError(
            'You tried to test your agent but it hasn\'t been compiled yet. '
            'Please call `compile()` before `test()`.')
    if action_repetition < 1:
        raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

    self.training = False
    self.step = 0

    callbacks = [] if not callbacks else callbacks[:]
    if verbose >= 1:
        callbacks += [TestLogger()]
    if visualize:
        callbacks += [Visualizer()]
    history = History()
    callbacks += [history]
    callbacks = CallbackList(callbacks)
    if hasattr(callbacks, 'set_model'):
        callbacks.set_model(self)
    else:
        callbacks._set_model(self)
    callbacks._set_env(env)
    params = {
        'nb_episodes': nb_episodes,
    }
    if hasattr(callbacks, 'set_params'):
        callbacks.set_params(params)
    else:
        callbacks._set_params(params)

    self._on_test_begin()
    callbacks.on_train_begin()
    for episode in range(nb_episodes):
        callbacks.on_episode_begin(episode)
        episode_reward = 0.
        episode_step = 0

        # Obtain the initial observation by resetting the environment.
        self.reset_states()
        observation = deepcopy(env.reset())
        if self.processor is not None:
            observation = self.processor.process_observation(observation)
        assert observation is not None

        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
        for _ in range(nb_random_start_steps):
            if start_step_policy is None:
                action = env.action_space.sample()
            else:
                action = start_step_policy(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            callbacks.on_action_begin(action)
            observation, r, done, info = env.step(action)
            observation = deepcopy(observation)
            if self.processor is not None:
                observation, r, done, info = self.processor.process_step(observation, r, done, info)
            callbacks.on_action_end(action)
            if done:
                warnings.warn('Env ended before {} random steps could be performed at the start. '
                              'You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps))
                observation = deepcopy(env.reset())
                if self.processor is not None:
                    observation = self.processor.process_observation(observation)
                break

        # Run the episode until we're done.
        done = False
        while not done:
            callbacks.on_step_begin(episode_step)

            action = self.forward(observation)
            if self.processor is not None:
                action = self.processor.process_action(action)
            reward = 0.
            accumulated_info = {}
            for _ in range(action_repetition):
                callbacks.on_action_begin(action)
                observation, r, d, info = env.step(action)
                observation = deepcopy(observation)
                if self.processor is not None:
                    observation, r, d, info = self.processor.process_step(observation, r, d, info)
                callbacks.on_action_end(action)
                reward += r
                # for key, value in info.items():
                #     if not np.isreal(value):
                #         continue
                #     if key not in accumulated_info:
                #         accumulated_info[key] = np.zeros_like(value)
                #     accumulated_info[key] += value
                if d:
                    done = True
                    break
            if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                done = True
            self.backward(reward, terminal=done)
            episode_reward += reward

            step_logs = {
                'action': action,
                'observation': observation,
                'reward': reward,
                'episode': episode,
                'info': accumulated_info,
            }
            callbacks.on_step_end(episode_step, step_logs)
            episode_step += 1
            self.step += 1

        # We are in a terminal state but the agent hasn't yet seen it. We therefore
        # perform one more forward-backward call and simply ignore the action before
        # resetting the environment. We need to pass in `terminal=False` here since
        # the *next* state, that is the state of the newly reset environment, is
        # always non-terminal by convention.
        self.forward(observation)
        self.backward(0., terminal=False)

        # Report end of episode.
        episode_logs = {
            'episode_reward': episode_reward,
            'nb_steps': episode_step,
        }
        callbacks.on_episode_end(episode, episode_logs)
    callbacks.on_train_end()
    self._on_test_end()

    return history
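# Hedged sketch (not part of the original source) of the random-start machinery
# documented above: nb_max_start_steps bounds how many start steps are sampled, and
# start_step_policy picks the action for each of them. The NOOP index 0 and the
# `agent`/`env` names are illustrative assumptions.
def test_with_noop_starts(agent, env, episodes=10):
    history = agent.test(
        env,
        nb_episodes=episodes,
        nb_max_start_steps=30,                    # up to 30 start steps, sampled uniformly
        start_step_policy=lambda observation: 0,  # e.g. force NOOPs before the policy acts
        nb_max_episode_steps=10000,
        visualize=False,
    )
    return history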
    dqn.fit(
        env,
        callbacks=callbacks,
        nb_steps=TRAINING_STEPS,
        log_interval=10000
    )

    # After training is done, we save the final weights.
    dqn.save_weights(WEIGHT_FINAL_FILE, overwrite=True)
elif MODE == 'vis':
    dqn.load_weights(WEIGHT_FINAL_FILE)
    vis = Visualizer(name=MODEL_NAME)
    dqn.test(env, nb_episodes=100, visualize=False, callbacks=[vis])
    vis.write()
elif MODE == 'test':
    from glob import glob
    import re

    patt = './{}/{}_*.h5f'.format(MODEL_DIR, MODEL_NAME)
    print(patt)
    # Every filename should contain a number denoting the step.
    ws = sorted(glob(patt), key=lambda fn: int(re.findall(r"(\d+)\.h5f", fn)[0]))
    epochtest = EpochTest(MODEL_NAME)
    vis = Visualizer(name=MODEL_NAME)
    for w in ws:
        epoch = int(re.findall(r"(\d+)\.h5f", w)[0])
        print('Epoch: {}'.format(epoch))