class DQNAdaptive(object):
    """Adaptive which uses the DQN algorithm"""

    def __init__(self, name, choices, network_config, reinforce_config):
        super(DQNAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.memory = PrioritizedReplayBuffer(
            self.reinforce_config.memory_size, 0.6)
        self.learning = True
        self.explanation = False

        # Global
        self.steps = 0
        self.reward_history = []
        self.episode_time_history = []
        self.best_reward_mean = -maxsize
        self.episode = 0

        self.reset()

        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name

        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()

        self.summary = SummaryWriter(log_dir=reinforce_summary_path)

        self.target_model = DQNModel(self.name + "_target", self.network_config, use_cuda)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config, use_cuda)

        self.beta_schedule = LinearSchedule(
            self.reinforce_config.beta_timesteps,
            initial_p=self.reinforce_config.beta_initial,
            final_p=self.reinforce_config.beta_final)

        self.epsilon_schedule = LinearSchedule(
            self.reinforce_config.epsilon_timesteps,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=self.reinforce_config.final_epsilon)

    def __del__(self):
        self.save()
        self.summary.close()

    def should_explore(self):
        self.epsilon = self.epsilon_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Epsilon' % self.name,
                                scalar_value=self.epsilon,
                                global_step=self.steps)
        return random.random() < self.epsilon

    def predict(self, state):
        self.steps += 1

        # add to experience
        if self.previous_state is not None:
            self.memory.add(self.previous_state,
                            self.previous_action,
                            self.current_reward, state, 0)

        if self.learning and self.should_explore():
            q_values = None
            choice = random.choice(self.choices)
            action = self.choices.index(choice)
        else:
            self.prediction_time -= time.time()
            _state = Tensor(state).unsqueeze(0)
            action, q_values = self.eval_model.predict(_state,
                                                       self.steps,
                                                       self.learning)
            choice = self.choices[action]
            self.prediction_time += time.time()

        if self.learning and self.steps % self.reinforce_config.replace_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        if (self.learning and
                self.steps > self.reinforce_config.update_start and
                self.steps % self.reinforce_config.update_steps == 0):
            self.update_time -= time.time()
            self.update()
            self.update_time += time.time()

        self.current_reward = 0
        self.previous_state = state
        self.previous_action = action

        return choice, q_values

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.save()
        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        episode_time = time.time() - self.episode_time

        self.reward_history.append(self.total_reward)
        self.episode_time_history.append(episode_time)
        total_time = sum(self.episode_time_history)
        avg_time = total_time / len(self.episode_time_history)

        logger.info("End of Episode %d, "
                    "Total reward %.2f, "
                    "Epsilon %.2f" % (self.episode + 1,
                                      self.total_reward,
                                      self.epsilon))

        logger.debug("Episode Time: %.2fs (%.2fs), "
                     "Prediction Time: %.2f, "
                     "Update Time %.2f" % (episode_time,
                                           avg_time,
                                           self.prediction_time,
                                           self.update_time))

        self.episode += 1

        self.summary.add_scalar(tag='%s/Episode Reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)

        self.memory.add(self.previous_state,
                        self.previous_action,
                        self.current_reward, state, 1)

        self.save()
        self.reset()

    def reset(self):
        self.episode_time = time.time()
        self.current_reward = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None
        self.prediction_time = 0
        self.update_time = 0

    def restore_state(self):
        restore_path = self.network_config.network_path + "/adaptive.info"
        if self.network_config.network_path and os.path.exists(restore_path):
            logger.info("Restoring state from %s" % self.network_config.network_path)

            with open(restore_path, "rb") as file:
                info = pickle.load(file)

            self.steps = info["steps"]
            self.best_reward_mean = info["best_reward_mean"]
            self.episode = info["episode"]

    def save(self, force=False):
        info = {
            "steps": self.steps,
            "best_reward_mean": self.best_reward_mean,
            "episode": self.episode
        }

        if (len(self.reward_history) >= self.network_config.save_steps and
                self.episode % self.network_config.save_steps == 0):
            total_reward = sum(self.reward_history[-self.network_config.save_steps:])
            current_reward_mean = total_reward / self.network_config.save_steps

            if current_reward_mean >= self.best_reward_mean:
                self.best_reward_mean = current_reward_mean
                logger.info("Saving network. Found new best reward (%.2f)"
                            % current_reward_mean)
                self.eval_model.save_network()
                self.target_model.save_network()

                with open(self.network_config.network_path + "/adaptive.info", "wb") as file:
                    pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL)
            else:
                logger.info("The best reward is still %.2f. Not saving"
                            % self.best_reward_mean)

    def reward(self, r):
        self.total_reward += r
        self.current_reward += r

    def update(self):
        if self.steps <= self.reinforce_config.batch_size:
            return

        beta = self.beta_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Beta' % self.name,
                                scalar_value=beta,
                                global_step=self.steps)

        batch = self.memory.sample(self.reinforce_config.batch_size, beta)
        (states, actions, reward, next_states,
         is_terminal, weights, batch_idxes) = batch

        self.summary.add_histogram(tag='%s/Batch Indices' % self.name,
                                   values=Tensor(batch_idxes),
                                   global_step=self.steps)

        states = FloatTensor(states)
        next_states = FloatTensor(next_states)
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.reinforce_config.batch_size,
                                   dtype=torch.long)

        # Current Q Values
        q_actions, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Calculate target
        actions, q_next = self.target_model.predict_batch(next_states)
        q_max = q_next.max(1)[0].detach()
        q_max = (1 - terminal) * q_max
        q_target = reward + self.reinforce_config.discount_factor * q_max

        # update model
        self.eval_model.fit(q_values, q_target, self.steps)

        # Update priorities
        td_errors = q_values - q_target
        new_priorities = torch.abs(td_errors) + 1e-6  # prioritized_replay_eps
        self.memory.update_priorities(batch_idxes, new_priorities.data)
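
# Usage sketch for DQNAdaptive (illustrative only, not part of the module).
# It assumes a classic Gym-style environment (4-tuple step API) and that
# `network_config` / `reinforce_config` are loaded elsewhere with the fields
# referenced above (memory_size, replace_frequency, update_steps, ...).
def _dqn_adaptive_usage_example(network_config, reinforce_config):
    import gym  # assumed dependency for this sketch

    env = gym.make("CartPole-v1")
    agent = DQNAdaptive(name="cartpole",
                        choices=list(range(env.action_space.n)),
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    for _ in range(100):
        state = env.reset()
        done = False
        while not done:
            choice, _q_values = agent.predict(state)
            state, r, done, _info = env.step(choice)
            agent.reward(r)          # accumulates current_reward and total_reward
        agent.end_episode(state)     # stores the terminal transition and resets

    agent.disable_learning()
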
class SADQAdaptive(object):
    """Adaptive which uses the SADQ algorithm"""

    def __init__(self, name, state_length, network_config, reinforce_config,
                 is_sigmoid=False, memory_restore=True):
        super(SADQAdaptive, self).__init__()
        self.name = name
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        if self.reinforce_config.use_prior_memory:
            self.memory = PrioritizedReplayBuffer(self.reinforce_config.memory_size, 0.6)
        else:
            self.memory = ReplayBuffer(self.reinforce_config.memory_size)
        self.learning = True
        self.explanation = False
        self.state_length = state_length

        # Global
        self.steps = 0
        self.reward_history = []
        self.episode_time_history = []
        self.best_reward_mean = -maxsize
        self.episode = 0

        self.reset()
        self.memory_restore = memory_restore

        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name

        if not self.network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()

        self.summary = SummaryWriter(log_dir=reinforce_summary_path)

        self.target_model = DQNModel(self.name + "_target", self.network_config,
                                     use_cuda, is_sigmoid=is_sigmoid)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config,
                                   use_cuda, is_sigmoid=is_sigmoid)

        self.beta_schedule = LinearSchedule(
            self.reinforce_config.beta_timesteps,
            initial_p=self.reinforce_config.beta_initial,
            final_p=self.reinforce_config.beta_final)

        self.epsilon_schedule = LinearSchedule(
            self.reinforce_config.epsilon_timesteps,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=self.reinforce_config.final_epsilon)

    def __del__(self):
        self.save()
        self.summary.close()

    def should_explore(self):
        self.epsilon = self.epsilon_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Epsilon' % self.name,
                                scalar_value=self.epsilon,
                                global_step=self.steps)
        return random.random() < self.epsilon

    def predict(self, state, isGreedy=False, is_random=False):
        if self.learning:
            self.steps += 1

        # Add the completed transition to experience. SADQ states are sets of
        # candidate rows, so duplicates are removed before storing.
        if self.previous_state is not None:
            state_crr = np.unique(state, axis=0)
            self.memory.add(self.previous_state,
                            None,
                            self.current_reward,
                            state_crr.reshape(-1, self.state_length), 0)

        if self.learning and self.should_explore() and not isGreedy:
            q_values = None
            choice = random.choice(list(range(len(state))))
            action = choice
        else:
            # Score every candidate row in one batch and pick the argmax.
            with torch.no_grad():
                q_values = FloatTensor(
                    self.eval_model.predict_batch(Tensor(state))[1]).view(-1)
            _, choice = q_values.max(0)
            action = choice

        if self.learning and self.steps % self.reinforce_config.replace_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        if (self.learning and
                self.steps > self.reinforce_config.update_start and
                self.steps % self.reinforce_config.update_steps == 0):
            self.update_time -= time.time()
            self.update()
            self.update_time += time.time()

        self.current_reward = 0
        self.previous_state = state[action]

        return choice, q_values

    def disable_learning(self, is_save=False):
        logger.info("Disabled learning for %s agent" % self.name)
        if is_save:
            self.save()
            self.save(force=True, appendix="_for_now")
        self.learning = False
        self.episode = 0

    def enable_learning(self):
        logger.info("Enabled learning for %s agent" % self.name)
        self.learning = True
        self.reset()

    def end_episode(self, state):
        if not self.learning:
            return

        episode_time = time.time() - self.episode_time

        self.reward_history.append(self.total_reward)
        self.episode_time_history.append(episode_time)
        total_time = sum(self.episode_time_history)
        avg_time = total_time / len(self.episode_time_history)

        logger.info("End of Episode %d, "
                    "Total reward %.2f, "
                    "Epsilon %.2f" % (self.episode + 1,
                                      self.total_reward,
                                      self.epsilon))

        logger.debug("Episode Time: %.2fs (%.2fs), "
                     "Prediction Time: %.2f, "
                     "Update Time %.2f" % (episode_time,
                                           avg_time,
                                           self.prediction_time,
                                           self.update_time))

        self.episode += 1

        self.summary.add_scalar(tag='%s/Episode Reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)

        self.memory.add(self.previous_state,
                        None,
                        self.current_reward,
                        state.reshape(-1, self.state_length), 1)
        self.save()
        self.reset()

    def reset(self):
        self.episode_time = time.time()
        self.current_reward = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None
        self.prediction_time = 0
        self.update_time = 0

    def restore_state(self):
        restore_path = self.network_config.network_path + "/adaptive.info"
        if (self.network_config.network_path and
                os.path.exists(restore_path) and self.memory_restore):
            logger.info("Restoring state from %s" % self.network_config.network_path)

            with open(restore_path, "rb") as file:
                info = pickle.load(file)

            self.steps = info["steps"]
            self.episode = info["episode"]
            self.memory.load(self.network_config.network_path)
            logger.info("Length of memory: %d" % len(self.memory))

    def save(self, force=False, appendix=""):
        info = {
            "steps": self.steps,
            "best_reward_mean": self.best_reward_mean,
            "episode": self.episode
        }

        if (len(self.reward_history) >= self.network_config.save_steps and
                self.episode % self.network_config.save_steps == 0) or force:
            total_reward = sum(self.reward_history[-self.network_config.save_steps:])
            current_reward_mean = total_reward / self.network_config.save_steps

            if force:  # or current_reward_mean >= self.best_reward_mean:
                logger.info("Saving network. Reward mean %.2f (best so far %.2f)"
                            % (current_reward_mean, self.best_reward_mean))
                if not force:
                    # Only reachable if the commented-out condition above is restored.
                    self.best_reward_mean = current_reward_mean
                    logger.info("Found new best reward (%.2f)" % current_reward_mean)
                self.eval_model.save_network(appendix=appendix)
                self.target_model.save_network(appendix=appendix)
                self.eval_model.save_network()
                self.target_model.save_network()

                with open(self.network_config.network_path + "/adaptive.info", "wb") as file:
                    pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL)
                self.memory.save(self.network_config.network_path)
                logger.info("Length of memory: %d" % len(self.memory))
            else:
                logger.info("The best reward is still %.2f. Not saving"
                            % self.best_reward_mean)

    def reward(self, r):
        self.total_reward += r
        self.current_reward += r

    def update(self):
        if len(self.memory) <= self.reinforce_config.batch_size:
            return

        beta = self.beta_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Beta' % self.name,
                                scalar_value=beta,
                                global_step=self.steps)

        if self.reinforce_config.use_prior_memory:
            batch = self.memory.sample(self.reinforce_config.batch_size, beta)
            (states, actions, reward, next_states,
             is_terminal, weights, batch_idxes) = batch
            self.summary.add_histogram(tag='%s/Batch Indices' % self.name,
                                       values=Tensor(batch_idxes),
                                       global_step=self.steps)
        else:
            batch = self.memory.sample(self.reinforce_config.batch_size)
            (states, actions, reward, next_states, is_terminal) = batch

        states = FloatTensor(states)
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.reinforce_config.batch_size,
                                   dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values.flatten()

        # Calculate target. Next states hold a variable number of candidate
        # rows, so each is scored separately and reduced with max.
        q_next = [self.target_model.predict_batch(
                      FloatTensor(ns).view(-1, self.state_length))[1]
                  for ns in next_states]
        q_max = torch.stack([each_qmax.max(0)[0].detach()
                             for each_qmax in q_next], dim=1)[0]
        q_max = (1 - terminal) * q_max
        q_target = reward + self.reinforce_config.discount_factor * q_max

        # update model
        self.eval_model.fit(q_values, q_target, self.steps)

        # Update priorities
        if self.reinforce_config.use_prior_memory:
            td_errors = q_values - q_target
            new_priorities = torch.abs(td_errors) + 1e-6  # prioritized_replay_eps
            self.memory.update_priorities(batch_idxes, new_priorities.data)

    def load_model(self, model):
        self.eval_model.replace(model)

    def load_weight(self, weight_dict):
        self.eval_model.load_weight(weight_dict)
class DQNAdaptive(object):
    """Adaptive which uses the DQN algorithm"""

    def __init__(self, name, choices, network_config, reinforce_config, log=True):
        super(DQNAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = Memory(self.reinforce_config.memory_size)
        self.learning = True
        self.explanation = False

        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.current_reward = 0
        self.total_reward = 0

        self.log = log
        if self.log:
            self.summary = SummaryWriter()

        self.target_model = DQNModel(self.name + "_target", self.network_config)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config)

        self.episode = 0

    def __del__(self):
        pass

    def should_explore(self):
        epsilon = np.max([
            0.1,
            self.reinforce_config.starting_epsilon *
            (self.reinforce_config.decay_rate **
             (self.steps / self.reinforce_config.decay_steps))
        ])

        if self.log:
            self.summary.add_scalar(tag='epsilon',
                                    scalar_value=epsilon,
                                    global_step=self.steps)

        return np.random.choice([True, False], p=[epsilon, 1 - epsilon])

    def predict(self, state):
        self.steps += 1
        saliencies = []

        # add to experience
        if self.previous_state is not None:
            experience = Experience(self.previous_state,
                                    self.previous_action,
                                    self.current_reward, state)
            self.replay_memory.add(experience)

        if self.learning and self.should_explore():
            action = np.random.choice(len(self.choices))
            # TODO should it be output shape or from choices?
            q_values = [None] * len(self.choices)
            choice = self.choices[action]
        else:
            _state = Variable(torch.Tensor(state)).unsqueeze(0)
            q_values = self.eval_model.predict(_state)
            q_values = q_values.data.numpy()[0]
            action = np.argmax(q_values)
            choice = self.choices[action]

            if self.explanation:
                # Build one excitation-backprop saliency map per choice.
                eb.use_eb(True)
                prob_outputs = Variable(torch.zeros((len(self.choices), )))
                for choice_index in range(len(self.choices)):
                    prob_outputs[choice_index] = 1
                    saliency = eb.excitation_backprop(self.eval_model.model,
                                                      _state,
                                                      prob_outputs,
                                                      contrastive=False)
                    saliency = np.squeeze(
                        saliency.view(*_state.shape).data.numpy())
                    saliencies.append(saliency)

        if self.learning and self.steps % self.update_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        self.update()

        self.current_reward = 0
        self.previous_state = state
        self.previous_action = action

        return choice, q_values, saliencies

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.eval_model.save_network()
        self.target_model.save_network()

        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        logger.info("End of Episode %d with total reward %d" %
                    (self.episode + 1, self.total_reward))

        self.episode += 1

        if self.log:
            self.summary.add_scalar(tag='%s agent reward' % self.name,
                                    scalar_value=self.total_reward,
                                    global_step=self.episode)

        experience = Experience(self.previous_state, self.previous_action,
                                self.current_reward, state, True)
        self.replay_memory.add(experience)

        self.current_reward = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None

        if self.replay_memory.current_size > 30:
            self.update()

    def reward(self, r):
        self.total_reward += r
        self.current_reward += r

    def update(self):
        if self.replay_memory.current_size < self.reinforce_config.batch_size:
            return

        batch = self.replay_memory.sample(self.reinforce_config.batch_size)

        states = [experience.state for experience in batch]
        next_states = [experience.next_state for experience in batch]

        states = Variable(torch.Tensor(states))
        next_states = Variable(torch.Tensor(next_states))

        is_terminal = [0 if experience.is_terminal else 1 for experience in batch]
        actions = [experience.action for experience in batch]
        reward = [experience.reward for experience in batch]

        q_next = self.target_model.predict(next_states)
        q_max = torch.max(q_next, dim=1)[0].data.numpy()
        # Zero out the bootstrap term for terminal transitions.
        q_max = np.array([a * b if a == 0 else b
                          for a, b in zip(is_terminal, q_max)])

        q_predict = self.eval_model.predict(states)
        q_target = q_predict.data.numpy()

        batch_index = np.arange(self.reinforce_config.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = \
            reward + self.reinforce_config.discount_factor * q_max

        q_target = Variable(torch.Tensor(q_target))

        self.eval_model.fit(states, q_target, self.steps)
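
# Sketch of the explanation path in the class above (illustrative only). With
# learning disabled and `explanation` enabled, predict() also returns one
# excitation-backprop saliency map per available choice. The 4-element state
# is a stand-in and must match the network's input shape; the config objects
# are assumed to be loaded elsewhere.
def _dqn_saliency_example(network_config, reinforce_config):
    agent = DQNAdaptive(name="saliency_demo",
                        choices=[0, 1],
                        network_config=network_config,
                        reinforce_config=reinforce_config)
    agent.learning = False    # act greedily so q_values come from the network
    agent.explanation = True  # turn on excitation backprop

    state = np.zeros(4, dtype=np.float32)
    choice, q_values, saliencies = agent.predict(state)
    assert len(saliencies) == len(agent.choices)
    return choice, q_values, saliencies
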
class QPredictor(object):
    """Predictors are equivalent to General Value Functions (GVFs)"""

    # TODO
    #  * discount factor: how to decide?
    #  * batch size: should it be the same?
    #  * save predictor

    def __init__(self, name, network_config, discount_factor=0.99, batch_size=32):
        super(QPredictor, self).__init__()
        self.name = name
        self.session = tf.Session()

        self.eval_model = DQNModel(name + "_eval", network_config, self.session)
        self.target_model = DQNModel(name + "_target", network_config, self.session)

        self.previous_state = None
        self.replay_memory = Memory(5000)
        self.discount_factor = discount_factor
        self.update_frequency = 1000
        self.batch_size = batch_size
        self.steps = 0

    def __del__(self):
        self.eval_model.save_network()
        self.target_model.save_network()
        self.session.close()

    def learn(self, current_state, action, reward, is_terminal, terminal_reward):
        self.steps += 1

        if is_terminal:
            reward = terminal_reward

        if action is None:
            action = 0

        if self.previous_state is not None:
            experience = Experience(self.previous_state, action, reward,
                                    current_state, is_terminal)
            self.replay_memory.add(experience)

        if self.steps % self.update_frequency == 0:
            logger.info("Predictor -- Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        self.previous_state = current_state

        self.update()

    def update(self):
        if self.replay_memory.current_size < self.batch_size:
            return

        batch = self.replay_memory.sample(self.batch_size)

        # TODO: Convert to tensor operations instead of for loops
        states = [experience.state for experience in batch]
        next_states = [experience.next_state for experience in batch]
        is_terminal = [0 if experience.is_terminal else 1 for experience in batch]
        actions = [experience.action for experience in batch]
        reward = [experience.reward for experience in batch]

        q_next = self.target_model.predict_batch(next_states)
        q_mean = np.mean(q_next, axis=1)
        # Zero out the bootstrap term for terminal transitions.
        q_mean = np.array([a * b if a == 0 else b
                           for a, b in zip(is_terminal, q_mean)])

        q_values = self.eval_model.predict_batch(states)
        q_target = q_values.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = reward + self.discount_factor * q_mean

        self.eval_model.fit(states, q_target, self.steps)

    def predict(self, state):
        action, q_values = self.eval_model.predict(state)
        return q_values
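
# Usage sketch for QPredictor (illustrative only). Unlike the adaptives above,
# learn() is fed every transition explicitly; `make_transition` is a
# hypothetical stand-in for whatever environment loop produces
# (state, action, reward, done) tuples.
def _q_predictor_usage_example(network_config, make_transition):
    predictor = QPredictor("distance_gvf", network_config)

    for _ in range(1000):
        state, action, reward, done = make_transition()
        predictor.learn(state, action, reward,
                        is_terminal=done,
                        terminal_reward=reward)
        if done:
            # Reset so the next episode's first state is not chained to the
            # previous episode's terminal state.
            predictor.previous_state = None

    return predictor.predict(state)
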