def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr,
             replay_iters, eps_T, eps_t_init, gamma, update_period, board,
             device, model_path, r_memory_Fname, o_model_name, model_load=False):
    self.step_now = 0                        # record the current step
    self.reward_num = 0
    self.reward_accumulated = 0              # delayed reward
    self.final_tem = 10                      # temporary value for now
    self.step_last_update = 0                # record the last target-network update step
    self.update_period = update_period       # target-network update period (off-policy)
    self.update_cont = 0
    self.learn_start_time = learn_start_time
    self.gamma = gamma
    self.batch_size = batch_size
    self.memory_size = memory_size
    self.alpha = 0.6
    self.beta = 0.4
    self.replay_bata_iters = replay_iters
    self.replay_eps = 1e-6
    self.loss_back = 0
    self.q_value_p = 0
    self.memory_min_num = 400                # minimum number of transitions before learning
    self.step_last_learn = 0                 # record the last learning step
    self.learn_fre = learn_fre               # step frequency of learning
    self.e_greedy = 1                        # current epsilon of the e-greedy policy
    self.eps_T = eps_T                       # epsilon-decay constant (e.g. 800,000 steps)
    self.eps_t_init = eps_t_init             # initial step offset of the epsilon schedule
    self.device = device
    self.model_path = model_path
    self.mode_enjoy = model_load
    if model_load == False:
        self.policy_net = DQN(board[0], board[1], action_num).to(device)
        self.target_net = DQN(board[0], board[1], action_num).to(device)
        #self.target_net.eval()
        self.optimizer = optim.SGD(self.policy_net.parameters(), lr=lr)
        self.loss_fn = nn.functional.mse_loss  # use the MSE loss
        self.memory = Memory(memory_size)
    else:
        self.load(o_model_name)
    #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
    self.obs_new = None
    self.obs_old = None
    self.action = None
    self.action_old = None
    self.dqn_direct_flag = False             # whether the DQN action has been executed
    self.model_save_flag = False
class RL_AGENT_ONE():
    """ RL agent class """

    def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr,
                 replay_iters, eps_T, eps_t_init, gamma, update_period, board,
                 device, model_path, r_memory_Fname, o_model_name, model_load=False):
        self.step_now = 0                        # record the current step
        self.reward_num = 0
        self.reward_accumulated = 0              # delayed reward
        self.final_tem = 10                      # temporary value for now
        self.step_last_update = 0                # record the last target-network update step
        self.update_period = update_period       # target-network update period (off-policy)
        self.learn_start_time = learn_start_time
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.alpha = 0.6
        self.beta = 0.4
        self.replay_bata_iters = replay_iters
        self.replay_eps = 1e-6
        self.loss_back = 0
        self.q_value_p = 0
        self.memory_min_num = 1000               # minimum number of transitions before learning
        self.step_last_learn = 0                 # record the last learning step
        self.learn_fre = learn_fre               # step frequency of learning
        self.e_greedy = 1                        # current epsilon of the e-greedy policy
        self.eps_T = eps_T                       # epsilon-decay constant (e.g. 800,000 steps)
        self.eps_t_init = eps_t_init             # initial step offset of the epsilon schedule
        self.device = device
        self.model_path = model_path
        self.mode_enjoy = model_load
        if model_load == False:
            self.policy_net = DQN(board[0], board[1], action_num).to(device)
            self.target_net = DQN(board[0], board[1], action_num).to(device)
            #self.target_net.eval()
            self.optimizer = optim.SGD(self.policy_net.parameters(), lr=lr)
            self.loss_fn = nn.functional.mse_loss  # use the MSE loss
            self.memory = Memory(memory_size)
            self.beta_schedule = LinearSchedule(self.replay_bata_iters, self.beta, 1.0)
        else:
            self.load(o_model_name)
        #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.action_old = None
        self.dqn_direct_flag = False             # whether the DQN action has been executed
        self.model_save_flag = False

    def reset(self):
        """ reset the flag, state and reward for a new half or game """
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.dqn_direct_flag = False

    def load(self, old_model):
        """ load a trained model
        par:
        |old_model: str, the name of the old model
        """
        model_path_t = self.model_path + 't' + old_model
        self.target_net = torch.load(model_path_t, map_location=self.device)
        self.target_net.eval()
        #print('target net par', self.target_net.state_dict())

    def save(self):
        """ save the trained model """
        #if self.model_path in os.listdir()
        t = time.strftime('%m%d%H%M%S')
        self.model_path_p = self.model_path + 'p' + t + '.pt'
        self.model_path_t = self.model_path + 't' + t + '.pt'
        #print('target net par is', self.policy_net.state_dict())
        torch.save(self.policy_net, self.model_path_p)
        torch.save(self.target_net, self.model_path_t)

    def learn(self, env, step_now, obs_old, action, obs_new, reward, done):
        """ update the agent from one transition
        par:
        |step_now: int, the global training step
        |env: class Environment, currently unused
        |obs_old/obs_new: observation instances
        |action, reward: the action taken and the reward received
        |done: bool, whether the game is over
        """
        """ check if we should update the target net """
        if step_now - self.step_last_update == self.update_period:
            #print('update the p t network')
            self.step_last_update = step_now
            self.target_net.load_state_dict(self.policy_net.state_dict())
        """ build the feature states for learning """
        state_new = self.feature_combine(obs_new)  # get the feature state
        state_old = self.feature_combine(obs_old)  # get the feature state
        transition_now = (state_old, action, reward, state_new)
        """ add the reward data to the memory """
        self.memory.episode_add(state_old, action, reward, state_new, done)
        """ select a batch from the memory to update the network """
        step_diff = step_now - self.step_last_learn
        if step_now > self.learn_start_time and \
                step_diff >= self.learn_fre and \
                self.memory.__len__() > self.memory_min_num:
            self.step_last_learn = step_now  # update the last learning step
            batch_data = self.memory.sample(self.batch_size)
            s_o_set = []
            actions = []
            rewards = []
            s_n_set = []
            dones = []
            for bd in batch_data:
                s_o_set.append(bd.state)
                actions.append(bd.action)
                rewards.append(bd.reward)
                s_n_set.append(bd.next_state)
                dones.append(bd.done)
            loss_list = []
            batch_idx_list = []
            reward_not_zero_cnt = 0
            actions = torch.tensor(actions, device=self.device)
            """ count how many learning steps use transitions without reward """
            with torch.no_grad():
                target_values = [self.gamma * self.target_net(s_n).max(0)[0]
                                 for idx, s_n in enumerate(s_n_set)]
                target_values = [t_ * (1 - d_) + r_
                                 for t_, d_, r_ in zip(target_values, dones, rewards)]
            policy_values = [self.policy_net(s).gather(0, a)
                             for s, a in zip(s_o_set, actions)]
            loss = [self.loss_fn(t_v, p_v) + self.replay_eps
                    for p_v, t_v in zip(policy_values, target_values)]
            loss_back = sum(loss) / self.batch_size
            self.loss_back = loss_back
            """ update the parameters """
            self.optimizer.zero_grad()
            loss_back.backward()
            self.optimizer.step()
        """ check if we should save the model """
        if self.model_save_flag == True:
            self.save()

    def check_train(self, env):
        """ check if the reward is propagated backward """
        pass

    def select_egreedy(self, q_value, step_now):
        """ select the action by the e-greedy policy
        arg:
        |q_value: the greedy criterion
        """
        self.e_greedy = np.exp((self.eps_t_init - step_now) / self.eps_T)
        if self.e_greedy < 0.3:
            self.e_greedy = 0.3
        """ if we are in enjoying (evaluation) mode """
        if self.mode_enjoy == True:
            print('q_value is', q_value)
            self.e_greedy = 0.3
        """ select the action by e-greedy """
        if np.random.random() > self.e_greedy:
            action = action_list[q_value.max(0)[1]]
        else:
            action = action_list[np.random.randint(action_num)]
        return action

    def feature_combine(self, obs):
        """ extract features from obs.layers and combine them into a new feature layer """
        feature_c = obs.copy()
        feature_c = feature_c.astype(np.float32)
        feature_c = torch.tensor(feature_c, dtype=torch.float32, device=self.device)
        return feature_c

    def data_augment(self, transition):
        """ flip the features to augment the experience and ease the sparse-reward problem
        par:
        |transition: tuple of (feature_o, action, reward, feature_n)
        """
        flip_ver_dim = 2
        feature_old = transition[0]
        action = transition[1]
        feature_new = transition[3]
        reward = transition[2]
        """ vertical flip of the features """
        feature_o_aug = feature_old.flip([flip_ver_dim])
        feature_n_aug = feature_new.flip([flip_ver_dim])
        """ vertical flip of the action """
        if action == 0:
            action = 1
        elif action == 1:
            action = 0
        return feature_o_aug, action, reward, feature_n_aug

    def act(self, map, step_now):
        """ interact with the competition framework """
        dqn_action = -1  # reset
        state_old = self.feature_combine(map)  # get the feature
        with torch.no_grad():
            q_values = self.policy_net(state_old)
        action = self.select_egreedy(q_values, step_now)  # feed the features to the model
        return action

    def act_enjoy(self, map):
        """ interact with the competition framework in evaluation mode """
        dqn_action = -1  # reset
        step_now = self.eps_T
        state_old = self.feature_combine(map)  # get the feature
        q_values = self.target_net(state_old)
        action = self.select_egreedy(q_values, step_now)  # feed the features to the model
        return action
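# --- Hedged usage sketch (not part of the original source) ---
# A minimal training-loop sketch for the RL_AGENT_ONE class above, assuming a hypothetical
# gym-style `env` (reset() -> obs, step(a) -> (obs, reward, done, info)) whose observations
# are NumPy arrays compatible with feature_combine(). The hyperparameter values are
# illustrative only; DQN, Memory, LinearSchedule, action_num and action_list are assumed to
# be defined elsewhere in this repo.
def example_training_loop_one(env, total_steps=100000):
    agent = RL_AGENT_ONE(memory_size=10000, batch_size=32, learn_start_time=1000,
                         learn_fre=4, lr=1e-3, replay_iters=total_steps, eps_T=800000,
                         eps_t_init=0, gamma=0.99, update_period=1000,
                         board=(10, 10), device='cpu', model_path='./models/',
                         r_memory_Fname='', o_model_name='', model_load=False)
    obs_old = env.reset()
    for step in range(total_steps):
        action = agent.act(obs_old, step)                 # epsilon-greedy action from the policy net
        obs_new, reward, done, _ = env.step(action)       # hypothetical environment API
        agent.learn(env, step, obs_old, action, obs_new, reward, done)
        obs_old = env.reset() if done else obs_new
    return agent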
class Agent():
    """ This module is used for interacting with and learning from the environment. """

    def __init__(self, state_size, action_size, is_double_Q, is_prioritized, seed=456):
        """ This is for initialization.
        (input)
        - state_size (int): size of a state
        - action_size (int): dim of the action space
        - is_double_Q (bool): double Q-learning (True) or normal Q-learning (False)
        - is_prioritized (bool): prioritized replay buffer (True) or normal replay buffer (False)
        - seed (int): random seed
        """
        self.seed = torch.manual_seed(seed)
        self.state_size = state_size
        self.action_size = action_size
        # Q-networks, local and target
        self.qnetwork_local = DQN(self.state_size, self.action_size, seed).to(device)
        self.qnetwork_target = DQN(self.state_size, self.action_size, seed).to(device)
        self.is_double_Q = is_double_Q
        # optimizer for the learning process
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # replay buffer for the learning process
        self.is_prioritized = is_prioritized
        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.is_prioritized, seed)
        self.beta = BETA0_PRIORITIZED  # exponent of the weights for the prioritized replay buffer
        self.learning_count = 0        # how many times the learning process has run (used to update beta)
        # the number of time steps (modulo UPDATE_EVERY)
        self.t_step = 0
        # loss
        self.loss = 0.0

    def reset(self):
        """ This method resets the time step. """
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """ This method saves an experience to the replay buffer. Then, every fixed number
        of time steps, the Q-network learns from the experiences stored in the replay
        buffer, provided the buffer contains more experiences than the batch size.
        (input)
        - state (float, dim = state_size): state vector
        - action (int, dim = action_size): action vector
        - reward (float, dim = 1): reward
        - next_state (float, dim = state_size): state vector for the next state
        - done (bool): whether the episode is done
        """
        # create tensors storing states (same for actions etc.)
        states = torch.from_numpy(np.vstack([state])).float().to(device)
        actions = torch.from_numpy(np.vstack([action])).long().to(device)
        rewards = torch.from_numpy(np.vstack([reward])).float().to(device)
        next_states = torch.from_numpy(np.vstack([next_state])).float().to(device)
        dones = torch.from_numpy(np.vstack([done]).astype(np.uint8)).float().to(device)

        # compute the TD error
        self.qnetwork_local.eval()
        if(self.is_double_Q != True):
            # for normal Q-learning:
            # get the maximum Q-values of the target network for next_state
            qsa_target_next_max = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        else:
            # for double Q-learning:
            # get the actions which maximize the Q-values of the local network for next_states
            self.qnetwork_local.eval()
            with torch.no_grad():
                max_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            # get the Q-values of the target network for (next_state, max_actions)
            qsa_target_next_max = self.qnetwork_target(next_states).gather(1, max_actions)
        delta = rewards + GAMMA * qsa_target_next_max * (1 - dones) - self.qnetwork_local(states).gather(1, actions)
        delta = delta.data.cpu().numpy()[0][0]
        self.qnetwork_local.train()

        # save the experience in the replay buffer
        self.buffer.add(state, action, reward, next_state, done, delta)

        # update the weights of the Q-networks every UPDATE_EVERY time steps
        # (and only if the replay buffer contains more experiences than the batch size)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if(self.t_step == 0):
            if(len(self.buffer) > BATCH_SIZE):
                self.learning_count += 1
                # in case of the prioritized buffer, update beta
                if self.is_prioritized:
                    self.beta = 1.0 + (BETA0_PRIORITIZED - 1.0) / self.learning_count
                # learn and update the weights of the Q-networks
                self.learn(self.beta)

    def act(self, state, eps):
        """ This method takes a state as input, uses the policy defined by the deep
        Q-network and selects the next action with the epsilon-greedy algorithm.
        (input)
        - state (float, dim = state_size): state vector
        - eps (float): epsilon for the epsilon-greedy algorithm
        (output)
        - index of the next action (int)
        """
        # convert the state to a tensor
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # retrieve the action values
        self.qnetwork_local.eval()  # evaluation mode
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # return to training mode
        # choose an action with the epsilon-greedy algorithm
        if(random.random() > eps):
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, beta=1.0):
        """ This method updates the weights of the Q-network by learning from the
        experiences stored in the replay buffer.
        (input)
        - beta (float): beta exponent for the prioritized replay
        """
        # sample from the replay buffer
        states, actions, rewards, next_states, dones, deltas = self.buffer.sample(beta)

        if(self.is_double_Q != True):
            # for normal Q-learning:
            # get the maximum Q-values of the target network for next_state
            qsa_target_next_max = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        else:
            # for double Q-learning:
            # get the actions which maximize the Q-values of the local network for next_state
            self.qnetwork_local.eval()
            with torch.no_grad():
                max_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            # get the Q-values of the target network for (next_state, max_actions)
            qsa_target_next_max = self.qnetwork_target(next_states).gather(1, max_actions)

        # compute the target/expected Q-values
        qsa_target = rewards + GAMMA * qsa_target_next_max * (1 - dones)
        qsa_expect = self.qnetwork_local(states).gather(1, actions)

        # in case of the prioritized replay buffer, multiply the gradient
        # (and thus temporarily the target and expected Q-values) by the weights
        if(self.is_prioritized):
            # Multiply the weights with the target/expected Q-values.
            # Note that since self.buffer.weights is the square root of the weights
            # considered in the prioritized replay buffer paper, the mean squared error
            # evaluated for qsa_target and qsa_expect below gives the appropriate
            # gradient for the update of the network weights (i.e. multiplied by
            # self.buffer.weights compared to the standard replay buffer case).
            qsa_target = qsa_target * self.buffer.weights
            qsa_expect = qsa_expect * self.buffer.weights
            # update the deltas and priorities of the sampled experiences in the replay buffer
            deltas = qsa_target - qsa_expect
            deltas = deltas.data.cpu().numpy().squeeze(1)
            for i, j in enumerate(self.buffer.id_experiences):
                self.buffer.memory[j] = self.buffer.memory[j]._replace(delta=deltas[i])
                self.buffer.priority[j] = np.power(np.abs(deltas[i]) + EPS_PRIORITIZED, ALPHA_PRIORITIZED)

        # compute the mean squared error as the loss and back-propagate
        self.loss = F.mse_loss(qsa_expect, qsa_target)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

        # soft update of the parameters in the target network
        self.soft_update(TAU)

    def soft_update(self, tau):
        """ This method carries out the soft update of the parameters in the target network.
        (input)
        - tau (float): parameter for the soft update
        """
        for l_param, t_param in zip(self.qnetwork_local.parameters(), self.qnetwork_target.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)
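# --- Hedged usage sketch (not part of the original source) ---
# A minimal episode loop for the Agent class above, assuming a hypothetical gym-style `env`
# that returns NumPy vector observations of size state_size, and assuming DQN, ReplayBuffer
# and the module-level constants (device, LR, BUFFER_SIZE, BATCH_SIZE, GAMMA, UPDATE_EVERY,
# TAU, BETA0_PRIORITIZED, ALPHA_PRIORITIZED, EPS_PRIORITIZED) are defined elsewhere.
def example_episode_loop(env, state_size, action_size, n_episodes=500,
                         eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    agent = Agent(state_size, action_size, is_double_Q=True, is_prioritized=True)
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        agent.reset()
        score = 0.0
        done = False
        while not done:
            action = agent.act(state, eps)                 # epsilon-greedy action
            next_state, reward, done, _ = env.step(action) # hypothetical environment API
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
        eps = max(eps_end, eps_decay * eps)                # decay epsilon once per episode
        scores.append(score)
    return agent, scores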
class RL_AGENT_ONE():
    """ RL agent class """

    def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr,
                 replay_iters, eps_T, eps_t_init, gamma, update_period, board,
                 device, model_path, r_memory_Fname, o_model_name, model_load=False):
        self.step_now = 0                        # record the current step
        self.reward_num = 0
        self.reward_accumulated = 0              # delayed reward
        self.final_tem = 10                      # temporary value for now
        self.step_last_update = 0                # record the last target-network update step
        self.update_period = update_period       # target-network update period (off-policy)
        self.learn_start_time = learn_start_time
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.alpha = 0.6
        self.beta = 0.4
        self.replay_bata_iters = replay_iters
        self.replay_eps = 1e-6
        self.memory_min_num = 1000               # minimum number of transitions before learning
        self.step_last_learn = 0                 # record the last learning step
        self.learn_fre = learn_fre               # step frequency of learning
        self.e_greedy = 1                        # current epsilon of the e-greedy policy
        self.eps_T = eps_T                       # epsilon-decay constant (e.g. 800,000 steps)
        self.eps_t_init = eps_t_init             # initial step offset of the epsilon schedule
        self.device = device
        self.model_path = model_path
        self.mode_enjoy = model_load
        if model_load == False:
            self.policy_net = DQN(board[0], board[1], action_num).to(device)
            self.target_net = DQN(board[0], board[1], action_num).to(device)
            self.optimizer = optim.Adagrad(self.policy_net.parameters(), lr=lr)
            self.loss_fn = nn.functional.mse_loss  # use the MSE loss
            self.memory = PrioritizedReplayBuffer(memory_size, self.alpha)
            self.beta_schedule = LinearSchedule(self.replay_bata_iters, self.beta, 1.0)
        else:
            self.load(o_model_name)
        #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.action_old = None
        self.dqn_direct_flag = False             # whether the DQN action has been executed
        self.model_save_flag = False

    def reset(self):
        """ reset the flag, state and reward for a new half or game """
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.dqn_direct_flag = False

    def load(self, old_model):
        """ load a trained model
        par:
        |old_model: str, the name of the old model
        """
        model_path_t = self.model_path + 't' + old_model
        self.target_net = torch.load(model_path_t, map_location=self.device)
        self.target_net.eval()
        print('target net par', self.target_net.state_dict())

    def save(self):
        """ save the trained model """
        #if self.model_path in os.listdir()
        t = time.strftime('%m%d%H%M%S')
        self.model_path_p = self.model_path + 'p' + t + '.pt'
        self.model_path_t = self.model_path + 't' + t + '.pt'
        print('target net par is', self.policy_net.state_dict())
        torch.save(self.policy_net, self.model_path_p)
        torch.save(self.target_net, self.model_path_t)

    def learn(self, env, step_now, obs_old, action, obs_new, reward, done):
        """ update the agent from one transition
        par:
        |step_now: int, the global training step
        |env: class Environment, currently unused
        |obs_old/obs_new: observation instances
        |action, reward: the action taken and the reward received
        |done: bool, whether the game is over
        """
        """ check if we should update the target net """
        if step_now - self.step_last_update == self.update_period:
            #print('update the p t network')
            self.step_last_update = step_now
            self.target_net.load_state_dict(self.policy_net.state_dict())
        """ build the feature states for learning """
        state_new = self.feature_combine(obs_new)  # get the feature state
        state_old = self.feature_combine(obs_old)  # get the feature state
        transition_now = (state_old, action, reward, state_new)
        """ add augmented reward data to the memory """
        if reward > 0:
            #print('get one reward in learn', self.reward_accumulated)
            self.memory.add(*self.data_augment(transition_now), done)
        self.memory.add(state_old, action, reward, state_new, done)
        """ select a batch from the memory to update the network """
        step_diff = step_now - self.step_last_learn
        if step_now > self.learn_start_time and \
                step_diff >= self.learn_fre and \
                self.memory.__len__() > self.memory_min_num:
            self.step_last_learn = step_now  # update the last learning step
            #print('start sample', step_now)
            batch_data = self.memory.sample(self.batch_size,
                                            beta=self.beta_schedule.value(step_now))
            loss_list = []
            batch_idx_list = []
            reward_not_zero_cnt = 0
            for t_ in zip(*batch_data):
                s, action, reward, s_new, done, weight, idx = t_
                #print('the transition is', t_)
                """ count how many learning steps use transitions without reward """
                q_values = self.policy_net(s_new)
                action_new = self.select_egreedy(q_values.cpu().detach().numpy(),
                                                 self.eps_T * 2)
                #reward = torch.tensor(reward).to(self.device)
                #target_value = reward if done \
                #    else self.gamma*self.target_net(s_new)[action_new] + reward
                target_value = self.gamma * self.target_net(s_new).gather(
                    0, torch.tensor(action_new, device=self.device))
                # keep target_value a tensor; it equals reward when done
                target_value = target_value + reward if not done \
                    else target_value - (target_value - reward)
                loss = self.loss_fn(self.policy_net(s).gather(
                    0, torch.tensor(action, device=self.device)), target_value)
                loss_list.append(loss + self.replay_eps)
                batch_idx_list.append(idx)
                """ update the parameters """
                self.optimizer.zero_grad()
                loss.backward()
                #print('self.policy_net linear grad is', self.policy_net.fc_3.bias.grad)
                #print('self.policy_net linear grad is', self.policy_net.fc_3.weight.grad)
                #input()
                for param in self.policy_net.parameters():
                    param.grad.data.clamp_(-1, 1)
                self.optimizer.step()
                #input()
            self.memory.update_priorities(batch_idx_list, loss_list)
        """ check if we should save the model """
        if self.model_save_flag == True:
            self.save()

    def select_egreedy(self, q_value, step_now):
        """ select the action by the e-greedy policy
        arg:
        |q_value: the greedy criterion
        """
        self.e_greedy = np.exp((self.eps_t_init - step_now) / self.eps_T)
        """ if we are in enjoying (evaluation) mode """
        if self.mode_enjoy == True:
            print('q_value is', q_value)
            self.e_greedy = 0
        """ select the action by e-greedy """
        if np.random.random() > self.e_greedy:
            action = action_list[np.where(q_value == np.max(q_value))[0][0]]
        else:
            action = action_list[np.random.randint(action_num)]
        return action

    def feature_combine(self, obs):
        """ extract features from obs.layers and combine them into a new feature layer """
        feature_c = obs.copy()
        feature_c = feature_c.astype(np.float32)
        feature_c = torch.tensor(feature_c, dtype=torch.float32, device=self.device)
        size = feature_c.shape
        feature_c = feature_c.resize_(1, 1, size[0], size[1])
        return feature_c

    def data_augment(self, transition):
        """ flip the features to augment the experience and ease the sparse-reward problem
        par:
        |transition: tuple of (feature_o, action, reward, feature_n)
        """
        flip_ver_dim = 2
        feature_old = transition[0]
        action = transition[1]
        feature_new = transition[3]
        reward = transition[2]
        """ vertical flip of the features """
        feature_o_aug = feature_old.flip([flip_ver_dim])
        feature_n_aug = feature_new.flip([flip_ver_dim])
        """ vertical flip of the action """
        if action == 0:
            action = 1
        elif action == 1:
            action = 0
        return feature_o_aug, action, reward, feature_n_aug

    def act(self, map, step_now):
        """ interact with the competition framework """
        dqn_action = -1  # reset
        state_old = self.feature_combine(map)  # get the feature
        q_values = self.policy_net(state_old)
        action = self.select_egreedy(q_values.cpu().detach().numpy(), step_now)  # feed the features to the model
        return action

    def act_enjoy(self, map):
        """ interact with the competition framework in evaluation mode """
        dqn_action = -1  # reset
        step_now = self.eps_T
        state_old = self.feature_combine(map)  # get the feature
        q_values = self.target_net(state_old)
        action = self.select_egreedy(q_values.cpu().detach().numpy(), step_now)  # feed the features to the model
        return action
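# --- Hedged helper sketch (not part of the original source) ---
# The RL_AGENT_ONE classes above anneal the prioritized-replay exponent via
# self.beta_schedule.value(step_now), but LinearSchedule itself is not shown here.
# Below is a minimal sketch matching the interface used above, under the assumption that
# the constructor order is LinearSchedule(total_timesteps, initial_p, final_p), i.e. beta
# is interpolated linearly from 0.4 up to 1.0 over replay_iters steps.
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps  # steps over which to interpolate
        self.initial_p = initial_p                    # starting value (e.g. beta = 0.4)
        self.final_p = final_p                        # final value (e.g. beta = 1.0)

    def value(self, t):
        # linear interpolation from initial_p to final_p, clamped after schedule_timesteps
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)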