import numpy as np

import Exploration  # project module providing softV and sparsemax
# GAMMA (the discount factor) is assumed to be defined at module level.


def train_prioritized(Q_Network, train_batch, w_batch, Exp, s_scale, input_size, num_actions):
    # Unpack the sampled transitions: (s_t, s_{t+1}, a_t, r_t, done).
    state_t_batch = np.array([item[0] for item in train_batch])
    state_t_1_batch = np.array([item[1] for item in train_batch])
    action_batch = [item[2] for item in train_batch]
    reward_batch = np.array([item[3] for item in train_batch])
    done_batch = np.array([item[4] for item in train_batch])

    action_t_batch = Q_Network.evaluate_critic(state_t_batch)
    # Tile the importance-sampling weights across the action dimension so they
    # match the (batch_size, num_actions) shape of the critic output.
    w_batch = np.transpose(np.tile(w_batch, (num_actions, 1)))

    if Exp == 'epsilon':
        # Double-DQN target: the online network selects the next action,
        # the target network evaluates it.
        action_t_1_batch = Q_Network.evaluate_critic(state_t_1_batch)
        q_t_1 = Q_Network.evaluate_target_critic(state_t_1_batch)
        for i in range(len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][action_batch[i]] = (
                    reward_batch[i] + GAMMA * q_t_1[i][np.argmax(action_t_1_batch[i])])
    elif Exp == 'softmax':
        # Soft (log-sum-exp) state value of the next state.
        action_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)
        q_t_1 = Exploration.softV(action_t_1_batch, s_scale)
        for i in range(len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][action_batch[i]] = reward_batch[i] + GAMMA * q_t_1[i]
    elif Exp == 'sparsemax':
        # Sparsemax state value of the next state.
        action_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)
        q_t_1 = Exploration.sparsemax(action_t_1_batch, s_scale)
        for i in range(len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][action_batch[i]] = reward_batch[i] + GAMMA * q_t_1[i]

    # Weighted critic update; the per-sample errors feed back into the priorities.
    errors, cost, _ = Q_Network.train_critic_prioritized(
        state_t_batch, action_t_batch, w_batch)
    errors = np.sum(errors, axis=1)
    return errors, cost
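
# The softmax branches above depend on Exploration.softV. A minimal sketch of a
# common choice for that operator (an assumption, not necessarily this repo's
# implementation): the soft state value
#   V(s) = s_scale * log(sum_a exp(Q(s, a) / s_scale)),
# computed with a max-shift for numerical stability.
def softV_sketch(q_batch, s_scale):
    z = np.asarray(q_batch, dtype=np.float64) / s_scale
    z_max = np.max(z, axis=1, keepdims=True)  # shift for stability
    lse = np.squeeze(z_max, axis=1) + np.log(np.sum(np.exp(z - z_max), axis=1))
    return s_scale * lse  # one soft value per row
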
def train(Q_Network, train_batch, Exp, s_scale, input_size, num_actions):
    state_t_batch = np.array([item[0] for item in train_batch])
    state_t_1_batch = np.array([item[1] for item in train_batch])
    action_batch = [item[2] for item in train_batch]
    reward_batch = np.array([item[3] for item in train_batch])
    done_batch = np.array([item[4] for item in train_batch])

    action_t_batch = Q_Network.evaluate_critic(state_t_batch)

    if Exp == 'epsilon':
        action_t_1_batch = Q_Network.evaluate_critic(state_t_1_batch)
        q_t_1 = Q_Network.evaluate_target_critic(state_t_1_batch)
        for i in range(len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][action_batch[i]] = (
                    reward_batch[i] + GAMMA * q_t_1[i][np.argmax(action_t_1_batch[i])])
    elif Exp == 'softmax':
        action_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)
        q_t_1 = Exploration.softV(action_t_1_batch, s_scale)
        for i in range(len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][action_batch[i]] = reward_batch[i] + GAMMA * q_t_1[i]
    elif Exp == 'sparsemax':
        action_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)
        q_t_1 = Exploration.sparsemax(action_t_1_batch, s_scale)
        for i in range(len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][action_batch[i]] = reward_batch[i] + GAMMA * q_t_1[i]

    # Update critic by minimizing the loss
    Q_Network.train_critic(state_t_batch, action_t_batch)
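
# The sparsemax branches depend on Exploration.sparsemax. One common definition
# of a sparsemax state value comes from sparse MDPs:
#   V(s) = s_scale * spmax(Q(s, .) / s_scale),
#   spmax(z) = 0.5 * sum_{i in S(z)} (z_i^2 - tau(z)^2) + 0.5,
# where S(z) is the sparsemax support and tau(z) its threshold. The sketch below
# assumes that definition; the repo's actual operator may differ.
def sparsemax_V_sketch(q_batch, s_scale):
    z = np.asarray(q_batch, dtype=np.float64) / s_scale
    z_sorted = np.sort(z, axis=1)[:, ::-1]                    # each row descending
    k = np.arange(1, z.shape[1] + 1)
    cssv = np.cumsum(z_sorted, axis=1)                        # cumulative sums
    support = 1.0 + k * z_sorted > cssv                       # support set S(z)
    k_z = support.sum(axis=1)                                 # support size per row
    tau = (cssv[np.arange(z.shape[0]), k_z - 1] - 1.0) / k_z  # per-row threshold
    spmax = 0.5 * np.sum(np.where(support, z_sorted ** 2 - tau[:, None] ** 2, 0.0), axis=1) + 0.5
    return s_scale * spmax                                    # one value per row
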
def train_prioritized(Q_Network, train_batch, w_batch, Exp, s_scale, input_size, num_actions, size_action_batch):
    state_t_batch, state_t_1_batch, action_batch, reward_batch, done_batch = zip(*train_batch)
    state_t_batch = np.array(state_t_batch)
    state_t_1_batch = np.array(state_t_1_batch)
    action_batch = np.array(action_batch)
    reward_batch = np.array(reward_batch)
    done_batch = np.array(done_batch)
    batch_size = len(done_batch)

    q_t_1_batch = []
    if Exp == 'epsilon':
        # Per-action Q-values of the next state, from the target Q-network.
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [batch_size, -1])
        # V(next_state) according to the exploration scheme: here a hard max.
        q_t_1_batch = np.max(q_t_1_batch, axis=1)
        # Bellman target Q(state, action), stored in q_t_1_batch.
        q_t_1_batch = reward_batch + GAMMA * q_t_1_batch  # *(1-done_batch)
    elif Exp == 'softmax':
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [batch_size, -1])
        q_t_1_batch = Exploration.softV(q_t_1_batch, s_scale)
        q_t_1_batch = reward_batch + GAMMA * q_t_1_batch  # *(1-done_batch)
    elif Exp == 'sparsemax':
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [batch_size, -1])
        q_t_1_batch = Exploration.sparsemax(q_t_1_batch, s_scale)
        q_t_1_batch = reward_batch + GAMMA * q_t_1_batch  # *(1-done_batch)

    q_t_1_batch = np.reshape(q_t_1_batch, [-1, 1])
    w_batch = np.reshape(w_batch, [-1, 1])

    # Train the Q-network to minimize the error against q_t_1_batch,
    # applying the importance-sampling weights.
    errors, cost, _ = Q_Network.train_critic_prioritized(
        state_t_batch, action_batch, q_t_1_batch, w_batch)
    errors = np.sum(errors, axis=1)
    return errors, cost, state_t_batch
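
# The w_batch argument is assumed to carry standard prioritized-replay
# importance-sampling weights (Schaul et al., 2016). A hypothetical helper
# showing the usual computation (names and API are not from this repo):
def is_weights_sketch(sampled_probs, buffer_size, beta):
    w = (buffer_size * np.asarray(sampled_probs, dtype=np.float64)) ** (-beta)  # bias correction
    return w / w.max()  # normalize so weights are at most 1
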
def train(Q_Network, train_batch, Exp, s_scale, input_size, num_actions, size_action_batch):
    state_t_batch, state_t_1_batch, action_batch, reward_batch, done_batch = zip(*train_batch)
    state_t_batch = np.array(state_t_batch)
    state_t_1_batch = np.array(state_t_1_batch)
    action_batch = np.array(action_batch)
    reward_batch = np.array(reward_batch)
    done_batch = np.array(done_batch)
    batch_size = len(done_batch)

    q_t_1_batch = []
    if Exp == 'epsilon':
        for i in range(len(train_batch)):
            q_t_1_batch.append(np.reshape(
                Q_Network.get_target_q_batch(np.reshape(state_t_1_batch[i], [1, -1])), [1, -1])[0])
        q_t_1_batch = reward_batch + GAMMA * np.max(q_t_1_batch, axis=1)  # *(1-done_batch)
    elif Exp == 'softmax':
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [batch_size, -1])
        q_t_1_batch = Exploration.softV(q_t_1_batch, s_scale)
        q_t_1_batch = reward_batch + GAMMA * q_t_1_batch  # *(1-done_batch)
    elif Exp == 'sparsemax':
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [batch_size, -1])
        q_t_1_batch = Exploration.sparsemax(q_t_1_batch, s_scale)
        q_t_1_batch = reward_batch + GAMMA * q_t_1_batch  # *(1-done_batch)

    q_t_1_batch = np.reshape(q_t_1_batch, [-1, 1])
    cost, _ = Q_Network.train_critic(state_t_batch, action_batch, q_t_1_batch)
    return cost, state_t_batch
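
# Note that the terminal mask is commented out in the batched targets above
# (`# *(1-done_batch)`), so bootstrapping is not cut off at episode ends. With
# done_batch cast to 0/1 floats, the masked Bellman target is a one-liner:
#   q_t_1_batch = reward_batch + GAMMA * q_t_1_batch * (1.0 - done_batch.astype(np.float64))
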
def train_error(Q_Network, train_batch, Exp, s_scale, input_size, num_actions):
    state_t_batch = np.array([item[0] for item in train_batch])
    state_t_1_batch = np.array([item[1] for item in train_batch])
    action_batch = [item[2] for item in train_batch]
    reward_batch = np.array([item[3] for item in train_batch])
    # Note: done_batch here is a *not-done* mask (1 for non-terminal transitions),
    # so multiplying by it zeroes the bootstrap term at episode ends.
    done_batch = np.array([1 if item[4] == False else 0 for item in train_batch])

    q_t_batch = Q_Network.evaluate_critic(state_t_batch)
    q_t = np.array([q_t_batch[i][action_batch[i]] for i in range(len(train_batch))])

    action_t_1_batch = Q_Network.evaluate_critic(state_t_1_batch)
    q_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)

    if Exp == 'epsilon':
        q_t_1 = np.array([q_t_1_batch[i][np.argmax(action_t_1_batch[i])]
                          for i in range(len(train_batch))])
        error_batch = GAMMA * q_t_1 * done_batch + reward_batch - q_t
    elif Exp == 'softmax':
        q_t_1 = Exploration.softV(q_t_1_batch, s_scale)
        error_batch = GAMMA * q_t_1 * done_batch + reward_batch - q_t
    elif Exp == 'sparsemax':
        # q_t_1 = Exploration.sparsemax(q_t_1_batch, s_scale)
        # error_batch = GAMMA*q_t_1*done_batch + reward_batch - q_t
        q_t_1 = np.array([q_t_1_batch[i][np.argmax(action_t_1_batch[i])]
                          for i in range(len(train_batch))])
        error_batch = GAMMA * q_t_1 * done_batch + reward_batch - q_t

    return error_batch
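
# train_error recomputes TD errors without taking a gradient step; in
# prioritized replay such errors typically seed or refresh priorities.
# Hypothetical usage (the buffer API below is an assumption, not this repo's):
#   error_batch = train_error(Q_Network, train_batch, Exp, s_scale,
#                             input_size, num_actions)
#   buffer.update_priorities(idx, np.abs(error_batch) + 1e-6)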