def replay_train(mainDQN: dqn.DQN, targetDQN: dqn.DQN, train_batch: list) -> float:
    """Trains `mainDQN` with target Q values given by `targetDQN`

    Args:
        mainDQN (dqn.DQN): Main DQN that will be trained
        targetDQN (dqn.DQN): Target DQN that will predict Q_target
        train_batch (list): Minibatch of replay memory
            Each element is (s, a, r, s', done)
            [(state, action, reward, next_state, done), ...]

    Returns:
        float: After updating `mainDQN`, it returns a `loss`
    """
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    X = states

    Q_target = rewards + DISCOUNT_RATE * np.max(targetDQN.predict(next_states), axis=1) * ~done

    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)
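# For context, a minimal sketch of the training loop that typically drives
# replay_train. Only replay_train itself comes from the code above; env,
# mainDQN, targetDQN, sess, copy_ops, and the hyperparameters below are
# hypothetical names used for illustration (copy_ops is sketched further below).
import random
from collections import deque

import numpy as np

BATCH_SIZE = 64                  # assumed minibatch size
TARGET_UPDATE_FREQUENCY = 5      # assumed sync interval (in episodes)
REPLAY_MEMORY = 50000            # assumed buffer capacity
MAX_EPISODES = 5000              # assumed training length

replay_buffer = deque(maxlen=REPLAY_MEMORY)

for episode in range(MAX_EPISODES):
    e = 1.0 / ((episode / 10) + 1)   # annealed epsilon for e-greedy exploration
    state = env.reset()
    done = False

    while not done:
        # e-greedy action selection against the main network
        if np.random.rand() < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(mainDQN.predict(state))

        next_state, reward, done, _ = env.step(action)

        # Store the transition; the deque drops the oldest entries automatically
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state

        # Train on a random minibatch once the buffer is large enough
        if len(replay_buffer) > BATCH_SIZE:
            minibatch = random.sample(replay_buffer, BATCH_SIZE)
            loss = replay_train(mainDQN, targetDQN, minibatch)

    # Periodically copy the main network's weights into the target network
    if episode % TARGET_UPDATE_FREQUENCY == 0:
        sess.run(copy_ops)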
def train_sample_replay1(mainDQN: m.DQN, targetDQN: m.DQN, train_batch: list) -> float:
    """Prepare X_batch, y_batch and train on them

    Recall that the loss function is
        target = reward + discount * max Q(s', a), or reward (if done early)
        Loss: [target - Q(s, a)]^2

    Hence,
        X_batch is a list of states
        y_batch is reward + discount * max Q, or reward (if terminated early)

    Args:
        mainDQN (m.DQN): DQN Agent to train & run
        targetDQN (m.DQN): DQN Agent that sets the target Q value
        train_batch (list): Minibatch of "Sample" replay memory
            Each element is a tuple of (state, action, reward, next_state, done)

    Returns:
        loss: Returns a loss
    """
    x_stack = np.empty(0).reshape(0, mainDQN.input_size)
    y_stack = np.empty(0).reshape(0, mainDQN.output_size)

    # Get stored transitions from the buffer
    for state, action, reward, next_state, done in train_batch:
        # Q values we want to learn toward (the Q target)
        Q = mainDQN.predict(state)  # Q values of the current state

        if done:
            # Terminal state: the target is just the reward
            Q[0, action] = reward
        else:
            # Non-terminal: bootstrap from the target DQN
            Q[0, action] = reward \
                + dis * np.max(targetDQN.predict(next_state))

        x_stack = np.vstack([x_stack, state])
        y_stack = np.vstack([y_stack, Q])

    # Train our network using target and predicted Q values on each episode
    # DQN 2015: the Q target is obtained from the target DQN,
    # while the weight update is applied to the main DQN.
    cost, _ = mainDQN.update(x_stack, y_stack)
    return cost
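# The comments above note that the Q target comes from a separate target DQN.
# That target network has to be synchronized with the main network periodically.
# Below is a minimal TF1-style sketch of building the copy ops; the scope names
# "main" and "target" and the helper name get_copy_var_ops are assumptions for
# illustration, not taken from the code above.
import tensorflow as tf


def get_copy_var_ops(dest_scope_name: str = "target", src_scope_name: str = "main") -> list:
    """Returns assign ops that copy every trainable variable from src scope to dest scope."""
    op_holder = []

    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)

    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))

    return op_holder


# Typical usage: build the ops once after both networks exist, then run them
# every few episodes, e.g. copy_ops = get_copy_var_ops(); sess.run(copy_ops)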
def train_sample_replay(mainDQN: m.DQN, targetDQN: m.DQN, train_batch: list) -> float:
    """Vectorized variant: trains `mainDQN` using target Q values from `targetDQN`"""
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    X = states

    # Q_target = r + dis * max_a' Q_target(s', a'); the `~done` mask zeroes the
    # bootstrap term on terminal transitions
    Q_target = rewards + dis * np.max(targetDQN.predict(next_states), axis=1) * ~done

    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)
def train_sample_replay(dqn: m.DQN, train_batch: list) -> float:
    """Prepare X_batch, y_batch and train on them

    Recall that the loss function is
        target = reward + discount * max Q(s', a), or reward (if done early)
        Loss: [target - Q(s, a)]^2

    Hence,
        X_batch is a list of states
        y_batch is reward + discount * max Q, or reward (if terminated early)

    Args:
        dqn (m.DQN): DQN Agent to train & run
        train_batch (list): Minibatch of "Sample" replay memory
            Each element is a tuple of (state, action, reward, next_state, done)

    Returns:
        loss: Returns a loss
    """
    x_stack = np.empty(0).reshape(0, dqn.input_size)
    y_stack = np.empty(0).reshape(0, dqn.output_size)

    # Get stored transitions from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = dqn.predict(state)

        if done:  # terminal?
            Q[0, action] = reward
        else:
            Q[0, action] = reward + dis * np.max(dqn.predict(next_state))

        x_stack = np.vstack([x_stack, state])
        y_stack = np.vstack([y_stack, Q])

    # Train our network using target and predicted Q values on each episode
    cost, _ = dqn.update(x_stack, y_stack)
    return cost
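# Quick numeric sanity check of the target rule in the docstring above; the
# values below are made up purely for illustration.
import numpy as np

dis_example = 0.99
reward_example = 1.0
q_next_example = np.array([[0.5, 2.0]])   # hypothetical Q(s') row from the network

target_non_terminal = reward_example + dis_example * np.max(q_next_example)  # 1.0 + 0.99 * 2.0 = 2.98
target_terminal = reward_example                                             # bootstrap term dropped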
def bot_play(mainDQN: m.DQN) -> None:
    """Runs a single episode with rendering and prints the total reward

    Args:
        mainDQN (m.DQN): DQN Agent
    """
    state = env.reset()
    reward_sum = 0

    while True:
        env.render()
        action = np.argmax(mainDQN.predict(state))
        state, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            print("Total score: {}".format(reward_sum))
            break
def bot_play(mainDQN: dqn.DQN, env: gym.Env) -> None:
    """Test runs with rendering and prints the total score

    Args:
        mainDQN (dqn.DQN): DQN agent to run a test
        env (gym.Env): Gym Environment
    """
    state = env.reset()
    reward_sum = 0

    while True:
        env.render()
        action = np.argmax(mainDQN.predict(state))
        state, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            print("Total score: {}".format(reward_sum))
            break
def bot_play(mainDQN: m.DQN) -> None:
    """Runs a single episode with rendering and prints the total reward

    Args:
        mainDQN (m.DQN): DQN Agent
    """
    # Initial values
    state = env.reset()
    reward_sum = 0

    # Run the episode
    while True:
        # Render the environment to the screen
        env.render()
        # Predict the next action
        action = np.argmax(mainDQN.predict(state))
        # Apply the action to the environment
        state, reward, done, _ = env.step(action)
        reward_sum += reward

        # Stop if the episode is over or the score exceeds 10000
        if done or reward_sum > 10000:
            print("Total score: {}".format(reward_sum))
            break
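# Once training has finished, the learned greedy policy can be inspected by
# calling bot_play directly; mainDQN and env are assumed to already exist here.
for _ in range(5):
    bot_play(mainDQN)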