def replay_train(mainDQN: dqn.DQN, targetDQN: dqn.DQN,
                 train_batch: list) -> float:
    """Trains `mainDQN` with target Q values given by `targetDQN`

    Args:
        mainDQN (dqn.DQN): Main DQN that will be trained
        targetDQN (dqn.DQN): Target DQN that will predict Q_target
        train_batch (list): Minibatch of replay memory
            Each element is (s, a, r, s', done)
            [(state, action, reward, next_state, done), ...]

    Returns:
        float: Loss value after one update of `mainDQN`
    """
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    X = states

    # Bootstrap only for non-terminal transitions: `done` is a boolean array,
    # so `~done` zeroes the discounted max-Q term once the episode has ended
    Q_target = rewards + DISCOUNT_RATE * np.max(targetDQN.predict(next_states),
                                                axis=1) * ~done

    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)
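A minimal sketch of how `replay_train` is typically driven from a replay buffer. The buffer, the constants, and the `copy_ops`/`sess`/`episode` names are assumptions for illustration and are not defined in the snippet above.

# Sketch only: driving replay_train from a replay buffer (assumed names).
import random
from collections import deque

BATCH_SIZE = 64
TARGET_UPDATE_FREQUENCY = 5
replay_buffer = deque(maxlen=50000)

# ... inside the training loop, after appending (state, action, reward,
# next_state, done) tuples to replay_buffer:
if len(replay_buffer) > BATCH_SIZE:
    minibatch = random.sample(replay_buffer, BATCH_SIZE)
    loss = replay_train(mainDQN, targetDQN, minibatch)

# DQN 2015: periodically copy mainDQN's weights into targetDQN
# (`copy_ops` and `sess` are assumed to exist elsewhere in the module).
if episode % TARGET_UPDATE_FREQUENCY == 0:
    sess.run(copy_ops)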
Example #2
def train_sample_replay1(mainDQN: m.DQN, targetDQN: m.DQN,
                         train_batch: list) -> float:
    """Prepare X_batch, y_batch and train them

    Recall our loss function is
        target = reward + discount * max Q(s',a)
                 or reward (if done early)
        Loss function: [target - Q(s, a)]^2

    Hence,
        X_batch is a state list
        y_batch is reward + discount * max Q
                   or reward (if terminated early)

    Args:
        mainDQN (m.DQN): DQN Agent to train & run
        targetDQN (m.DQN): DQN Agent to set target Q value
        train_batch (list): Mini batch of "Sample" Replay memory
            Each element is a tuple of (state, action, reward, next_state, done)

    Returns:
        float: Loss value after the update
    """
    x_stack = np.empty(0).reshape(0, mainDQN.input_size)
    y_stack = np.empty(0).reshape(0, mainDQN.output_size)

    # get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        # Q target we want to train toward
        Q = mainDQN.predict(state)  # current-state Q values # todo: use targetDQN

        if done:  # terminal state: the target is just the reward
            Q[0, action] = reward
        else:
            Q[0, action] = reward \
                           + dis * np.max(targetDQN.predict(next_state))

        x_stack = np.vstack([x_stack, state])
        y_stack = np.vstack([y_stack, Q])

    # Train our network using target and predicted Q values on each episode
    # DQN 2015: the Q target is computed with the target DQN,
    # while the weight update is applied to the main DQN.
    cost, _ = mainDQN.update(x_stack, y_stack)

    return cost
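The comments above describe the 2015 two-network setup. Below is a sketch of the weight-copy op that setup usually relies on, assuming a TensorFlow 1.x graph with the main and target networks built under variable scopes "main" and "target"; the helper name `get_copy_var_ops` is an assumption, not part of the snippet.

import tensorflow as tf

def get_copy_var_ops(dest_scope_name: str = "target",
                     src_scope_name: str = "main") -> list:
    """Build ops that copy trainable variables from src scope to dest scope."""
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=dest_scope_name)
    return [dest_var.assign(src_var.value())
            for src_var, dest_var in zip(src_vars, dest_vars)]

# Typical use: build once after constructing both networks, then run
# `sess.run(copy_ops)` every few episodes to refresh targetDQN.
# copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")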
Example #3
def train_sample_replay(mainDQN: m.DQN, targetDQN: m.DQN,
                        train_batch: list) -> float:
    """Vectorized variant: builds the whole Q-target batch with `targetDQN`
    and updates `mainDQN` in a single call.

    Args:
        mainDQN (m.DQN): Main DQN to train
        targetDQN (m.DQN): Target DQN that provides the bootstrap Q values
        train_batch (list): Minibatch of (state, action, reward, next_state, done)

    Returns:
        float: Loss after the update
    """
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    X = states

    Q_target = rewards + dis * np.max(targetDQN.predict(next_states),
                                      axis=1) * ~done

    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)
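A small self-contained check (toy numbers, not taken from the snippet) showing that the `* ~done` mask above reproduces the per-sample if/else of the loop-based variant: for terminal transitions the bootstrap term is dropped and the target collapses to the reward.

import numpy as np

dis = 0.99
rewards = np.array([1.0, 1.0, -1.0])
max_next_q = np.array([5.0, 2.0, 7.0])   # stand-in for np.max(targetDQN.predict(...), axis=1)
done = np.array([False, False, True])    # must be a boolean array for ~done to work

# Vectorized form used above: the bootstrap term is zeroed where done is True
vectorized = rewards + dis * max_next_q * ~done

# Per-sample form used in the loop-based variant
looped = np.array([r if d else r + dis * q
                   for r, q, d in zip(rewards, max_next_q, done)])

assert np.allclose(vectorized, looped)
print(vectorized)  # [ 5.95  2.98 -1.  ]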
Example #4
def train_sample_replay(dqn: m.DQN, train_batch: list) -> float:
    """Prepare X_batch, y_batch and train them

    Recall our loss function is
        target = reward + discount * max Q(s',a)
                 or reward (if done early)
        Loss function: [target - Q(s, a)]^2

    Hence,
        X_batch is a state list
        y_batch is reward + discount * max Q
                   or reward (if terminated early)

    Args:
        dqn (m.DQN): DQN Agent to train & run
        train_batch (list): Mini batch of "Sample" Replay memory
            Each element is a tuple of (state, action, reward, next_state, done)

    Returns:
        float: Loss value after the update
    """
    x_stack = np.empty(0).reshape(0, dqn.input_size)
    y_stack = np.empty(0).reshape(0, dqn.output_size)

    # get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = dqn.predict(state)

        if done:  # terminal?
            Q[0, action] = reward
        else:
            Q[0, action] = reward + dis * np.max(dqn.predict(next_state))

        x_stack = np.vstack([x_stack, state])
        y_stack = np.vstack([y_stack, Q])

    # Train our network using target and predicted Q values on each episode
    cost, _ = dqn.update(x_stack, y_stack)

    return cost
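For context, a sketch of the episode loop this single-network variant is usually called from, with epsilon-greedy exploration and a bounded replay buffer. `env`, `dqn`, and all constants are assumed names for illustration only.

import random
from collections import deque
import numpy as np

REPLAY_MEMORY = 50000
MAX_EPISODES = 5000
BATCH_SIZE = 10

replay_buffer = deque(maxlen=REPLAY_MEMORY)

for episode in range(MAX_EPISODES):
    e = 1.0 / ((episode / 10) + 1)   # decaying epsilon
    state = env.reset()
    done = False

    while not done:
        if np.random.rand() < e:
            action = env.action_space.sample()       # explore
        else:
            action = np.argmax(dqn.predict(state))   # exploit

        next_state, reward, done, _ = env.step(action)
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state

    # Train on a few random minibatches every 10 episodes
    if episode % 10 == 1 and len(replay_buffer) > BATCH_SIZE:
        for _ in range(50):
            minibatch = random.sample(replay_buffer, BATCH_SIZE)
            loss = train_sample_replay(dqn, minibatch)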
Example #5
def bot_play(mainDQN: m.DQN) -> None:
    """Runs a single episode with rendering and prints a reward

    Args:
        mainDQN (m.DQN): DQN Agent
    """
    state = env.reset()
    reward_sum = 0
    while True:
        env.render()
        action = np.argmax(mainDQN.predict(state))
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            print("Total score: {}".format(reward_sum))
            break
Example #6
def bot_play(mainDQN: dqn.DQN, env: gym.Env) -> None:
    """Test runs with rendering and prints the total score

    Args:
        mainDQN (dqn.DQN): DQN agent to run a test
        env (gym.Env): Gym Environment
    """
    state = env.reset()
    reward_sum = 0

    while True:

        env.render()
        action = np.argmax(mainDQN.predict(state))
        state, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            print("Total score: {}".format(reward_sum))
            break
Example #7
def bot_play(mainDQN: m.DQN) -> None:
    """Runs a single episode with rendering and prints a reward

    Args:
        mainDQN (m.DQN): DQN Agent
    """
    # Initialize state and score
    state = env.reset()
    reward_sum = 0

    # Run the episode
    while True:
        # Render the environment to the screen
        env.render()
        # Predict the next action greedily
        action = np.argmax(mainDQN.predict(state))
        # Apply the action to the environment
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        # Stop when the episode ends or the accumulated reward exceeds 10000
        if done or reward_sum > 10000:
            print("Total score: {}".format(reward_sum))
            break
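The `bot_play` variants above assume the classic Gym API (4-tuple `env.step`, rendering via `env.render()`). A hedged adaptation for Gymnasium / Gym >= 0.26, where `reset()` returns `(obs, info)` and `step()` returns a 5-tuple; the environment id "CartPole-v1" and the function name are assumptions for illustration.

import gymnasium as gym
import numpy as np

def bot_play_gymnasium(mainDQN) -> None:
    """Same greedy rollout as bot_play, adapted to the Gymnasium API."""
    env = gym.make("CartPole-v1", render_mode="human")  # rendering requested at make()
    state, _ = env.reset()                               # reset() returns (obs, info)
    reward_sum = 0.0

    while True:
        action = int(np.argmax(mainDQN.predict(state)))
        state, reward, terminated, truncated, _ = env.step(action)  # 5-tuple step()
        reward_sum += reward
        if terminated or truncated:
            print("Total score: {}".format(reward_sum))
            break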