Example #1
def simple_replay_train(mainDQN, targetDQN, train_batch, optimizer):
    Q_val_List = []
    Q_target_val_List = []

    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN(convertToTensorInput(state, input_size))
        # The target network scores the next state; detach so no gradient flows into it
        Q1 = targetDQN(convertToTensorInput(next_state, input_size)).detach()
        maxQ1 = torch.max(Q1)

        # Build the TD target from the main network's prediction and
        # overwrite only the entry for the action that was actually taken
        Q_target = Q.detach().clone()
        if done:
            Q_target[0, action] = reward
        else:
            Q_target[0, action] = reward + dis * maxQ1

        Q_val_List.append(Q)
        Q_target_val_List.append(Q_target)

    Q_val_List = torch.stack(Q_val_List).squeeze(1)
    Q_target_val_List = torch.stack(Q_target_val_List).squeeze(1)
    loss = torch.mean((Q_val_List - Q_target_val_List)**2)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss
def running():
    max_episode = 5000
    replay_buffer = deque()

    mainDQN = DQN(input_size, output_size)
    targetDQN = DQN(input_size, output_size)

    update_target(mainDQN, targetDQN)

    optimizer = torch.optim.Adam(mainDQN.parameters(), lr=1e-1)

    for episode in range(max_episode):
        e = 1 / ((episode / 10) + 1)
        done = False
        step_count = 0

        state = env.reset()

        while not done:
            # epsilon-greedy action selection
            if np.random.rand(1) < e:
                action = env.action_space.sample()
            else:
                q_val = mainDQN(convertToTensorInput(state, input_size))
                _, action = torch.max(q_val, 1)
                action = action.data[0].item()

            # Take the action and get the next state, reward, and done flag from gym
            next_state, reward, done, _ = env.step(action)

            # Store the transition in the replay buffer
            replay_buffer.append((state, action, reward, next_state, done))
            if len(replay_buffer) > REPLAY_MEMORY:
                replay_buffer.popleft()

            state = next_state
            step_count += 1

            if step_count > 10000:
                break
            env.render()

        print("Episode: {} steps: {}".format(episode, step_count))
        if step_count > 10000:
            pass

        # Every 30 episodes, train on random minibatches and then refresh the target network
        if episode % 30 == 1 and len(replay_buffer) >= 30:
            for _ in range(50):
                minibatch = random.sample(replay_buffer, 30)
                loss = simple_replay_train(mainDQN, targetDQN, minibatch,
                                           optimizer)
            print("Loss", loss.item())
            update_target(mainDQN, targetDQN)

    bot_play(mainDQN)

    env.close()
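
Both examples call a few helpers that this listing does not show (convertToTensorInput, update_target, DQN, bot_play, RL_MODEL). As a rough sketch of the two that every training step depends on, assuming the observation is a flat NumPy array of length input_size and that update_target performs a hard copy of the weights:

import numpy as np
import torch

def convertToTensorInput(state, input_size):
    # Reshape the flat observation into a (1, input_size) float tensor for the network
    state = np.reshape(state, [1, input_size])
    return torch.FloatTensor(state)

def update_target(mainDQN, targetDQN):
    # Hard update: copy every parameter of the main network into the target network
    targetDQN.load_state_dict(mainDQN.state_dict())

With these in place, the target network stays frozen between the periodic update_target calls, which is what keeps the TD targets in simple_replay_train stable.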
Example #3
dis = 0.9
rList = []

model = RL_MODEL()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for i in range(num_episode):
    e = 1. / ((i / 10) + 1)
    rAll = 0
    step_count = 0
    s = env.reset()
    done = False

    while not done:
        step_count += 1
        Qs = model(convertToTensorInput(s, input_size))

        if np.random.rand(1) < e:
            action = env.action_space.sample()
        else:
            _, action = torch.max(Qs, 1)
            action = action.data[0].item()

        new_state, reward, done, _ = env.step(action)

        Q1 = model(convertToTensorInput(new_state, input_size))
        maxQ1 = torch.max(Q1.data)

        # Detach and clone so the target does not share storage or gradients with Qs
        targetQ = Qs.detach().clone()

        if done: