Example #1
    # drop the first column of the positive and negative hash codes
    hash_p = hash_p[:, 1:]
    hash_n = hash_n[:, 1:]

    #### loss
    # triplet margin loss between anchor, positive and negative codes
    tri_loss = function.triplet_margin_loss(hash_o, hash_p, hash_n)

    # bit-length-normalised log-probability of the sampled codes
    tmp_prob = function.log_prob(hash_o) / bit_len
    # policy-gradient term: log-probability weighted by the (detached) triplet loss
    loss_L = torch.mean(tmp_prob * tri_loss.detach())
    # plain supervised triplet term
    loss_R = torch.mean(tri_loss)

    # convex combination of the two terms, weighted by lamda
    final_loss = lamda * loss_L + (1 - lamda) * loss_R

    #### update

    model.zero_grad()

    final_loss.backward()

    # the optimizer, not the model, applies the gradient step
    optimizer.step()

    episode_length += 1
    if episode_length % 20 == 0:
        log_line = (str(episode_length) + ' ' + str(final_loss.item()) + " " +
                    str(loss_L.item()) + " " + str(loss_R.item()) + "\n")
        print(log_line)
        with open(logpath, "a") as log_file:
            log_file.write(log_line)
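The fragment above assumes a `function` module that wraps both the triplet loss and a per-code log-probability helper. A minimal, self-contained sketch of the same loss composition, with `function.log_prob` swapped for a hypothetical Bernoulli stand-in and dummy tensors, might look like this:

import torch
import torch.nn.functional as F

bit_len, lamda = 32, 0.5
hash_o = torch.randn(8, bit_len, requires_grad=True)  # anchor codes
hash_p = torch.randn(8, bit_len)                       # positive codes
hash_n = torch.randn(8, bit_len)                       # negative codes

tri_loss = F.triplet_margin_loss(hash_o, hash_p, hash_n)

# assumed stand-in for function.log_prob: Bernoulli log-likelihood of the codes
log_prob = torch.sum(F.logsigmoid(hash_o), dim=1) / bit_len

loss_L = torch.mean(log_prob * tri_loss.detach())  # policy-gradient term
loss_R = torch.mean(tri_loss)                      # plain triplet term
final_loss = lamda * loss_L + (1 - lamda) * loss_R
final_loss.backward()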
                    # unpack the replay-buffer batch: actions, rewards,
                    # next observations and terminal flags
                    b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
                    b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                    b_obs = np.array(batch[:, 3].tolist(),
                                     dtype=np.float32).reshape(batch_size, -1)
                    b_done = np.array(batch[:, 4].tolist(), dtype=bool)

                    # Q-values for the previous observations from the online
                    # network and for the next observations from the target network
                    q = Q(torch.from_numpy(b_pobs))[0]
                    q_ = Q_ast(torch.from_numpy(b_obs))[0]

                    # Bellman target: r + gamma * max_a' Q_ast(s', a'),
                    # with the bootstrap term dropped on terminal transitions
                    maxq = np.max(q_.data.numpy(), axis=1)
                    target = copy.deepcopy(q.data)
                    for j in range(batch_size):
                        target[j, b_pact[j]] = (b_reward[j] +
                                                gamma * maxq[j] * (not b_done[j]))
                    Q.zero_grad()
                    loss = loss_function(q, target)
                    total_loss += loss.data.item()
                    loss.backward()
                    optimizer.step()

            # periodically sync the target network with the online network
            if total_step % update_q_freq == 0:
                Q_ast = copy.deepcopy(Q)

            # linearly anneal the exploration rate epsilon
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease

            # next step
            total_reward += reward
            pobs = obs
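The per-sample target loop in the DQN fragment above can also be written without an explicit Python loop. This is a minimal vectorized sketch, assuming `q` and `q_` are plain float tensors of shape (batch_size, n_actions) and that `b_pact`, `b_reward`, `b_done` and `gamma` are the same quantities as above; the helper name `dqn_targets` is hypothetical:

import numpy as np
import torch

def dqn_targets(q, q_, b_pact, b_reward, b_done, gamma):
    # Bellman targets: r + gamma * max_a' Q_ast(s', a'), masked on terminal steps
    target = q.detach().clone()
    maxq = q_.detach().max(dim=1).values
    boot = torch.as_tensor(b_reward, dtype=q.dtype) + \
        gamma * maxq * torch.as_tensor(~b_done, dtype=q.dtype)
    target[torch.arange(len(b_pact)), torch.as_tensor(b_pact, dtype=torch.long)] = boot
    return target

# hypothetical usage with dummy data
q = torch.zeros(4, 3)
q_ = torch.rand(4, 3)
targets = dqn_targets(q, q_,
                      np.array([0, 1, 2, 0]),
                      np.array([1, 0, 1, 0], dtype=np.int32),
                      np.array([False, False, True, False]),
                      gamma=0.97)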