# Drop the leading bit/column from the positive and negative hash codes so
# they match the anchor code used below.
hash_p = hash_p[:, 1:]
hash_n = hash_n[:, 1:]

#### loss
# Triplet margin loss between anchor (hash_o), positive and negative codes.
tri_loss = function.triplet_margin_loss(hash_o, hash_p, hash_n)
# Per-sample log-probability normalized by the code length.
# NOTE(review): `log_porb` looks like a typo for `log_prob` in the `function`
# module — confirm against that module's definition before renaming.
tmp_prob = function.log_porb(hash_o) / bit_len
# REINFORCE-style surrogate: log-prob weighted by the detached triplet loss,
# so gradients flow only through the log-probability term.
loss_L = torch.mean(tmp_prob * tri_loss.detach())
# Plain differentiable triplet loss term.
loss_R = torch.mean(tri_loss)
# Convex mix of the two objectives controlled by `lamda`.
final_loss = lamda * loss_L + loss_R * (1 - lamda)

#### update
model.zero_grad()
final_loss.backward()
# NOTE(review): `step()` is usually called on an optimizer, not the model —
# confirm `model` wraps/forwards to an optimizer here.
model.step()

episode_length += 1
if episode_length % 20 == 0:
    # Build the log line once and reuse it for console and file output
    # (byte-identical to the original concatenation).
    msg = (str(episode_length) + ' ' + str(final_loss.item()) + " "
           + str(loss_L.item()) + " " + str(loss_R.item()) + "\n")
    print(msg)
    # Context manager guarantees the log file is closed even on exception.
    with open(logpath, "a") as file:
        file.write(msg)
# Decode the replay-buffer columns; each row is assumed to be
# [prev_obs, action, reward, obs, done] — confirm against the buffer writer.
b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
b_obs = np.array(batch[:, 3].tolist(), dtype=np.float32).reshape(batch_size, -1)
# BUG FIX: the `np.bool` alias was removed in NumPy 1.24; use builtin bool.
b_done = np.array(batch[:, 4].tolist(), dtype=bool)

# Q-values for previous observations (online net) and next observations
# (target net Q_ast).
q = Q(torch.from_numpy(b_pobs))[0]
q_ = Q_ast(torch.from_numpy(b_obs))[0]
maxq = np.max(q_.data.numpy(), axis=1)

# Bellman targets: copy the online Q-values and overwrite only the entry of
# the action actually taken; terminal transitions get no bootstrapped value
# because `not b_done[j]` zeroes the future term.
target = copy.deepcopy(q.data)
for j in range(batch_size):
    target[j, b_pact[j]] = b_reward[j] + gamma * maxq[j] * (not b_done[j])

Q.zero_grad()
loss = loss_function(q, target)
# `.item()` directly — going through `.data` is legacy PyTorch style.
total_loss += loss.item()
loss.backward()
optimizer.step()

# Periodically sync the target network with the online network.
if total_step % update_q_freq == 0:
    Q_ast = copy.deepcopy(Q)

# epsilon: linear decay after the warm-up period.
if epsilon > epsilon_min and total_step > start_reduce_epsilon:
    epsilon -= epsilon_decrease

# next step
total_reward += reward
pobs = obs