# TD error for this transition:
# |r + gamma * Q_targ(s', argmax_a Q(s', a)) - max_a Q(s, a)|.
# It is used as the sample's priority in the replay memory.
loss = reward - np.amax(Q_out)
if not done:
    new_state = prepare_state(new_state)
    # Double-DQN style target: the online network Q selects the
    # next action, the target network Q_targ evaluates it.
    Q_out = Q(FloatTensor(new_state)).to('cpu').detach().numpy()[0]
    Q_targ_out = Q_targ(FloatTensor(new_state)).to('cpu').detach().numpy()[0]
    loss += gamma * Q_targ_out[np.argmax(Q_out)]
loss = abs(loss)

# Store the transition together with its priority.
replay_mem.add_element((curr_state, action, reward, new_state), loss)
curr_state = new_state

#
# Learning
#
sarses = replay_mem.get_batch(batch_size)

# Targets
Q_true = []
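
# The loop above assumes a prioritized replay memory exposing
# add_element(transition, priority) and get_batch(batch_size).
# Below is a minimal sketch of such a buffer, sampling transitions
# proportionally to their |TD error|; the class name, capacity
# handling, and the 1e-6 priority floor are illustrative
# assumptions, not the original implementation.
import random

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []      # (state, action, reward, new_state) tuples
        self.priorities = []  # one |TD error| per stored transition

    def add_element(self, sars, priority):
        # Evict the oldest transition once capacity is reached.
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
            self.priorities.pop(0)
        self.buffer.append(sars)
        # A small floor keeps zero-error transitions selectable.
        self.priorities.append(priority + 1e-6)

    def get_batch(self, batch_size):
        # Sample with probability proportional to priority, so
        # high-error transitions are replayed more often.
        return random.choices(self.buffer, weights=self.priorities, k=batch_size)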