def get_nearest_neighbors(w, embeddings, k=1000):
    """
    For every transition in every pattern, gets the word with the highest
    score for that transition. Only looks at the first `k` words in the
    vocab (makes sense, assuming they're sorted by descending frequency).
    """
    return argmax(torch.mm(w, embeddings[:k, :]))
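
# Standalone sketch of the same lookup on toy tensors (the shapes and the
# row-per-word layout below are assumptions for the demo, not this module's
# calling convention, and the function name is made up): score every word
# against every transition vector with one matrix multiply, then take the
# argmax over words for each transition.
def _nearest_neighbor_demo(num_transitions=6, dim=50, vocab=10000, k=1000):
    trans = torch.randn(num_transitions, dim)   # one vector per transition
    emb = torch.randn(vocab, dim)               # toy vocab, one row per word
    sims = torch.mm(trans, emb[:k, :].t())      # (num_transitions, k) scores
    return torch.argmax(sims, dim=1)            # best word index per transition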
def interpret_documents(model, batch_size, dev_data, dev_text, ofile, max_doc_len):
    """
    For each dev document, computes leave-one-out pattern deltas (how much the
    non-predicted class gains when a single pattern's score is zeroed out) and
    writes the top patterns, their scores, and their highest-scoring spans to
    `ofile`.
    """
    j = 0
    with open(ofile, "w") as ofh:
        for batch_idx, chunk in enumerate(chunked(dev_data, batch_size)):
            batch = Batch([x for x, y in chunk], model.embeddings, model.to_cuda)
            res, scores = model.forward(batch, 1)
            print("ss", scores.size())
            output = softmax(res).data
            predictions = [int(x) for x in argmax(output)]
            num_patts = scores.size()[1]

            diffs = np.zeros((num_patts, batch.size()))

            # Traversing all patterns.
            for i in range(num_patts):
                # Copying scores data to a numpy array.
                scores_data = np.array(scores.data.numpy(), copy=True)
                # Zeroing out pattern number i across the batch.
                scores_data[:, i] = 0

                # Running mlp.forward() with zeroed-out scores.
                forwarded = softmax(
                    model.mlp.forward(Variable(
                        torch.FloatTensor(scores_data)))).data.numpy()

                # Computing the difference between forwarded scores and original scores.
                for k in range(batch.size()):
                    # diffs[i, k] = output[k, predictions[k]] - \
                    #     output[k, 1 - predictions[k]] - \
                    #     forwarded[k, predictions[k]] + \
                    #     forwarded[k, 1 - predictions[k]]
                    diffs[i, k] = forwarded[k, 1 - predictions[k]] - \
                        output[k, 1 - predictions[k]]

            # Now, traversing documents in the batch.
            for i in range(batch.size()):
                # Document string.
                text_str = str(" ".join(dev_text[j]).encode('utf-8'))[2:-1]

                # Top ten patterns with the largest differences between
                # leave-one-out score and original score.
                top_ten_deltas = sorted(enumerate(diffs[:, i]),
                                        key=lambda x: x[1],
                                        reverse=True)[:10]
                top_ten_neg_deltas = sorted(enumerate(diffs[:, i]),
                                            key=lambda x: x[1])[:10]

                # Top ten patterns with the largest overall score
                # (regardless of classification).
                top_ten_scores = sorted(enumerate(scores.data.numpy()[i, :]),
                                        key=lambda x: x[1],
                                        reverse=True)[:10]

                top_scoring_spans = get_top_scoring_spans_for_doc(
                    model, dev_data[j], max_doc_len)

                # Printing out everything.
                ofh.write(
                    "{} {} {} All in, predicted: {:>2,.3f} "
                    "All in, not-predicted: {:>2,.3f} "
                    "Leave one out: +res: {} -res: {} Patt scores: {}\n".format(
                        dev_data[j][1],
                        predictions[i],
                        text_str,
                        output[i, predictions[i]],
                        output[i, 1 - predictions[i]],
                        " ".join("{}:{:>2,.3f}".format(p, x)
                                 for (p, x) in top_ten_deltas),
                        " ".join("{}:{:>2,.3f}".format(p, x)
                                 for (p, x) in top_ten_neg_deltas),
                        " ".join("{}:{:>2,.3f}".format(p, x)
                                 for (p, x) in top_ten_scores)))

                # Best-matching span for each of the top positive-delta patterns.
                ofh.write("Top ten deltas:\n")
                for l in top_ten_deltas:
                    s = top_scoring_spans[l[0]].display(dev_text[j])
                    ofh.write(str(int(l[0])) + " " +
                              str(s.encode('utf-8'))[2:-1] + "\n")

                # Best-matching span for each of the top negative-delta patterns.
                ofh.write("Top ten negative deltas:\n")
                for l in top_ten_neg_deltas:
                    s = top_scoring_spans[l[0]].display(dev_text[j])
                    ofh.write(str(int(l[0])) + " " +
                              str(s.encode('utf-8'))[2:-1] + "\n")

                j += 1
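
# Standalone sketch of the leave-one-out ("occlusion") scoring used above,
# with a plain linear classifier standing in for this repo's model/Batch
# classes (every name below is made up for the demo, and binary labels are
# assumed to match the `1 - predictions` indexing above). Zeroing one feature
# column, re-running the classifier, and comparing the new softmax output to
# the original gives a per-feature delta: how much probability the
# non-predicted class gains when that feature is knocked out, which is what
# `diffs` stores per pattern.
def _leave_one_out_demo(classifier, feats, predictions):
    """feats: (batch, num_feats) FloatTensor; predictions: predicted label (0/1) per row."""
    import torch.nn.functional as F
    base = F.softmax(classifier(feats), dim=1).detach().numpy()
    deltas = np.zeros((feats.size(1), feats.size(0)))
    for f in range(feats.size(1)):
        occluded = feats.clone()
        occluded[:, f] = 0                       # knock out feature f across the batch
        out = F.softmax(classifier(occluded), dim=1).detach().numpy()
        for b in range(feats.size(0)):
            # Gain of the non-predicted class once feature f is removed.
            deltas[f, b] = out[b, 1 - predictions[b]] - base[b, 1 - predictions[b]]
    return deltas

# Example usage (binary classifier over 8 features, batch of 4):
#   clf = torch.nn.Linear(8, 2)
#   feats = torch.randn(4, 8)
#   preds = [int(p) for p in torch.argmax(clf(feats), dim=1)]
#   deltas = _leave_one_out_demo(clf, feats, preds)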