def get_word_translations(emb1, emb2, knn, softmax_temp=30.):
    """
    Given source and target word embeddings, produce the k-best target
    candidates for every source row.

    Candidates are ranked with the CSLS-style score
    ``2 * cos(x, y) - r(x) - r(y)`` where ``r`` is whatever
    ``get_nn_avg_dist`` returns for each side (presumably the mean similarity
    to the ``knn`` nearest neighbours - confirm against that helper).

    :param emb1: 2-D tensor of source embeddings, one row per word.
    :param emb2: 2-D tensor of target embeddings, one row per word.
    :param knn: number of neighbours used by ``get_nn_avg_dist`` and number of
        candidates kept per source word.
    :param softmax_temp: factor applied to the top-k scores before the softmax.
    :return: list with one ``(target_ids, probabilities)`` tuple per row of
        ``emb1``; both entries are 1-D tensors of length ``knn``.
    """
    # normalize word embeddings
    emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
    emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)

    # per-word neighbourhood terms; the helper's result is wrapped with
    # torch.from_numpy, so it is expected to be a numpy array
    average_dist1 = get_nn_avg_dist(emb2, emb1, knn)
    average_dist2 = get_nn_avg_dist(emb1, emb2, knn)
    average_dist1 = torch.from_numpy(average_dist1).type_as(emb1)
    average_dist2 = torch.from_numpy(average_dist2).type_as(emb2)

    top_k_match_ids = []
    step_size = 1000
    n_src = emb1.shape[0]
    for start in range(0, n_src, step_size):
        # Clamp the chunk end: slicing keeps the final, shorter chunk in
        # bounds when n_src is not a multiple of step_size.
        end = min(start + step_size, n_src)
        print('Processing word ids %d-%d...' % (start, end))

        # use the embeddings of the current word ids
        query = emb1[start:end]

        # calculate the scores with the contextual dissimilarity measure
        scores = query.mm(emb2.transpose(0, 1))
        scores.mul_(2)
        scores.sub_(average_dist1[start:end][:, None] + average_dist2[None, :])

        # topk returns a (values, indices) tuple sorted by decreasing score
        top_sim_scores, top_match_ids = scores.topk(knn, 1, True)
        top_sim_scores = F.softmax(softmax_temp * top_sim_scores, 1)

        # one (ids, probabilities) tuple per source row of the chunk
        top_k_match_ids.extend(zip(top_match_ids, top_sim_scores))

    return top_k_match_ids
def get_sent_translation_accuracy(data, labels, lg1, word2id1, emb1, lg2, word2id2, emb2,
                                  method, idf, test, device=2):
    """
    Given parallel sentences, retrieve for every ``lg1`` sentence the closest
    ``lg2`` sentences and, unless ``test`` is set, evaluate the sentence
    translation accuracy using the precision@k.

    Sentences are embedded with ``bow_idf``; query ``i`` is counted as correct
    when key ``i`` appears among its top-k retrieved keys.

    Side effect: the object returned by ``top2_scores`` /
    ``top2_scores_csls`` is pickled to the hard-coded path
    ``'fr-en.sample.scores'`` in the working directory.

    :param data: mapping from language code to a sequence of sentences;
        ``data[lg1]`` and ``data[lg2]`` are presumably aligned index by
        index - confirm with the caller.
    :param labels: unused by this function.
    :param lg1: query (source) language code.
    :param word2id1: word -> row index into ``emb1``.
    :param emb1: source word embedding tensor.
    :param lg2: key (target) language code.
    :param word2id2: word -> row index into ``emb2``.
    :param emb2: target word embedding tensor.
    :param method: ``'nn'`` or ``'csls_knn_<k>'``.
    :param idf: mapping from language code to the idf dictionary handed to
        ``bow_idf``.
    :param test: when truthy, precision@k is not computed and ``results`` is
        returned empty.
    :param device: torch device on which the keys and scores are placed.
    :return: ``(predictions, results)`` where ``predictions`` holds the index
        of the best key for each query and ``results`` is a list of
        ``('sent-precision_at_<k>', value)`` tuples.
    :raises Exception: if ``method`` is not recognised.
    """
    # get word vectors dictionaries
    emb1 = emb1.cpu().numpy()
    emb2 = emb2.cpu().numpy()
    word_vec1 = {w: emb1[word2id1[w]] for w in word2id1}
    word_vec2 = {w: emb2[word2id2[w]] for w in word2id2}
    word_vect = {lg1: word_vec1, lg2: word_vec2}
    lg_keys = lg2
    lg_query = lg1

    # every sentence of each side is used; tgt_keys doubles as the gold index
    src_keys = torch.arange(len(data[lg1]))
    tgt_keys = torch.arange(len(data[lg2]))
    keys = data[lg_keys]
    # bow_idf is unpacked as a pair here; only the second element (the
    # sentence embedding matrix) is needed
    _, keys = bow_idf(keys, word_vect[lg_keys], idf_dict=idf[lg_keys])

    queries = [data[lg_query][i.item()] for i in src_keys]
    _, queries = bow_idf(queries, word_vect[lg_query], idf_dict=idf[lg_query])

    # normalize embeddings
    queries = torch.from_numpy(queries).float()
    queries = queries / queries.norm(2, 1, keepdim=True).expand_as(queries)
    keys = torch.from_numpy(keys).float()
    keys = keys / keys.norm(2, 1, keepdim=True).expand_as(keys)
    keys = keys.to(device)

    # NOTE(review): the dense (n_queries x n_keys) score matrix below is
    # needed by the ranking code at the end of the function. It may be large;
    # presumably that is why the chunked top2_* helpers exist - confirm that
    # the full matrix fits on `device` for the intended data sizes.

    # nearest neighbors
    if method == 'nn':
        top2 = top2_scores(queries, keys, 1500, device=device)
        with open('fr-en.sample.scores', 'wb') as handle:
            pickle.dump(top2, handle)
        scores = keys.mm(queries.to(device).transpose(0, 1)).transpose(0, 1)
        scores = scores.cpu()

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        # average distances to k nearest neighbors
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        average_dist_keys = torch.from_numpy(
            get_nn_avg_dist(queries, keys, knn)).to(device)
        average_dist_queries = torch.from_numpy(
            get_nn_avg_dist(keys, queries, knn)).to(device)
        top2 = top2_scores_csls(queries, keys, 1000, average_dist_keys,
                                average_dist_queries, device=device)
        with open('fr-en.sample.scores', 'wb') as handle:
            pickle.dump(top2, handle)
        # scores
        scores = keys.mm(queries.to(device).transpose(0, 1)).transpose(0, 1)
        scores.mul_(2)
        scores.sub_(average_dist_queries[:, None].float() + average_dist_keys[None, :].float())
        scores = scores.cpu()

    else:
        raise Exception('Unknown method: "%s"' % method)

    results = []
    # never ask topk for more candidates than there are keys
    top_matches = scores.topk(min(10, scores.size(1)), 1, True)[1]
    predictions = top_matches[:, 0]
    if not test:
        for k in [1, 5, 10]:
            top_k_matches = (top_matches[:, :k] == tgt_keys[:, None]).sum(1)
            precision_at_k = 100 * top_k_matches.float().numpy().mean()
            logger.info("%i queries (%s) - %s - Precision at k = %i: %f" %
                        (len(top_k_matches), lg_query.upper(), method, k, precision_at_k))
            results.append(('sent-precision_at_%i' % k, precision_at_k))

    return predictions, results
def get_sent_translation_accuracy(data, lg1, word2id1, emb1, lg2, word2id2, emb2,
                                  n_keys, n_queries, method, idf):
    """
    Given parallel sentences, evaluate the sentence translation accuracy
    using the precision@k.

    The first ``n_keys`` sentences of ``lg2`` are the retrieval keys.
    ``n_queries`` distinct indices are drawn from ``range(n_keys)`` with a
    fixed seed (1234) and the ``lg1`` sentences at those indices are the
    queries. A query counts as correct at k when its own index is among its
    k best-scoring keys.

    :param data: mapping from language code to the sentences of that language;
        ``data[lg1]`` is indexed with an integer array, so it is presumably a
        numpy array - confirm with the caller.
    :param lg1: query (source) language code.
    :param word2id1: word -> row index into ``emb1``.
    :param emb1: source word embedding tensor.
    :param lg2: key (target) language code.
    :param word2id2: word -> row index into ``emb2``.
    :param emb2: target word embedding tensor.
    :param n_keys: number of key sentences.
    :param n_queries: number of query sentences (at most ``n_keys``, since
        sampling is without replacement).
    :param method: ``'nn'``, ``'invsm_beta_<beta>'`` or ``'csls_knn_<k>'``.
    :param idf: mapping from language code to the idf dictionary handed to
        ``bow_idf``.
    :return: list of ``('sent-precision_at_<k>', value)`` tuples for
        k in 1, 5, 10, with ``value`` a Python float in [0, 100].
    :raises Exception: if ``method`` is not recognised.
    """
    # get word vectors dictionaries
    emb1 = emb1.cpu().numpy()
    emb2 = emb2.cpu().numpy()
    word_vec1 = {w: emb1[word2id1[w]] for w in word2id1}
    word_vec2 = {w: emb2[word2id2[w]] for w in word2id2}
    word_vect = {lg1: word_vec1, lg2: word_vec2}
    lg_keys = lg2
    lg_query = lg1

    # get n_keys pairs of sentences
    keys = data[lg_keys][:n_keys]
    keys = bow_idf(keys, word_vect[lg_keys], idf_dict=idf[lg_keys])

    # get n_queries query pairs from these n_keys pairs
    # (fixed seed so that the sampled queries are reproducible)
    rng = np.random.RandomState(1234)
    idx_query = rng.choice(range(n_keys), size=n_queries, replace=False)
    queries = data[lg_query][idx_query]
    queries = bow_idf(queries, word_vect[lg_query], idf_dict=idf[lg_query])

    # normalize embeddings
    queries = torch.from_numpy(queries).float()
    queries = queries / queries.norm(2, 1, keepdim=True).expand_as(queries)
    keys = torch.from_numpy(keys).float()
    keys = keys / keys.norm(2, 1, keepdim=True).expand_as(keys)

    # nearest neighbors
    if method == 'nn':
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores = scores.cpu()

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(beta).exp_()
        # normalise each key column over all queries
        scores.div_(scores.sum(0, keepdim=True).expand_as(scores))
        scores = scores.cpu()

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        # average distances to k nearest neighbors
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        average_dist_keys = torch.from_numpy(get_nn_avg_dist(queries, keys, knn))
        average_dist_queries = torch.from_numpy(get_nn_avg_dist(keys, queries, knn))
        # scores
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(2)
        scores.sub_(average_dist_queries[:, None].float() + average_dist_keys[None, :].float())
        scores = scores.cpu()

    else:
        # fail with an explicit message instead of an unbound `scores` below
        raise Exception('Unknown method: "%s"' % method)

    results = []
    # never ask topk for more candidates than there are keys
    top_matches = scores.topk(min(10, scores.size(1)), 1, True)[1]
    gold = torch.from_numpy(idx_query)[:, None]
    for k in [1, 5, 10]:
        top_k_matches = (top_matches[:, :k] == gold).sum(1)
        # .item() gives a plain Python float, independent of torch/numpy interop
        precision_at_k = 100 * top_k_matches.float().mean().item()
        logger.info("%i queries (%s) - %s - Precision at k = %i: %f" %
                    (len(top_k_matches), lg_query.upper(), method, k, precision_at_k))
        results.append(('sent-precision_at_%i' % k, precision_at_k))

    return results
def get_word_translation_accuracy(dico, word2id1, emb1, word2id2, emb2, method):
    """
    Given source and target word embeddings, and a dictionary,
    evaluate the translation accuracy using the precision@k.

    A source word counts as correctly translated at k when at least one of
    its dictionary targets is among its k best-scoring target rows, so a
    source word with several reference translations is counted once.

    :param dico: 2-D integer tensor of ``(source_id, target_id)`` pairs.
    :param word2id1: unused by this function.
    :param emb1: 2-D tensor of source embeddings.
    :param word2id2: unused by this function.
    :param emb2: 2-D tensor of target embeddings.
    :param method: ``'nn'``, ``'invsm_beta_<beta>'`` or ``'csls_knn_<k>'``.
    :return: list of ``('precision_at_<k>', value)`` tuples for k in 1, 5, 10,
        with ``value`` a percentage over the distinct source ids of ``dico``.
    :raises Exception: if ``method`` is not recognised.
    """
    dico = dico.cuda() if emb1.is_cuda else dico

    assert dico[:, 0].max() < emb1.size(0)
    assert dico[:, 1].max() < emb2.size(0)

    # normalize word embeddings
    emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
    emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)

    # nearest neighbors
    if method == 'nn':
        query = emb1[dico[:, 0]]
        scores = query.mm(emb2.transpose(0, 1))

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        bs = 128
        word_scores = []
        # process the target vocabulary in column blocks of `bs` words
        for i in range(0, emb2.size(0), bs):
            scores = emb1.mm(emb2[i:i + bs].transpose(0, 1))
            scores.mul_(beta).exp_()
            # normalise each target column over all source words
            scores.div_(scores.sum(0, keepdim=True).expand_as(scores))
            word_scores.append(scores.index_select(0, dico[:, 0]))
        scores = torch.cat(word_scores, 1)

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        # average distances to k nearest neighbors
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        average_dist1 = get_nn_avg_dist(emb2, emb1, knn)
        average_dist2 = get_nn_avg_dist(emb1, emb2, knn)
        average_dist1 = torch.from_numpy(average_dist1).type_as(emb1)
        average_dist2 = torch.from_numpy(average_dist2).type_as(emb2)
        # queries / scores
        query = emb1[dico[:, 0]]
        scores = query.mm(emb2.transpose(0, 1))
        scores.mul_(2)
        scores.sub_(average_dist1[dico[:, 0]][:, None] + average_dist2[None, :])

    else:
        raise Exception('Unknown method: "%s"' % method)

    results = []
    # never ask topk for more candidates than there are target rows
    top_matches = scores.topk(min(100, scores.size(1)), 1, True)[1]

    # Plain Python ints are required as dictionary keys below: 0-dim tensors
    # hash by identity, so equal source ids would never share an entry.
    src_ids = dico[:, 0].tolist()
    gold = dico[:, 1][:, None]

    for k in [1, 5, 10]:
        top_k_matches = top_matches[:, :k]
        hits = (top_k_matches == gold.expand_as(top_k_matches)).sum(1).tolist()
        # allow for multiple possible translations
        matching = {}
        for src_id, hit in zip(src_ids, hits):
            matching[src_id] = min(matching.get(src_id, 0) + hit, 1)
        # evaluate precision@k
        precision_at_k = 100 * np.mean(list(matching.values()))
        print("%i source words - %s - Precision at k = %i: %f" %
              (len(matching), method, k, precision_at_k))
        results.append(('precision_at_%i' % k, precision_at_k))

    return results
default="_", help= "Replace phrase word delimiter by empty space (empty string to disable)") params = parser.parse_args() # read embeddings print("Loading embeddings ...") src_dico, src_emb = load_embeddings(params, source=True) tgt_dico, tgt_emb = load_embeddings(params, source=False) n_src = src_emb.size(0) n_tgt = tgt_emb.size(0) print("Loaded %i / %i source / target embeddings." % (n_src, n_tgt)) # use CSLS print("Computing average distance ...") src_avg_dist = get_nn_avg_dist(emb=tgt_emb, query=src_emb, knn=10) if params.csls else None tgt_avg_dist = get_nn_avg_dist(emb=src_emb, query=tgt_emb, knn=10) if params.csls else None # get translations print("Generating translations ...") s2t_translations = get_translations(src_emb, tgt_emb, src_avg_dist, tgt_avg_dist, N_TRANSLATE) if params.inverse_score: t2s_translations = get_translations(tgt_emb, src_emb, tgt_avg_dist, src_avg_dist, N_TRANSLATE) # get scores print("Generating scores ...") s2t_scores = get_s2t_scores(src_emb, tgt_emb, s2t_translations, params.temperature)