def csls_knn_10_score(emb_trans, emb_tgt, dico):
    """Compute CSLS scores (k=10) of translated source embeddings against
    all target embeddings, for the source words listed in `dico[:, 0]`.

    dico[:, 0] indexes the source domain, dico[:, 1] the target domain.
    Returns a (len(dico), n_tgt) score matrix:
        2 * cos(query, tgt) - r_src(query) - r_tgt(tgt)
    where r_* are the mean cosine similarities to the 10 nearest neighbors
    in the other domain (the CSLS hubness penalty).
    """
    # L2-normalize rows, then work in float32 (required by the FAISS-backed
    # nearest-neighbor helper).
    src = emb_trans / np.linalg.norm(emb_trans, ord=2, axis=1, keepdims=True)
    tgt = emb_tgt / np.linalg.norm(emb_tgt, ord=2, axis=1, keepdims=True)
    src = src.astype('float32')
    tgt = tgt.astype('float32')

    # Average similarity to the 10 nearest cross-domain neighbors,
    # one value per source word / per target word respectively.
    r_src = get_nn_avg_dist(emb=tgt, query=src, knn=10)
    r_tgt = get_nn_avg_dist(emb=src, query=tgt, knn=10)

    # Score only the source words that appear in the dictionary.
    src_ids = dico[:, 0]
    queries = src[src_ids]
    scores = 2 * queries.dot(tgt.T)          # (len(dico), n_tgt)
    scores = scores - r_src[src_ids][:, None]  # per-query penalty, column vector
    scores = scores - r_tgt[None, :]           # per-target penalty, row vector
    return scores
def get_sent_translation_accuracy(data, lg1, word2id1, emb1, lg2, word2id2,
                                  emb2, n_keys, n_queries, method, idf):
    """
    Given parallel sentences from Europarl, evaluate the sentence
    translation accuracy using the precision@k.

    Parameters:
        data: dict mapping language -> array of sentences (parallel corpus)
        lg1 / lg2: query / key language identifiers
        word2id1, emb1 / word2id2, emb2: vocab and embeddings per language
        n_keys: number of key sentences; n_queries: number of query sentences
        method: 'nn', 'invsm_beta_<b>' or 'csls_knn_<k>'
        idf: dict mapping language -> idf dictionary for bow_idf

    Returns: list of ('sent-precision_at_k', value) tuples for k in {1, 5, 10}.
    Raises: Exception for an unknown `method`.
    """
    # get word vectors dictionaries
    emb1 = emb1.cpu().numpy()
    emb2 = emb2.cpu().numpy()
    word_vec1 = dict([(w, emb1[word2id1[w]]) for w in word2id1])
    word_vec2 = dict([(w, emb2[word2id2[w]]) for w in word2id2])
    word_vect = {lg1: word_vec1, lg2: word_vec2}
    lg_keys = lg2
    lg_query = lg1

    # get n_keys pairs of sentences, encoded as idf-weighted bags of words
    keys = data[lg_keys][:n_keys]
    keys = bow_idf(keys, word_vect[lg_keys], idf_dict=idf[lg_keys])

    # get n_queries query pairs from these n_keys pairs
    # (fixed seed so the evaluation subset is reproducible across runs)
    rng = np.random.RandomState(1234)
    idx_query = rng.choice(range(n_keys), size=n_queries, replace=False)
    queries = data[lg_query][idx_query]
    queries = bow_idf(queries, word_vect[lg_query], idf_dict=idf[lg_query])

    # normalize embeddings
    queries = torch.from_numpy(queries).float()
    queries = queries / queries.norm(2, 1, keepdim=True).expand_as(queries)
    keys = torch.from_numpy(keys).float()
    keys = keys / keys.norm(2, 1, keepdim=True).expand_as(keys)

    # nearest neighbors
    if method == 'nn':
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores = scores.cpu()

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(beta).exp_()
        scores.div_(scores.sum(0, keepdim=True).expand_as(scores))
        scores = scores.cpu()

    # contextual dissimilarity measure (CSLS)
    elif method.startswith('csls_knn_'):
        # parse k once (the original duplicated this parsing block)
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        # average distances to k nearest neighbors
        average_dist_keys = torch.from_numpy(
            get_nn_avg_dist(queries, keys, knn))
        average_dist_queries = torch.from_numpy(
            get_nn_avg_dist(keys, queries, knn))
        # scores = 2 * cos - hubness penalties
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(2)
        scores.sub_(average_dist_queries[:, None].float() +
                    average_dist_keys[None, :].float())
        scores = scores.cpu()

    else:
        # fail loudly instead of a confusing NameError on `scores` below
        raise Exception('Unknown method: "%s"' % method)

    results = []
    top_matches = scores.topk(10, 1, True)[1]
    for k in [1, 5, 10]:
        # query i is correct at k if its own key index is in the top-k matches
        top_k_matches = (
            top_matches[:, :k] == torch.from_numpy(idx_query)[:, None]).sum(1)
        precision_at_k = 100 * np.mean(top_k_matches.float().numpy())
        logger.info(
            "%i queries (%s) - %s - Precision at k = %i: %f" %
            (len(top_k_matches), lg_query.upper(), method, k, precision_at_k))
        results.append(('sent-precision_at_%i' % k, precision_at_k))
    return results
def get_candidates(emb1, emb2, params):
    """
    Get best translation pairs candidates.
    """
    # Process source words in batches to bound memory usage.
    bs = 128

    all_scores = []   # per-batch top-2 scores, concatenated below
    all_targets = []  # per-batch top-2 target indices, concatenated below

    # number of source words to consider
    n_src = emb1.size(0)
    # NOTE(review): inverted softmax scores over all source words at once,
    # so the rank cap is only applied for the other methods here.
    if params.dico_max_rank > 0 and not params.dico_method.startswith(
            'invsm_beta_'):
        n_src = params.dico_max_rank

    # nearest neighbors
    if params.dico_method == 'nn':

        # for every source word
        for i in range(0, n_src, bs):

            # compute target words scores
            scores = emb2.mm(emb1[i:min(n_src, i + bs)].transpose(
                0, 1)).transpose(0, 1)
            # keep the two best targets per source word; the score gap between
            # them is used later as a confidence measure
            best_scores, best_targets = scores.topk(2,
                                                    dim=1,
                                                    largest=True,
                                                    sorted=True)

            # update scores / potential targets
            all_scores.append(best_scores.cpu())
            all_targets.append(best_targets.cpu())

        all_scores = torch.cat(all_scores, 0)
        all_targets = torch.cat(all_targets, 0)

    # inverted softmax
    elif params.dico_method.startswith('invsm_beta_'):

        beta = float(params.dico_method[len('invsm_beta_'):])

        # for every target word
        for i in range(0, emb2.size(0), bs):

            # compute source words scores
            scores = emb1.mm(emb2[i:i + bs].transpose(0, 1))
            scores.mul_(beta).exp_()
            # softmax over the source dimension (hence "inverted")
            scores.div_(scores.sum(0, keepdim=True).expand_as(scores))

            best_scores, best_targets = scores.topk(2,
                                                    dim=1,
                                                    largest=True,
                                                    sorted=True)

            # update scores / potential targets
            all_scores.append(best_scores.cpu())
            # offset batch-local target indices back to global ones
            all_targets.append((best_targets + i).cpu())

        # batches were over targets, so concatenate along dim 1 and take a
        # second top-2 across all target batches
        all_scores = torch.cat(all_scores, 1)
        all_targets = torch.cat(all_targets, 1)

        all_scores, best_targets = all_scores.topk(2,
                                                   dim=1,
                                                   largest=True,
                                                   sorted=True)
        all_targets = all_targets.gather(1, best_targets)

    # contextual dissimilarity measure
    elif params.dico_method.startswith('csls_knn_'):

        knn = params.dico_method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)

        # average distances to k nearest neighbors (CSLS hubness penalties)
        average_dist1 = torch.from_numpy(get_nn_avg_dist(emb2, emb1, knn))
        average_dist2 = torch.from_numpy(get_nn_avg_dist(emb1, emb2, knn))
        average_dist1 = average_dist1.type_as(emb1)
        average_dist2 = average_dist2.type_as(emb2)

        # for every source word
        for i in range(0, n_src, bs):

            # compute target words scores: 2*cos - r_src - r_tgt
            scores = emb2.mm(emb1[i:min(n_src, i + bs)].transpose(
                0, 1)).transpose(0, 1)
            scores.mul_(2)
            scores.sub_(average_dist1[i:min(n_src, i + bs)][:, None] +
                        average_dist2[None, :])

            best_scores, best_targets = scores.topk(2,
                                                    dim=1,
                                                    largest=True,
                                                    sorted=True)

            # update scores / potential targets
            all_scores.append(best_scores.cpu())
            all_targets.append(best_targets.cpu())

        all_scores = torch.cat(all_scores, 0)
        all_targets = torch.cat(all_targets, 0)

    # pair each source index with its best target index
    all_pairs = torch.cat([
        torch.arange(0, all_targets.size(0)).long().unsqueeze(1),
        all_targets[:, 0].unsqueeze(1)
    ], 1)

    # sanity check
    assert all_scores.size() == all_pairs.size() == (n_src, 2)

    # sort pairs by score confidence (gap between best and second-best score)
    diff = all_scores[:, 0] - all_scores[:, 1]
    reordered = diff.sort(0, descending=True)[1]
    all_scores = all_scores[reordered]
    all_pairs = all_pairs[reordered]

    # max dico words rank: drop pairs involving a word beyond the rank cap
    if params.dico_max_rank > 0:
        selected = all_pairs.max(1)[0] <= params.dico_max_rank
        mask = selected.unsqueeze(1).expand_as(all_scores).clone()
        all_scores = all_scores.masked_select(mask).view(-1, 2)
        all_pairs = all_pairs.masked_select(mask).view(-1, 2)

    # max dico size: keep only the most confident pairs
    if params.dico_max_size > 0:
        all_scores = all_scores[:params.dico_max_size]
        all_pairs = all_pairs[:params.dico_max_size]

    # min dico size: force the first dico_min_size pairs past the
    # confidence threshold below by inflating their confidence
    diff = all_scores[:, 0] - all_scores[:, 1]
    if params.dico_min_size > 0:
        diff[:params.dico_min_size] = 1e9

    # confidence threshold
    if params.dico_threshold > 0:
        mask = diff > params.dico_threshold
        logger.info("Selected %i / %i pairs above the confidence threshold." %
                    (mask.sum(), diff.size(0)))
        mask = mask.unsqueeze(1).expand_as(all_pairs).clone()
        all_pairs = all_pairs.masked_select(mask).view(-1, 2)

    return all_pairs
def get_candidates(emb1, emb2, args):
    """
    Get best translation pair candidates between emb1 (source rows) and
    emb2 (target rows).

    Supported args.dico_method values: 'nn' and 'csls_knn_<k>'.
    Returns an (n_pairs, 2) LongTensor of [source_index, target_index]
    pairs, sorted by confidence and filtered by args.dico_max_rank,
    args.dico_max_size, args.dico_min_size and args.dico_threshold.
    Raises: Exception for an unsupported args.dico_method.
    """
    bs = 128
    all_scores = []
    all_targets = []
    n_src = emb1.size(0)

    if args.dico_method == 'nn':
        for i in range(0, n_src, bs):
            scores = emb2.mm(emb1[i:min(n_src, i + bs)]
                             .transpose(0, 1)).transpose(0, 1)
            # keep top-2 so the best/second-best gap can serve as confidence
            best_scores, best_targets = scores.topk(2, dim=1,
                                                    largest=True, sorted=True)
            all_scores.append(best_scores.cpu())
            all_targets.append(best_targets.cpu())
        # all_scores: for each emb1 vocab, contains top-2 "close" word's scores
        all_scores = torch.cat(all_scores, 0)
        # all_targets:
        # for each emb1 vocab, contains top-2 "close" word's indices
        all_targets = torch.cat(all_targets, 0)

    elif args.dico_method.startswith('csls_knn_'):
        knn = args.dico_method[len('csls_knn_'):]
        # validate before int() — consistent with the sibling implementation
        assert knn.isdigit()
        knn = int(knn)
        # CSLS hubness penalties: mean similarity to k cross-domain neighbors
        ave_dist1 = torch.from_numpy(get_nn_avg_dist(emb2, emb1, knn))
        ave_dist2 = torch.from_numpy(get_nn_avg_dist(emb1, emb2, knn))
        ave_dist1 = ave_dist1.type_as(emb1)
        ave_dist2 = ave_dist2.type_as(emb2)
        for i in range(0, n_src, bs):
            scores = emb2.mm((emb1[i:min(n_src, i + bs)])
                             .transpose(0, 1)).transpose(0, 1)
            # CSLS score: 2*cos - r_src - r_tgt (in place, batch-wise)
            scores.mul_(2)
            scores.sub_(ave_dist1[i:min(n_src, i + bs)][:, None] +
                        ave_dist2[None, :])
            best_scores, best_targets =\
                scores.topk(2, dim=1, largest=True, sorted=True)
            all_scores.append(best_scores.cpu())
            all_targets.append(best_targets.cpu())
        all_scores = torch.cat(all_scores, 0)
        all_targets = torch.cat(all_targets, 0)

    else:
        # previously an unsupported method fell through to torch.cat([])
        # with an opaque error; fail explicitly instead
        raise Exception('Unknown method: "%s"' % args.dico_method)

    # pair each source index with its best target index
    all_pairs = torch.cat([
        torch.arange(0, all_targets.size(0)).long().unsqueeze(1),
        all_targets[:, 0].unsqueeze(1)
    ], 1)
    assert all_scores.size() == all_pairs.size() == (n_src, 2)

    # sort pairs by score confidence (best minus second-best score)
    diff = all_scores[:, 0] - all_scores[:, 1]
    reordered = diff.sort(0, descending=True)[1]
    all_scores = all_scores[reordered]
    all_pairs = all_pairs[reordered]

    # drop pairs involving a word beyond the rank cap
    if args.dico_max_rank > 0:
        selected = all_pairs.max(1)[0] <= args.dico_max_rank
        mask = selected.unsqueeze(1).expand_as(all_scores).clone()
        all_scores = all_scores.masked_select(mask).view(-1, 2)
        all_pairs = all_pairs.masked_select(mask).view(-1, 2)

    # keep only the most confident pairs
    if args.dico_max_size > 0:
        all_scores = all_scores[:args.dico_max_size]
        all_pairs = all_pairs[:args.dico_max_size]

    # force the first dico_min_size pairs past the threshold below
    diff = all_scores[:, 0] - all_scores[:, 1]
    if args.dico_min_size > 0:
        diff[:args.dico_min_size] = 1e9

    # confidence threshold
    if args.dico_threshold > 0:
        mask = diff > args.dico_threshold
        mask = mask.unsqueeze(1).expand_as(all_pairs).clone()
        all_pairs = all_pairs.masked_select(mask).view(-1, 2)

    return all_pairs
def get_word_translation_accuracy(lang1, word2id1, emb1, lang2, word2id2,
                                  emb2, method):
    """
    Given source and target word embeddings, and a dictionary,
    evaluate the translation accuracy using the precision@k.

    Parameters:
        lang1, word2id1, emb1: source language id, vocab and embeddings
        lang2, word2id2, emb2: target language id, vocab and embeddings
        method: 'nn', 'invsm_beta_<b>' or 'csls_knn_<k>'

    Returns: list of ('precision_at_k', value) tuples for k in {1, 5, 10}.
    Raises: Exception for an unknown `method`.
    """
    path = os.path.join(DIC_EVAL_PATH, '%s-%s.txt' % (lang1, lang2))
    dico = load_dictionary(path, word2id1, word2id2)
    dico = dico.cuda() if emb1.is_cuda else dico

    assert dico[:, 0].max() < emb1.size(0)
    assert dico[:, 1].max() < emb2.size(0)

    # normalize word embeddings
    emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
    emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)

    # nearest neighbors
    if method == 'nn':
        query = emb1[dico[:, 0]]
        scores = query.mm(emb2.transpose(0, 1))

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        bs = 128
        word_scores = []
        # batch over targets; softmax is over the full source dimension
        for i in range(0, emb2.size(0), bs):
            scores = emb1.mm(emb2[i:i + bs].transpose(0, 1))
            scores.mul_(beta).exp_()
            scores.div_(scores.sum(0, keepdim=True).expand_as(scores))
            word_scores.append(scores.index_select(0, dico[:, 0]))
        scores = torch.cat(word_scores, 1)

    # contextual dissimilarity measure (CSLS)
    elif method.startswith('csls_knn_'):
        # average distances to k nearest neighbors
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        average_dist1 = get_nn_avg_dist(emb2, emb1, knn)
        average_dist2 = get_nn_avg_dist(emb1, emb2, knn)
        average_dist1 = torch.from_numpy(average_dist1).type_as(emb1)
        average_dist2 = torch.from_numpy(average_dist2).type_as(emb2)
        # queries / scores: 2*cos - r_src - r_tgt
        query = emb1[dico[:, 0]]
        scores = query.mm(emb2.transpose(0, 1))
        scores.mul_(2)
        scores.sub_(average_dist1[dico[:, 0]][:, None] +
                    average_dist2[None, :])

    else:
        raise Exception('Unknown method: "%s"' % method)

    results = []
    top_matches = scores.topk(100, 1, True)[1]
    for k in [1, 5, 10]:
        top_k_matches = top_matches[:, :k]
        _matching = (top_k_matches == dico[:, 1][:, None].expand_as(
            top_k_matches)).sum(1).cpu().numpy()
        # allow for multiple possible translations: a source word counts as
        # correct if ANY of its reference translations appears in the top-k.
        # BUGFIX: iterate plain ints, not tensor elements — torch tensors hash
        # by identity, so tensor keys never collide in `matching` and the
        # per-source-word deduplication silently did nothing.
        matching = {}
        for i, src_id in enumerate(dico[:, 0].cpu().numpy()):
            matching[src_id] = min(matching.get(src_id, 0) + _matching[i], 1)

        # evaluate precision@k
        precision_at_k = 100 * np.mean(list(matching.values()))
        logger.info("%i source words - %s - Precision at k = %i: %f" %
                    (len(matching), method, k, precision_at_k))
        results.append(('precision_at_%i' % k, precision_at_k))

    return results