Example #1
def csls_knn_10_score(emb_trans, emb_tgt, dico):
    emb_trans = emb_trans / np.linalg.norm(
        emb_trans, ord=2, axis=1, keepdims=True)
    emb_tgt = emb_tgt / np.linalg.norm(emb_tgt, ord=2, axis=1, keepdims=True)
    emb_trans = emb_trans.astype('float32')
    emb_tgt = emb_tgt.astype('float32')
    # compute CSLS scores directly with k = 10 nearest neighbors
    average_dist1 = get_nn_avg_dist(emb=emb_tgt, query=emb_trans,
                                    knn=10)  #(200000,)
    average_dist2 = get_nn_avg_dist(emb=emb_trans, query=emb_tgt,
                                    knn=10)  #(200000,)

    # dico[:, 0] holds source-domain indices, dico[:, 1] holds target-domain indices
    query = emb_trans[dico[:, 0]]
    scores = 2 * query.dot(emb_tgt.T)             # (2975, 200000)
    scores -= average_dist1[dico[:, 0]][:, None]  # broadcast shape: (2975, 1)
    scores -= average_dist2[None, :]              # broadcast shape: (1, 200000)

    return scores
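Every example here relies on get_nn_avg_dist, which is never shown: for each query vector it returns the mean cosine similarity to its knn nearest neighbors in emb (the r_T / r_S penalty terms of CSLS, where CSLS(x, y) = 2·cos(x, y) − r_T(x) − r_S(y)). Below is a minimal NumPy sketch of that behaviour matching the call pattern of Example #1; the MUSE library also ships a faiss-backed version, and the torch-based examples further down pass tensors and expect a NumPy array back (hence the torch.from_numpy wrappers), so treat this only as an illustration.

import numpy as np

def get_nn_avg_dist(emb, query, knn):
    # Assumes the rows of `emb` and `query` are L2-normalized, so a dot
    # product equals the cosine similarity.
    sims = query.dot(emb.T)                             # (n_query, n_emb)
    top_k = np.partition(sims, -knn, axis=1)[:, -knn:]  # knn largest sims per row
    return top_k.mean(axis=1)                           # (n_query,)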
Example #2
def get_sent_translation_accuracy(data, lg1, word2id1, emb1, lg2, word2id2,
                                  emb2, n_keys, n_queries, method, idf):
    """
    Given parallel sentences from Europarl, evaluate the
    sentence translation accuracy using the precision@k.
    """
    # get word vectors dictionaries
    emb1 = emb1.cpu().numpy()
    emb2 = emb2.cpu().numpy()
    word_vec1 = dict([(w, emb1[word2id1[w]]) for w in word2id1])
    word_vec2 = dict([(w, emb2[word2id2[w]]) for w in word2id2])
    word_vect = {lg1: word_vec1, lg2: word_vec2}
    lg_keys = lg2
    lg_query = lg1

    # get n_keys pairs of sentences
    keys = data[lg_keys][:n_keys]
    keys = bow_idf(keys, word_vect[lg_keys], idf_dict=idf[lg_keys])

    # get n_queries query pairs from these n_keys pairs
    rng = np.random.RandomState(1234)
    idx_query = rng.choice(range(n_keys), size=n_queries, replace=False)
    queries = data[lg_query][idx_query]
    queries = bow_idf(queries, word_vect[lg_query], idf_dict=idf[lg_query])

    # normalize embeddings
    queries = torch.from_numpy(queries).float()
    queries = queries / queries.norm(2, 1, keepdim=True).expand_as(queries)
    keys = torch.from_numpy(keys).float()
    keys = keys / keys.norm(2, 1, keepdim=True).expand_as(keys)

    # nearest neighbors
    if method == 'nn':
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores = scores.cpu()

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(beta).exp_()
        scores.div_(scores.sum(0, keepdim=True).expand_as(scores))
        scores = scores.cpu()

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        # average distances to k nearest neighbors
        average_dist_keys = torch.from_numpy(
            get_nn_avg_dist(queries, keys, knn))
        average_dist_queries = torch.from_numpy(
            get_nn_avg_dist(keys, queries, knn))
        # scores
        scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1)
        scores.mul_(2)
        scores.sub_(average_dist_queries[:, None].float() +
                    average_dist_keys[None, :].float())
        scores = scores.cpu()

    results = []
    top_matches = scores.topk(10, 1, True)[1]
    for k in [1, 5, 10]:
        top_k_matches = (
            top_matches[:, :k] == torch.from_numpy(idx_query)[:, None]).sum(1)
        precision_at_k = 100 * top_k_matches.float().mean().item()
        logger.info(
            "%i queries (%s) - %s - Precision at k = %i: %f" %
            (len(top_k_matches), lg_query.upper(), method, k, precision_at_k))
        results.append(('sent-precision_at_%i' % k, precision_at_k))

    return results
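Example #2 also depends on bow_idf, which turns each sentence into a single vector built from the word vectors. The sketch below is a rough guess at that helper, assuming it returns the IDF-weighted average of the in-vocabulary word vectors (uniform weights when no idf_dict is given); the actual MUSE implementation may differ in details such as OOV handling.

import numpy as np

def bow_idf(sentences, word_vec, idf_dict=None):
    # Hypothetical sketch: sentence -> (IDF-weighted) mean of its word vectors.
    dim = len(next(iter(word_vec.values())))
    out = []
    for sent in sentences:
        words = [w for w in sent if w in word_vec]
        if not words:
            out.append(np.zeros(dim, dtype='float32'))
            continue
        weights = np.array([idf_dict.get(w, 1.0) if idf_dict else 1.0 for w in words])
        vecs = np.stack([word_vec[w] for w in words])
        out.append((weights[:, None] * vecs).sum(0) / weights.sum())
    return np.vstack(out).astype('float32')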
Example #3
def get_candidates(emb1, emb2, params):
    """
    Get best translation pairs candidates.
    """
    bs = 128

    all_scores = []
    all_targets = []

    # number of source words to consider
    n_src = emb1.size(0)
    if params.dico_max_rank > 0 and not params.dico_method.startswith(
            'invsm_beta_'):
        n_src = params.dico_max_rank

    # nearest neighbors
    if params.dico_method == 'nn':

        # for every source word
        for i in range(0, n_src, bs):

            # compute target words scores
            scores = emb2.mm(emb1[i:min(n_src, i + bs)].transpose(
                0, 1)).transpose(0, 1)
            best_scores, best_targets = scores.topk(2,
                                                    dim=1,
                                                    largest=True,
                                                    sorted=True)

            # update scores / potential targets
            all_scores.append(best_scores.cpu())
            all_targets.append(best_targets.cpu())

        all_scores = torch.cat(all_scores, 0)
        all_targets = torch.cat(all_targets, 0)

    # inverted softmax
    elif params.dico_method.startswith('invsm_beta_'):

        beta = float(params.dico_method[len('invsm_beta_'):])

        # for every target word
        for i in range(0, emb2.size(0), bs):

            # compute source words scores
            scores = emb1.mm(emb2[i:i + bs].transpose(0, 1))
            scores.mul_(beta).exp_()
            scores.div_(scores.sum(0, keepdim=True).expand_as(scores))

            best_scores, best_targets = scores.topk(2,
                                                    dim=1,
                                                    largest=True,
                                                    sorted=True)

            # update scores / potential targets
            all_scores.append(best_scores.cpu())
            all_targets.append((best_targets + i).cpu())

        all_scores = torch.cat(all_scores, 1)
        all_targets = torch.cat(all_targets, 1)

        all_scores, best_targets = all_scores.topk(2,
                                                   dim=1,
                                                   largest=True,
                                                   sorted=True)
        all_targets = all_targets.gather(1, best_targets)

    # contextual dissimilarity measure
    elif params.dico_method.startswith('csls_knn_'):

        knn = params.dico_method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)

        # average distances to k nearest neighbors
        average_dist1 = torch.from_numpy(get_nn_avg_dist(emb2, emb1, knn))
        average_dist2 = torch.from_numpy(get_nn_avg_dist(emb1, emb2, knn))
        average_dist1 = average_dist1.type_as(emb1)
        average_dist2 = average_dist2.type_as(emb2)

        # for every source word
        for i in range(0, n_src, bs):

            # compute target words scores
            scores = emb2.mm(emb1[i:min(n_src, i + bs)].transpose(
                0, 1)).transpose(0, 1)
            scores.mul_(2)
            scores.sub_(average_dist1[i:min(n_src, i + bs)][:, None] +
                        average_dist2[None, :])
            best_scores, best_targets = scores.topk(2,
                                                    dim=1,
                                                    largest=True,
                                                    sorted=True)

            # update scores / potential targets
            all_scores.append(best_scores.cpu())
            all_targets.append(best_targets.cpu())

        all_scores = torch.cat(all_scores, 0)
        all_targets = torch.cat(all_targets, 0)

    all_pairs = torch.cat([
        torch.arange(0, all_targets.size(0)).long().unsqueeze(1),
        all_targets[:, 0].unsqueeze(1)
    ], 1)

    # sanity check
    assert all_scores.size() == all_pairs.size() == (n_src, 2)

    # sort pairs by score confidence
    diff = all_scores[:, 0] - all_scores[:, 1]
    reordered = diff.sort(0, descending=True)[1]
    all_scores = all_scores[reordered]
    all_pairs = all_pairs[reordered]

    # max dico words rank
    if params.dico_max_rank > 0:
        selected = all_pairs.max(1)[0] <= params.dico_max_rank
        mask = selected.unsqueeze(1).expand_as(all_scores).clone()
        all_scores = all_scores.masked_select(mask).view(-1, 2)
        all_pairs = all_pairs.masked_select(mask).view(-1, 2)

    # max dico size
    if params.dico_max_size > 0:
        all_scores = all_scores[:params.dico_max_size]
        all_pairs = all_pairs[:params.dico_max_size]

    # min dico size
    diff = all_scores[:, 0] - all_scores[:, 1]
    if params.dico_min_size > 0:
        diff[:params.dico_min_size] = 1e9

    # confidence threshold
    if params.dico_threshold > 0:
        mask = diff > params.dico_threshold
        logger.info("Selected %i / %i pairs above the confidence threshold." %
                    (mask.sum(), diff.size(0)))
        mask = mask.unsqueeze(1).expand_as(all_pairs).clone()
        all_pairs = all_pairs.masked_select(mask).view(-1, 2)

    return all_pairs
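A minimal way to exercise the function above on toy data. The SimpleNamespace stands in for the parsed parameters; the field names are taken from the code above, but the values (nearest-neighbor method, all filters disabled) are illustrative rather than defaults from the original project, and the CSLS path would additionally require get_nn_avg_dist and logger.

import torch
from types import SimpleNamespace

# Two toy vocabularies of 1000 words with 50-dimensional embeddings,
# L2-normalized so that dot products are cosine similarities.
emb1 = torch.randn(1000, 50)
emb2 = torch.randn(1000, 50)
emb1 = emb1 / emb1.norm(2, 1, keepdim=True)
emb2 = emb2 / emb2.norm(2, 1, keepdim=True)

params = SimpleNamespace(dico_method='nn', dico_max_rank=0,
                         dico_max_size=0, dico_min_size=0, dico_threshold=0)
pairs = get_candidates(emb1, emb2, params)  # (1000, 2) tensor of (source, target) index pairs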
Example #4
def get_candidates(emb1, emb2, args):
    bs = 128

    all_scores = []
    all_targets = []
    n_src = emb1.size(0)
    if args.dico_method == 'nn':

        for i in range(0, n_src, bs):
            scores = emb2.mm(emb1[i:min(n_src, i + bs)]
                             .transpose(0, 1)).transpose(0, 1)
            best_scores, best_targets = scores.topk(2,
                                                    dim=1,
                                                    largest=True,
                                                    sorted=True)
            all_scores.append(best_scores.cpu())
            all_targets.append(best_targets.cpu())

        # all_scores: for each source word in emb1, the scores of its two closest target words
        all_scores = torch.cat(all_scores, 0)
        # all_targets: for each source word in emb1, the indices of its two closest target words
        all_targets = torch.cat(all_targets, 0)

    if args.dico_method.startswith('csls_knn_'):
        knn = args.dico_method[len('csls_knn_'):]
        knn = int(knn)

        ave_dist1 = torch.from_numpy(get_nn_avg_dist(emb2, emb1, knn))
        ave_dist2 = torch.from_numpy(get_nn_avg_dist(emb1, emb2, knn))
        ave_dist1 = ave_dist1.type_as(emb1)
        ave_dist2 = ave_dist2.type_as(emb2)

        for i in range(0, n_src, bs):
            scores = emb2.mm((emb1[i:min(n_src, i + bs)])
                             .transpose(0, 1)).transpose(0, 1)
            scores.mul_(2)

            scores.sub_(ave_dist1[i:min(n_src, i + bs)][:, None]
                        + ave_dist2[None, :])

            best_scores, best_targets =\
                scores.topk(2, dim=1, largest=True, sorted=True)

            all_scores.append(best_scores.cpu())
            all_targets.append(best_targets.cpu())

        all_scores = torch.cat(all_scores, 0)
        all_targets = torch.cat(all_targets, 0)

    all_pairs = torch.cat([
        torch.arange(0, all_targets.size(0)).long().unsqueeze(1),
        all_targets[:, 0].unsqueeze(1)
        ], 1)

    assert all_scores.size() == all_pairs.size() == (n_src, 2)

    # sort pairs by score confidence
    diff = all_scores[:, 0] - all_scores[:, 1]
    reordered = diff.sort(0, descending=True)[1]
    all_scores = all_scores[reordered]
    all_pairs = all_pairs[reordered]

    if args.dico_max_rank > 0:
        selected = all_pairs.max(1)[0] <= args.dico_max_rank
        mask = selected.unsqueeze(1).expand_as(all_scores).clone()
        all_scores = all_scores.masked_select(mask).view(-1, 2)
        all_pairs = all_pairs.masked_select(mask).view(-1, 2)

    if args.dico_max_size > 0:
        all_scores = all_scores[:args.dico_max_size]
        all_pairs = all_pairs[:args.dico_max_size]

    diff = all_scores[:, 0] - all_scores[:, 1]
    if args.dico_min_size > 0:
        diff[:args.dico_min_size] = 1e9

    if args.dico_threshold > 0:
        mask = diff > args.dico_threshold
        mask = mask.unsqueeze(1).expand_as(all_pairs).clone()
        all_pairs = all_pairs.masked_select(mask).view(-1, 2)

    return all_pairs
Example #5
def get_word_translation_accuracy(lang1, word2id1, emb1, lang2, word2id2, emb2, method):
    """
    Given source and target word embeddings, and a dictionary,
    evaluate the translation accuracy using the precision@k.
    """
    path = os.path.join(DIC_EVAL_PATH, '%s-%s.txt' % (lang1, lang2))
    dico = load_dictionary(path, word2id1, word2id2)
    dico = dico.cuda() if emb1.is_cuda else dico

    assert dico[:, 0].max() < emb1.size(0)
    assert dico[:, 1].max() < emb2.size(0)

    # normalize word embeddings
    emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
    emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)

    # nearest neighbors
    if method == 'nn':
        query = emb1[dico[:, 0]]
        scores = query.mm(emb2.transpose(0, 1))

    # inverted softmax
    elif method.startswith('invsm_beta_'):
        beta = float(method[len('invsm_beta_'):])
        bs = 128
        word_scores = []
        for i in range(0, emb2.size(0), bs):
            scores = emb1.mm(emb2[i:i + bs].transpose(0, 1))
            scores.mul_(beta).exp_()
            scores.div_(scores.sum(0, keepdim=True).expand_as(scores))
            word_scores.append(scores.index_select(0, dico[:, 0]))
        scores = torch.cat(word_scores, 1)

    # contextual dissimilarity measure
    elif method.startswith('csls_knn_'):
        # average distances to k nearest neighbors
        knn = method[len('csls_knn_'):]
        assert knn.isdigit()
        knn = int(knn)
        average_dist1 = get_nn_avg_dist(emb2, emb1, knn)
        average_dist2 = get_nn_avg_dist(emb1, emb2, knn)
        average_dist1 = torch.from_numpy(average_dist1).type_as(emb1)
        average_dist2 = torch.from_numpy(average_dist2).type_as(emb2)
        # queries / scores
        query = emb1[dico[:, 0]]
        scores = query.mm(emb2.transpose(0, 1))
        scores.mul_(2)
        scores.sub_(average_dist1[dico[:, 0]][:, None] + average_dist2[None, :])

    else:
        raise Exception('Unknown method: "%s"' % method)

    results = []
    top_matches = scores.topk(100, 1, True)[1]
    for k in [1, 5, 10]:
        top_k_matches = top_matches[:, :k]
        _matching = (top_k_matches == dico[:, 1][:, None].expand_as(top_k_matches)).sum(1)
        # allow for multiple possible translations
        matching = {}
        # use plain ints as keys so repeated source words are aggregated correctly
        for i, src_id in enumerate(dico[:, 0].cpu().numpy()):
            matching[src_id] = min(matching.get(src_id, 0) + _matching[i].item(), 1)
        # evaluate precision@k
        precision_at_k = 100 * np.mean(list(matching.values()))
        logger.info("%i source words - %s - Precision at k = %i: %f" %
                    (len(matching), method, k, precision_at_k))
        results.append(('precision_at_%i' % k, precision_at_k))

    return results
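Example #5 additionally assumes DIC_EVAL_PATH and load_dictionary. The following is a plausible sketch of the latter, under the assumption that the dictionary file holds one "source target" word pair per line and that only pairs covered by both vocabularies are kept; the real helper may also handle lower-casing or duplicate filtering.

import torch

def load_dictionary(path, word2id1, word2id2):
    # Hypothetical sketch: map word pairs to index pairs, skipping OOV entries.
    pairs = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) != 2:
                continue
            src, tgt = parts
            if src in word2id1 and tgt in word2id2:
                pairs.append((word2id1[src], word2id2[tgt]))
    return torch.LongTensor(pairs)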