Example 1
def ground_truth(en_sent, fr_sent):
    """
    Function that extracts the ground truth for a pair of sentences in english and french
    :param en_sent: The the sentence in english
    :param fr_sent: The sentence in french
    :return:
    """
    # keys = set(fr_sent)

    # score matrix
    score = np.empty([len(en_sent), len(fr_sent)], dtype=np.float32)

    # label
    truth = np.zeros([len(en_sent), len(fr_sent)], dtype=np.float32)

    # score every (English, French) word pair by the cosine similarity of their embeddings
    for j in range(len(en_sent)):
        for k in range(len(fr_sent)):
            score[j, k] = FastVector.cosine_similarity(en_dict[en_sent[j]],
                                                       fr_dict[fr_sent[k]])

    # ground truth: for each English word, mark the highest-scoring French word
    # (argmax breaks ties by taking the first index)
    for j in range(len(en_sent)):
        argmax = int(score[j].argmax())
        truth[j, argmax] = 1.

    return truth.reshape(-1)
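
A minimal usage sketch, not part of the original snippet: it assumes module-level en_dict and fr_dict FastVector lookups (the file names here are hypothetical) and a tokenized sentence pair.

# hypothetical embeddings, assumed to be loaded elsewhere in the module:
# en_dict = FastVector(vector_file='cc.en.300.vec')
# fr_dict = FastVector(vector_file='cc.fr.300.vec')

en_sent = ["the", "cat", "sleeps"]
fr_sent = ["le", "chat", "dort"]

# one row per English token, each with a single 1 at the best-matching French word
labels = ground_truth(en_sent, fr_sent)
print(labels.reshape(len(en_sent), len(fr_sent)))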
Example 2
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)


# Now we load the French and Russian word vectors, and evaluate the similarity of "chat" and "кот":

# In[2]:

fr_dictionary = FastVector(vector_file='fr_vec.txt')
ru_dictionary = FastVector(vector_file='ru_vec.txt')

fr_vector = fr_dictionary["chat"]
ru_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))

# "chat" and "кот" both mean "cat", so they should be highly similar; clearly the two word vector spaces are not yet aligned. To align them, we need a bilingual dictionary of French and Russian translation pairs. As it happens, this is a great opportunity to show you something truly amazing...
#
# Many words appear in the vocabularies of more than one language; words like "alberto", "london" and "presse". These words usually mean similar things in each language. Therefore we can form a bilingual dictionary, by simply extracting every word that appears in both the French and Russian vocabularies.

# In[3]:

ru_words = set(ru_dictionary.word2id.keys())
fr_words = set(fr_dictionary.word2id.keys())
overlap = list(ru_words & fr_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]
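
# Purely illustrative (not in the original notebook): check how large the "free"
# dictionary is and what a few of the identical-word pairs look like.
print(len(bilingual_dictionary), "shared words")
print(bilingual_dictionary[:5])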

# Let's align the French vectors to the Russian vectors, using only this "free" dictionary that we acquired without any bilingual expert knowledge.

# In[ ]:
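
# The alignment cell itself is not shown in this excerpt. A minimal sketch of what it
# would contain, assuming the make_training_matrices and learn_transformation helpers
# used in Example 3 are in scope:

source_matrix, target_matrix = make_training_matrices(fr_dictionary,
                                                      ru_dictionary,
                                                      bilingual_dictionary)

# learn the orthogonal transform on the shared-vocabulary pairs and apply it to French
transform = learn_transformation(source_matrix, target_matrix)
fr_dictionary.apply_transform(transform)

# after alignment, "chat" and "кот" should score much higher than before
print(FastVector.cosine_similarity(fr_dictionary["chat"], ru_dictionary["кот"]))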
Example 3
    # orthogonal Procrustes: take the SVD of X^T Y (numpy returns V already transposed),
    # so U V below is the orthogonal matrix that best maps the source space to the target
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns the source language to the target
    return np.matmul(U, V)


# copy embedding files from https://fasttext.cc/docs/en/crawl-vectors.html#models
en_dictionary = FastVector(vector_file='cc.en.300.vec')
zh_dictionary = FastVector(vector_file='cc.zh.300.vec')

en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]

# expected output before alignment: 0.0004326613965749648
print(FastVector.cosine_similarity(en_vector, zh_vector))

zh_words = set(zh_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())
overlap = list(zh_words & en_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]

# form the training matrices
source_matrix, target_matrix = make_training_matrices(en_dictionary,
                                                      zh_dictionary,
                                                      bilingual_dictionary)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)
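
# A quick sanity check, not part of the original snippet: after applying the transform,
# the same word pair should score far higher than the pre-alignment value printed above
# (the exact number depends on the embedding files).
print(FastVector.cosine_similarity(en_dictionary["love"], zh_dictionary["爱"]))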
Example 4
        except ValueError:
            continue

        if it_w not in it_dictionary:
            print(it_w + " - " + fr_w + " - it word not found")
            if lang == skip_lang:
                continue

        if fr_w not in fr_dictionary:
            print(it_w + " - " + fr_w + " - " + lang + " word not found")
            if lang == skip_lang:
                continue

        # Cosine between words
        try:
            cosine = FastVector.cosine_similarity(it_dictionary[it_w],
                                                  fr_dictionary[fr_w])
        except KeyError:
            cosine = 1  # fallback: treat out-of-vocabulary pairs as maximally similar

        # Synonyms list
        synonyms_list = []
        if it_w in synonyms_dict:
            synonyms_list = synonyms_dict[it_w]

        if len(synonyms_list) < 1:
            synonyms_list.append(it_w)

        # Array containing all cosines from synonyms
        synonyms_cosine_list = []
        for s in synonyms_list:
            if s not in it_dictionary:
Example 5
def test_word(en_dictionary, other_dictionary, SRC_WORD, TGT_WORD):
    print "Testing WORD[%s->%s]" % (SRC_WORD, TGT_WORD)
    en_vector = en_dictionary[SRC_WORD]
    other_vector = other_dictionary[TGT_WORD]
    print(FastVector.cosine_similarity(en_vector, other_vector))
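
A possible call, for illustration only, reusing the English and Chinese dictionaries loaded in Example 3:

test_word(en_dictionary, zh_dictionary, "love", "爱")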
Example 6
    args = parse_args()

    print('loading vectors')
    en_dictionary = FastVector(vector_file=args.en_embedding)
    fr_dictionary = FastVector(vector_file=args.fr_embedding)

    #print('transforming vectors')
    #fr_dictionary.apply_transform('alignment_matrices/fr.txt')

    #print('CCA...')
    #en_fr = read_dictionary(args.embedding_path+'en_fr.txt')
    #en_dictionary.embed, fr_dictionary.embed = cca(en_dictionary, fr_dictionary, en_fr, dim=250)

    print(
        "Hello score:",
        FastVector.cosine_similarity(en_dictionary["hello"],
                                     fr_dictionary["bonjour"]))

    print('processing data')
    en_train_file = args.source_path + 'en_train.tsv'
    en_test_file = args.source_path + 'en_test.tsv'
    fr_train_file = args.source_path + 'fr_train.tsv'
    fr_test_file = args.source_path + 'fr_test.tsv'

    print('english train')
    en_train_df = read_dataset(en_train_file)
    en_train_y, en_train_x, en_vectorizor = process_dataset(
        en_train_df, en_dictionary, None)

    n_classes = len(set(en_train_y))
    label_encoder = dict(zip(list(set(en_train_y)), np.arange(n_classes)))
    en_train_y = np.array([label_encoder[i] for i in en_train_y])