Example #1
def main2():
    for zzz in LANGUAGE_LIST:
        lang = zzz[0]
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang),
            pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" %
                                  (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
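For orientation: apply_transform right-multiplies every word vector by a square alignment matrix. A minimal sketch of the idea, assuming the files under alignment_matrices/ are plain-text arrays readable with np.loadtxt (the helper name apply_alignment is made up for illustration):

import numpy as np

def apply_alignment(embeddings, matrix_path):
    # embeddings: (vocab_size, dim) array; matrix_path: text file holding a dim x dim matrix
    transform = np.loadtxt(matrix_path)
    # rotate every row vector into the shared (English-anchored) space
    return np.matmul(embeddings, transform)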
Example #2
def main():
    # first get the English one
    lang = "en"
    system(
        "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
        % (OUT_DIR, lang, lang),
        pp=True)
    # en_dict = FastVector(vector_file='%s/wiki.en.vec' % OUT_DIR)
    for zzz in LANGUAGE_LIST:
        lang, fnames = zzz[0], zzz[1]
        printing("Dealing with lang %s." % lang)
        for curf in ["train", "dev", "test"]:
            out_fname = "%s/%s_%s.conllu" % (OUT_DIR, lang, curf)
            fout = zopen(out_fname, "w")
            for fname in fnames:
                last_name = fname.split("-")[-1].lower()
                path_name = "%s/%s/%s_%s-ud-%s.conllu" % (UD2_DIR, fname, lang,
                                                          last_name, curf)
                if os.path.exists(path_name):
                    with zopen(path_name) as fin:
                        deal_conll_file(fin, fout)
            fout.close()
            # stats: count sentence separators (blank lines), token/comment lines,
            # and tokens whose column-5 tag is neither PUNCT nor SYM
            system('cat %s | grep -E "^$" | wc' % out_fname, pp=True)
            system('cat %s | grep -Ev "^$" | wc' % out_fname, pp=True)
            system(
                "cat %s | grep -Ev '^$' | cut -f 5 | grep -Ev 'PUNCT|SYM' | wc"
                % out_fname,
                pp=True)
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang),
            pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" %
                                  (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
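The three shell pipelines under the stats comment above count sentence separators, token lines, and non-punctuation tokens. A pure-Python equivalent, as a rough sketch (the function name conllu_stats is made up; unlike the raw pipelines, comment lines are skipped in the third count):

def conllu_stats(path):
    # blank lines separate sentences; non-blank lines are token (or comment) lines;
    # column 5 holds the tag that is filtered against PUNCT/SYM
    n_sent = n_lines = n_content = 0
    with open(path, encoding="utf-8") as fin:
        for line in fin:
            line = line.rstrip("\n")
            if not line:
                n_sent += 1
                continue
            n_lines += 1
            fields = line.split("\t")
            if len(fields) >= 5 and fields[4] not in ("PUNCT", "SYM"):
                n_content += 1
    return n_sent, n_lines, n_content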
Example #3

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)
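The return statement above is the tail of learn_transformation; the SVD that produces U and V was lost in extraction. A hedged reconstruction of the standard orthogonal-Procrustes solution it implies (the body below is an educated guess, not the snippet's verbatim code):

import numpy as np

def learn_transformation(source_matrix, target_matrix):
    # solve the orthogonal Procrustes problem: the rotation minimizing
    # ||source @ W - target|| comes from the SVD of source^T @ target
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)
    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)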


lang1_dictionary = FastVector(vector_file=args.lang1)
lang2_dictionary = FastVector(vector_file=args.lang2)

bilingual_dictionary = []
with open(args.dict, "r") as file_object:
    for line in file_object:
        w_lang2, w_lang1 = line.rstrip('\n').split('\t')
        if w_lang1 in lang1_dictionary.word2id and w_lang2 in lang2_dictionary.word2id:
            bilingual_dictionary.append((w_lang2, w_lang1))

print("Dictionary size: %d" % len(bilingual_dictionary))

# form the training matrices
source_matrix, target_matrix = make_training_matrices(lang1_dictionary,
                                                      lang2_dictionary,
                                                      bilingual_dictionary)
# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
lang1_dictionary.apply_transform(transform)

lang1_dictionary.export(args.out1)
lang2_dictionary.export(args.out2)
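make_training_matrices is not defined in these snippets. A plausible minimal sketch, assuming FastVector exposes word2id and item lookup as used elsewhere on this page, and that pairs are ordered (source_word, target_word):

import numpy as np

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    # stack the vectors of every translation pair into two row-aligned matrices
    source_rows, target_rows = [], []
    for source_word, target_word in bilingual_dictionary:
        if source_word in source_dictionary.word2id and target_word in target_dictionary.word2id:
            source_rows.append(source_dictionary[source_word])
            target_rows.append(target_dictionary[target_word])
    return np.array(source_rows), np.array(target_rows)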
Example #4
en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]

# going to print 0.0004326613965749648
print(FastVector.cosine_similarity(en_vector, zh_vector))

zh_words = set(zh_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())
overlap = list(zh_words & en_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]

# form the training matrices
source_matrix, target_matrix = make_training_matrices(en_dictionary,
                                                      zh_dictionary,
                                                      bilingual_dictionary)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)

en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]

# going to print 0.18727020978991674
print(FastVector.cosine_similarity(en_vector, zh_vector))

en_dictionary.export("cc.en.aligned.to.zh.vec")

embedding = gluonnlp.embedding.FastText.from_file('cc.en.aligned.to.zh.vec')
embedding.serialize('cc.en.300.aligned.to.zh.vec.npz')
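Judging by the values printed above, FastVector.cosine_similarity is the usual normalized dot product; a reference sketch:

import numpy as np

def cosine_similarity(vec_a, vec_b):
    # dot product scaled by both norms; 1.0 means identical direction
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))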
Example #5
print "Readling Dictionary"
BI_DICT = codecs.open("o.s2t_f", "r").readlines()
BI_DICT = parse_BI(BI_DICT)
print "Readling Dictionary (END)"

# SRC_WORD = "昨天"
# TGT_WORD = "yesterday"
SRC_WORD = "钥匙"
TGT_WORD = "keys"
en_dictionary = FastVector(vector_file='en.emb.orig.vec')
other_dictionary = FastVector(vector_file='tizh.emb.orig.vec')

test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

# form the training matrices
print "Learning SVD"
source_matrix, target_matrix = make_training_matrices(other_dictionary,
                                                      en_dictionary, BI_DICT)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
other_dictionary.apply_transform(transform)
# zh
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)
# ti
SRC_WORD = "กุญ"
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

print "Writing transform Qe out"
other_dictionary.export("Qe")
Example #6
# coding=utf-8
from fasttext import FastVector

# note: despite the fr_ prefix, this loads the English vectors and simply
# re-exports them as plain text
fr_dictionary = FastVector(vector_file='wiki.en.vec')
fr_dictionary.export('fr.vec.txt')
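For reference, export writes the plain-text word2vec format: a header with vocabulary size and dimension, then one word and its vector per line. A minimal sketch of an equivalent writer, with made-up names (word2id maps words to row indices of embed):

def export_vectors(word2id, embed, out_path):
    # header: "<n_words> <dim>", then "word v1 v2 ... vd" per line
    n_words, dim = embed.shape
    with open(out_path, "w", encoding="utf-8") as fout:
        fout.write("%d %d\n" % (n_words, dim))
        for word, idx in sorted(word2id.items(), key=lambda kv: kv[1]):
            fout.write(word + " " + " ".join("%.4f" % x for x in embed[idx]) + "\n")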