def main2():
    for zzz in LANGUAGE_LIST:
        lang = zzz[0]
        # get original embed
        system("wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
               % (OUT_DIR, lang, lang), pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" % (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
def main():
    # first get the English one
    lang = "en"
    system("wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
           % (OUT_DIR, lang, lang), pp=True)
    # en_dict = FastVector(vector_file='%s/wiki.en.vec' % OUT_DIR)
    for zzz in LANGUAGE_LIST:
        lang, fnames = zzz[0], zzz[1]
        printing("Dealing with lang %s." % lang)
        for curf in ["train", "dev", "test"]:
            out_fname = "%s/%s_%s.conllu" % (OUT_DIR, lang, curf)
            fout = zopen(out_fname, "w")
            for fname in fnames:
                last_name = fname.split("-")[-1].lower()
                path_name = "%s/%s/%s_%s-ud-%s.conllu" % (UD2_DIR, fname, lang, last_name, curf)
                if os.path.exists(path_name):
                    with zopen(path_name) as fin:
                        deal_conll_file(fin, fout)
            fout.close()
            # stat
            system('cat %s | grep -E "^$" | wc' % out_fname, pp=True)
            system('cat %s | grep -Ev "^$" | wc' % out_fname, pp=True)
            system("cat %s | grep -Ev '^$' | cut -f 5 -d $'\t'| grep -Ev 'PUNCT|SYM' | wc" % out_fname, pp=True)
        # get original embed
        system("wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
               % (OUT_DIR, lang, lang), pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" % (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
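Both main() and main2() iterate over LANGUAGE_LIST; main() additionally uses the UD treebank directory names stored with each language code. The real LANGUAGE_LIST is defined elsewhere in this script; a hypothetical illustration of the expected shape (the treebank names below are examples only):

# Hypothetical illustration of the shape main()/main2() expect; the real
# LANGUAGE_LIST lives elsewhere in this script.
LANGUAGE_LIST = [
    ("en", ["UD_English-EWT"]),    # -> UD2_DIR/UD_English-EWT/en_ewt-ud-train.conllu, ...
    ("de", ["UD_German-GSD"]),
]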
    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)


lang1_dictionary = FastVector(vector_file=args.lang1)
lang2_dictionary = FastVector(vector_file=args.lang2)

bilingual_dictionary = []
file_object = open(args.dict, "r")
lines = file_object.readlines()
for line in lines:
    line = re.sub(r'\n', '', line)
    w_lang2, w_lang1 = line.split('\t')
    if w_lang1 in lang1_dictionary.word2id.keys() and w_lang2 in lang2_dictionary.word2id.keys():
        bilingual_dictionary.append(tuple((w_lang2, w_lang1)))
print("Dic Size: " + str(len(bilingual_dictionary)))

# form the training matrices
source_matrix, target_matrix = make_training_matrices(
    lang1_dictionary, lang2_dictionary, bilingual_dictionary)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
lang1_dictionary.apply_transform(transform)
lang1_dictionary.export(args.out1)
lang2_dictionary.export(args.out2)
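Only the tail of learn_transformation appears in the excerpt above. Below is a minimal sketch of what the two helpers are assumed to do, following the standard SVD (orthogonal Procrustes) alignment recipe; the real implementation may also L2-normalize the rows before taking the SVD.

import numpy as np

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    # Stack the paired word vectors row by row; row i of both matrices
    # corresponds to the i-th translation pair.
    source_matrix, target_matrix = [], []
    for source_word, target_word in bilingual_dictionary:
        source_matrix.append(source_dictionary[source_word])
        target_matrix.append(target_dictionary[target_word])
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix):
    # Orthogonal Procrustes: the SVD of X^T Y yields the rotation W = U V^T
    # that best maps source rows onto target rows in the least-squares sense.
    # (np.linalg.svd already returns V transposed, hence the direct matmul.)
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)
    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

apply_transform then presumably multiplies every source vector by this matrix, after which cosine similarities against the target-language space become comparable.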
en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]
# going to print 0.0004326613965749648
print(FastVector.cosine_similarity(en_vector, zh_vector))

zh_words = set(zh_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())
overlap = list(zh_words & en_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]

# form the training matrices
source_matrix, target_matrix = make_training_matrices(
    en_dictionary, zh_dictionary, bilingual_dictionary)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)

en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]
# going to print 0.18727020978991674
print(FastVector.cosine_similarity(en_vector, zh_vector))

en_dictionary.export("cc.en.aligned.to.zh.vec")
embedding = gluonnlp.embedding.FastText.from_file('cc.en.aligned.to.zh.vec')
embedding.serialize('cc.en.300.aligned.to.zh.vec.npz')
print "Readling Dictionary" BI_DICT = codecs.open("o.s2t_f", "r").readlines() BI_DICT = parse_BI(BI_DICT) print "Readling Dictionary (END)" # SRC_WORD = "昨天" # TGT_WORD = "yesterday" SRC_WORD = "钥匙" TGT_WORD = "keys" en_dictionary = FastVector(vector_file='en.emb.orig.vec') other_dictionary = FastVector(vector_file='tizh.emb.orig.vec') test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD) # form the training matrices print "Learning SVD" source_matrix, target_matrix = make_training_matrices(other_dictionary, en_dictionary, BI_DICT) # learn and apply the transformation transform = learn_transformation(source_matrix, target_matrix) other_dictionary.apply_transform(transform) # zh test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD) # ti SRC_WORD = "กุญ" test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD) print "Writing transform Qe out" other_dictionary.export("Qe")
#coding=utf-8
from fasttext import FastVector

fr_dictionary = FastVector(vector_file='wiki.en.vec')
fr_dictionary.export('fr.vec.txt')