def main():
    """Dump the Japanese nearest neighbours of English words to a CSV file.

    Loads the Japanese and English fastText vectors, applies each language's
    alignment matrix, and then walks the English vocabulary (``word2id`` key
    order).  For every English word one line
    ``<index>,<en_word>,<neighbours...>`` is written to
    ``en_ja_multifast.txt``, where the neighbours are whatever
    ``ja_dic.translate_k_nearest_neighbour(..., k=15)`` returns.

    Raises:
        SystemExit: once more than 5000 words have been written (the run is
            deliberately truncated, as in the original script).
    """
    ja_dic = FastVector(vector_file='../vecmap/data/wiki.ja.vec')
    en_dic = FastVector(vector_file='../vecmap/data/wiki.en.vec')
    print("loaded the dictionaries")

    ja_dic.apply_transform('alignment_matrices/ja.txt')
    en_dic.apply_transform('alignment_matrices/en.txt')
    print("transformed the dictionaries")

    en_word_list = list(en_dic.word2id.keys())
    print("The total length of English pretrained vector : " +
          str(len(en_word_list)))

    # The context manager closes (and flushes) the output on every exit
    # path, including the SystemExit raised below.
    with open("en_ja_multifast.txt", "w") as result_f:
        for idx, en_word in enumerate(tqdm(en_word_list), start=1):
            ja_words = ja_dic.translate_k_nearest_neighbour(en_dic[en_word],
                                                            k=15)
            result_f.write(
                str(idx) + "," + en_word + "," + ",".join(ja_words) + "\n")
            if idx > 5000:
                # Same effect as the interactive helper `exit()`, without
                # depending on the `site` module having installed it.
                raise SystemExit
def main2():
    """Fetch, align and export the fastText vectors of every listed language.

    For each entry of ``LANGUAGE_LIST`` (only its first element, the language
    code, is used) the original ``wiki.<lang>.vec`` file is downloaded into
    ``OUT_DIR`` with ``wget -nc``, projected with the matching matrix from
    ``LIB_DIR/alignment_matrices`` and exported as ``wiki.multi.<lang>.vec``.
    """
    for entry in LANGUAGE_LIST:
        code = entry[0]

        # get original embed
        download_cmd = (
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, code, code))
        system(download_cmd, pp=True)

        # project with LIB-matrix
        vectors = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, code))
        matrix_path = "%s/alignment_matrices/%s.txt" % (LIB_DIR, code)
        vectors.apply_transform(matrix_path)
        export_path = "%s/wiki.multi.%s.vec" % (OUT_DIR, code)
        vectors.export(export_path)
def main():
    """Build merged CoNLL-U files and aligned embeddings for every language.

    Steps:
      1. Download the English ``wiki.en.vec`` vectors into ``OUT_DIR``.
      2. For each ``(lang, fnames, ...)`` entry of ``LANGUAGE_LIST`` and each
         of the ``train``/``dev``/``test`` splits, pass every existing
         treebank file under ``UD2_DIR`` through ``deal_conll_file`` into one
         ``OUT_DIR/<lang>_<split>.conllu`` file, then print ``wc`` statistics
         for it.
      3. Download that language's fastText vectors, project them with the
         matrix from ``LIB_DIR/alignment_matrices`` and export them as
         ``wiki.multi.<lang>.vec``.
    """
    # first get the English one
    lang = "en"
    system(
        "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
        % (OUT_DIR, lang, lang),
        pp=True)
    # en_dict = FastVector(vector_file='%s/wiki.en.vec' % OUT_DIR)
    for zzz in LANGUAGE_LIST:
        lang, fnames = zzz[0], zzz[1]
        printing("Dealing with lang %s." % lang)
        for curf in ["train", "dev", "test"]:
            out_fname = "%s/%s_%s.conllu" % (OUT_DIR, lang, curf)
            # `with` guarantees the output is closed (and flushed) before the
            # shell statistics below read it, even if a treebank file fails.
            with zopen(out_fname, "w") as fout:
                for fname in fnames:
                    last_name = fname.split("-")[-1].lower()
                    path_name = "%s/%s/%s_%s-ud-%s.conllu" % (
                        UD2_DIR, fname, lang, last_name, curf)
                    # Not every treebank ships every split; missing ones are
                    # skipped silently.
                    if os.path.exists(path_name):
                        with zopen(path_name) as fin:
                            deal_conll_file(fin, fout)
            # stat: blank lines, non-blank lines, and non-blank lines whose
            # 5th tab-separated field is neither PUNCT nor SYM.
            # NOTE(review): field 5 is XPOS in standard CoNLL-U; this
            # presumably relies on the column layout written by
            # deal_conll_file -- confirm.
            system('cat %s | grep -E "^$" | wc' % out_fname, pp=True)
            system('cat %s | grep -Ev "^$" | wc' % out_fname, pp=True)
            system(
                "cat %s | grep -Ev '^$' | cut -f 5 -d $'\t'| grep -Ev 'PUNCT|SYM' | wc"
                % out_fname,
                pp=True)
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang),
            pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" %
                                  (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
# return orthogonal transformation which aligns source language to the target return np.matmul(U, V) lang1_dictionary = FastVector(vector_file=args.lang1) lang2_dictionary = FastVector(vector_file=args.lang2) bilingual_dictionary = [] file_object = open(args.dict, "r") lines = file_object.readlines() for line in lines: line = re.sub(r'\n', '', line) w_lang2, w_lang1 = line.split('\t') if w_lang1 in lang1_dictionary.word2id.keys( ) and w_lang2 in lang2_dictionary.word2id.keys(): bilingual_dictionary.append(tuple((w_lang2, w_lang1))) print("Dic Size: " + str(len(bilingual_dictionary))) # form the training matrices# form source_matrix, target_matrix = make_training_matrices(lang1_dictionary, lang2_dictionary, bilingual_dictionary) # learn and apply the transformation transform = learn_transformation(source_matrix, target_matrix) lang1_dictionary.apply_transform(transform) lang1_dictionary.export(args.out1) lang2_dictionary.export(args.out2)
# Words spelled identically in both vocabularies form a "free" bilingual
# dictionary: each shared string is paired with itself.
ru_words = set(ru_dictionary.word2id.keys())
fr_words = set(fr_dictionary.word2id.keys())
overlap = list(ru_words.intersection(fr_words))
bilingual_dictionary = list(zip(overlap, overlap))

# Let's align the French vectors to the Russian vectors, using only this
# "free" dictionary that we acquired without any bilingual expert knowledge.

# In[ ]:

# Build the training matrices (French is the source, Russian the target).
source_matrix, target_matrix = make_training_matrices(
    fr_dictionary,
    ru_dictionary,
    bilingual_dictionary,
)

# Learn the transformation and apply it to the French vectors only.
transform = learn_transformation(source_matrix, target_matrix)
fr_dictionary.apply_transform(transform)

# Finally, we re-evaluate the similarity of "chat" and "кот":

# In[4]:

fr_vector = fr_dictionary["chat"]
ru_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))

# "chat" and "кот" are pretty similar after all :)
#
# Use this simple "identical strings" trick to align other language pairs for
# yourself, or prepare your own expert bilingual dictionaries for optimal
# performance.
import json

from fasttext import FastVector

# Number of nearest neighbours written for every probe word.
NUM_NEIGHBOURS = 20

ja_dic = FastVector(vector_file='../vecmap/data/wiki.ja.vec')
en_dic = FastVector(vector_file='../vecmap/data/wiki.en.vec')
print("loaded the dictionaries")

# Project both vocabularies with their alignment matrices so that vectors
# from the two languages can be compared directly.
ja_dic.apply_transform('alignment_matrices/ja.txt')
en_dic.apply_transform('alignment_matrices/en.txt')
print("transformed the dictionaries")

en_word_list = [
    "cat", "dog", "apple", "car", "train", "school", "student", "teacher"
]
ja_word_list = ["猫", "犬", "りんご", "車", "電車", "学校", "生徒", "先生"]

# Maps every probe word (Japanese and English) to its neighbour list.
result = {}

# The output contains Japanese text, so the encoding is fixed explicitly
# instead of depending on the platform default; the context manager makes
# sure the file is flushed and closed.
with open("multi_fast.txt", "w", encoding="utf-8") as result_f:
    # Japanese probes are looked up in the English space first, then the
    # English probes in the Japanese space (same order as the output file).
    for words, source_dic, target_dic in (
        (ja_word_list, ja_dic, en_dic),
        (en_word_list, en_dic, ja_dic),
    ):
        for word in words:
            neighbours = target_dic.translate_k_nearest_neighbour(
                source_dic[word], k=NUM_NEIGHBOURS)
            result[word] = neighbours
            result_str = ",".join(neighbours)
            result_f.write(word + "," + result_str + "\n")
def train(args):
    """Jointly train the s2s, t2t, s2t and t2s translation models.

    The four ``MT`` models share one encoder; the two denoising models
    (s2s, t2t) and the two translation models (s2t, t2s) reuse the same
    source/target decoders, generators and frozen pretrained embedders.
    Every training batch runs ten update steps (see the loop body), and only
    the supervised s2t loss is accumulated for logging.

    Args:
        args: namespace providing ``train_src``, ``train_tar``,
            ``train_src_mono``, ``train_tar_mono``, ``dev_src``, ``dev_tar``,
            ``batch_size``, ``valid_iter``, ``log_every``, ``save_path``,
            ``vocab_src``, ``vocab_tar``, ``embed_src``, ``embed_tar``,
            ``embed_alignment``, ``gen_src``, ``gen_tar``, ``multi_gpu``,
            ``hidden_size``, ``encoder_bidir``, ``encoder_layer``,
            ``dropout``, ``lr`` and ``max_epoch``.  ``args.embed_size`` is
            overwritten with 300.

    This function does not return: it loops until ``exit(0)`` is called when
    ``args.max_epoch`` is reached (checked at validation time only).
    """
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tar, source='tgt')
    train_data_src_mono = read_corpus(args.train_src_mono, source='src')
    train_data_tgt_mono = read_corpus(args.train_tar_mono, source='tgt')

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tar, source='tgt')

    # zip() truncates to the shortest corpus, so parallel and monolingual
    # sentences are consumed in lockstep.
    train_data = list(
        zip(train_data_src, train_data_tgt, train_data_src_mono,
            train_data_tgt_mono))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args.batch_size)
    valid_niter = int(args.valid_iter)
    log_every = int(args.log_every)
    model_save_path = args.save_path  # only used by the disabled code below

    # --- Vocab dict ---
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    with open(args.vocab_src, 'rb') as vocab_file:
        vocab_src = pickle.load(vocab_file)
    with open(args.vocab_tar, 'rb') as vocab_file:
        vocab_tar = pickle.load(vocab_file)

    # --- Optimizer params (one parameter list per translation direction) ---
    s2s_param = []
    t2t_param = []
    s2t_param = []
    t2s_param = []

    # --- Embed: pretrained (and fixed), cross-lingual embeddings ---
    args.embed_size = 300
    from fasttext import FastVector

    def load_vectors(cache_path, vector_file, alignment=None):
        """Load pickled vectors from cache_path, building the cache if absent."""
        try:
            with open(cache_path, 'rb') as cache_file:
                return pickle.load(cache_file)
        except FileNotFoundError:
            vectors = FastVector(vector_file=vector_file)
            if alignment is not None:
                vectors.apply_transform(alignment)
            cache_dir = os.path.dirname(cache_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            # Closing the handle guarantees the cache is completely written.
            with open(cache_path, 'wb+') as cache_file:
                pickle.dump(vectors, cache_file)
            return vectors

    def pretrained_embedding(vectors, vocab):
        """Build a frozen nn.Embedding with one row per vocab.id2word entry."""
        rows = []
        for word in vocab.id2word:
            key = word.lower() if word is not None else word
            if key in vectors:
                rows.append(torch.FloatTensor(vectors[key]))
            else:
                # Words without a pretrained vector get an all-zero row.
                rows.append(torch.zeros(args.embed_size))
        return nn.Embedding.from_pretrained(torch.stack(rows, dim=0),
                                            freeze=True)

    vectors_src = load_vectors('embed/' + args.embed_src, args.embed_src,
                               alignment=args.embed_alignment)
    # tar is en, no alignment required
    vectors_tar = load_vectors('embed/' + args.embed_tar, args.embed_tar)

    embedder_src = Embedder(vocab_src.dict_size(), args.embed_size,
                            pretrained_embedding(vectors_src, vocab_src))
    embedder_tar = Embedder(vocab_tar.dict_size(), args.embed_size,
                            pretrained_embedding(vectors_tar, vocab_tar))

    # --- Generator ---
    gen_src = EmbeddingGenerator(args.hidden_size, args.embed_size).cuda()
    gen_src_wrapper = WrapperEmbeddingGenerator(gen_src, embedder_src).cuda()
    gen_tar = EmbeddingGenerator(args.hidden_size, args.embed_size).cuda()
    gen_tar_wrapper = WrapperEmbeddingGenerator(gen_tar, embedder_tar).cuda()

    # A generator loaded from disk is not registered with any optimizer.
    if args.gen_src != "":
        gen_src_wrapper.load_weight(args.gen_src)
    else:
        [s2s_param, s2t_param,
         t2s_param] = add_to_optimizer(gen_src_wrapper,
                                       [s2s_param, s2t_param, t2s_param])

    if args.gen_tar != "":
        gen_tar_wrapper.load_weight(args.gen_tar)
    else:
        [s2t_param, t2s_param,
         t2t_param] = add_to_optimizer(gen_tar_wrapper,
                                       [s2t_param, t2s_param, t2t_param])

    if args.multi_gpu:
        gen_src_wrapper = nn.DataParallel(gen_src_wrapper, device_ids=[0, 1])
        gen_tar_wrapper = nn.DataParallel(gen_tar_wrapper, device_ids=[0, 1])

    # --- encoder (shared by all four models) ---
    encoder = GRUEncoder(args.embed_size,
                         args.hidden_size,
                         bidirectional=args.encoder_bidir,
                         layers=args.encoder_layer,
                         dropout=args.dropout).cuda()
    if args.multi_gpu:
        encoder = nn.DataParallel(encoder, device_ids=[0, 1])
    [s2s_param, s2t_param, t2s_param, t2t_param
     ] = add_to_optimizer(encoder,
                          [s2s_param, s2t_param, t2s_param, t2t_param])

    # --- Decoder ---
    decoder_src = AttentionDecoder(args.embed_size,
                                   args.hidden_size,
                                   1,
                                   args.dropout,
                                   input_feed=True).cuda()
    decoder_tar = AttentionDecoder(args.embed_size,
                                   args.hidden_size,
                                   1,
                                   args.dropout,
                                   input_feed=True).cuda()
    if args.multi_gpu:
        decoder_src = nn.DataParallel(decoder_src, device_ids=[0, 1])
        decoder_tar = nn.DataParallel(decoder_tar, device_ids=[0, 1])
    [s2s_param, s2t_param,
     t2s_param] = add_to_optimizer(decoder_src,
                                   [s2s_param, s2t_param, t2s_param])
    [s2t_param, t2s_param,
     t2t_param] = add_to_optimizer(decoder_tar,
                                   [s2t_param, t2s_param, t2t_param])

    # --- Translators ---
    s2s_model = MT(vocab_src,
                   vocab_src,
                   embedder_src,
                   embedder_src,
                   gen_src_wrapper,
                   encoder,
                   decoder_src,
                   denoising=True,
                   multi_gpu=args.multi_gpu)
    t2t_model = MT(vocab_tar,
                   vocab_tar,
                   embedder_tar,
                   embedder_tar,
                   gen_tar_wrapper,
                   encoder,
                   decoder_tar,
                   denoising=True,
                   multi_gpu=args.multi_gpu)
    s2t_model = MT(vocab_src,
                   vocab_tar,
                   embedder_src,
                   embedder_tar,
                   gen_tar_wrapper,
                   encoder,
                   decoder_tar,
                   denoising=False,
                   multi_gpu=args.multi_gpu)
    t2s_model = MT(vocab_tar,
                   vocab_src,
                   embedder_tar,
                   embedder_src,
                   gen_src_wrapper,
                   encoder,
                   decoder_src,
                   denoising=False,
                   multi_gpu=args.multi_gpu)

    # --- optimizers ---
    s2s_optimizer = torch.optim.Adam(s2s_param, lr=args.lr)
    t2t_optimizer = torch.optim.Adam(t2t_param, lr=args.lr)
    s2t_optimizer = torch.optim.Adam(s2t_param, lr=args.lr)
    t2s_optimizer = torch.optim.Adam(t2s_param, lr=args.lr)

    def unwrap(module):
        """Return the module inside an nn.DataParallel wrapper, if any."""
        # nn.DataParallel does not forward custom methods such as
        # save_weight, so they must be called on the wrapped module.
        return module.module if isinstance(module,
                                           nn.DataParallel) else module

    def save_model():
        """Write the weights of every trained component under save_path."""
        # save embedder
        if args.embed_src == "":
            embedder_src.save_weight(args.save_path + "/embed_src.bin")
        if args.embed_tar == "":
            embedder_tar.save_weight(args.save_path + "/embed_tar.bin")

        # save generator (only when it was trained here, not loaded)
        if args.gen_src == "":
            unwrap(gen_src_wrapper).save_weight(args.save_path +
                                                "/gen_src.bin")
        if args.gen_tar == "":
            unwrap(gen_tar_wrapper).save_weight(args.save_path +
                                                "/gen_tar.bin")

        # save encoder
        unwrap(encoder).save_weight(args.save_path + "/encoder.bin")

        # save decoder
        unwrap(decoder_src).save_weight(args.save_path + "/decoder_src.bin")
        unwrap(decoder_tar).save_weight(args.save_path + "/decoder_tar.bin")

        # TODO: the optimizer state is not saved.
        print("all models saved")

    def train_step(mt, optimizer, src_sents, tar_sents):
        """Run one optimizer update and return the batch loss as a float."""
        optimizer.zero_grad()
        loss = mt.get_loss(src_sents, tar_sents, train=True)
        # The loss is reported as-is.  The former `loss += loss.data[0]`
        # doubled the reported value (squaring the logged perplexity) and
        # fails on 0-dim tensors in current PyTorch.
        res = loss.cpu().detach().item()
        loss.div(args.batch_size).backward()
        optimizer.step()
        return res

    def train_step_backtranslate(mt, optimizer, src_sents, max_ratio):
        """Greedy-decode src_sents with mt, then train mt on that output."""
        # NOTE(review): the decoded sentences are used as targets for the
        # same model that produced them -- confirm this is the intended
        # back-translation scheme.
        tar_sents = mt.greedy(src_sents, max_ratio, mode=False)
        res = train_step(mt, optimizer, src_sents, tar_sents)
        return res

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cumulative_tgt_words = report_tgt_words = 0
    cumulative_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    os.makedirs(args.save_path, exist_ok=True)

    # The supervised source->target model is the one that is logged and
    # validated.
    model = s2t_model

    while True:
        epoch += 1

        for src_sents, tgt_sents, src_mono_sents, tgt_mono_sents in batch_iter(
                train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            batch_size = len(src_sents)
            srclen = max(map(len, src_sents))
            tar_len = max(map(len, tgt_sents))
            print("SRCLEN {} TARLEN {}".format(srclen, tar_len))

            # Denoising steps on the parallel sentences ...
            train_step(s2s_model, s2s_optimizer, src_sents, src_sents)
            print("finish s2s")
            train_step(t2t_model, t2t_optimizer, tgt_sents, tgt_sents)
            print("finish t2t")
            # ... and with the monolingual sentences as input.
            train_step(s2s_model, s2s_optimizer, src_mono_sents, src_sents)
            print("finish s2s mono")
            train_step(t2t_model, t2t_optimizer, tgt_mono_sents, tgt_sents)
            print("finish t2t mono")

            # Supervised translation steps; only the s2t loss is reported.
            train_step(t2s_model, t2s_optimizer, tgt_sents, src_sents)
            print("finish t2s")
            loss = train_step(model, s2t_optimizer, src_sents, tgt_sents)
            print("finish s2t")

            # Steps on greedy-decoded output; the length ratio of the
            # parallel batch is passed as max_ratio.
            train_step_backtranslate(s2t_model, s2t_optimizer, src_sents,
                                     (tar_len / srclen))
            print("finish s2t back")
            train_step_backtranslate(t2s_model, t2s_optimizer, tgt_sents,
                                     (srclen / tar_len))
            print("finish t2s back")
            train_step_backtranslate(s2t_model, s2t_optimizer,
                                     src_mono_sents, (tar_len / srclen))
            print("finish s2t back mono")
            train_step_backtranslate(t2s_model, t2s_optimizer,
                                     tgt_mono_sents, (srclen / tar_len))
            print("finish t2s back mono")

            os.system("nvidia-smi")

            report_loss += loss
            cum_loss += loss

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cumulative_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cumulative_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
                      (epoch, train_iter, report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words),
                       cumulative_examples,
                       report_tgt_words / (time.time() - train_time),
                       time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # Validation on the dev set.  The checkpoint is currently saved
            # after EVERY validation; the schedule sketched in the disabled
            # code below (keep the best model, allow `--patience` bad
            # validations, then reload/halve the learning rate up to
            # `--max-num-trial` times) is NOT active.
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cumulative_examples,
                       np.exp(cum_loss / cumulative_tgt_words),
                       cumulative_examples),
                    file=sys.stderr)

                cum_loss = cumulative_examples = cumulative_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = model.evaluate_ppl(
                    dev_data, batch_size=args.batch_size
                )  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)
                save_model()

                # TODO: disabled early-stopping / LR-decay schedule.
                # if is_better:
                #     patience = 0
                #     print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                #     model.save(model_save_path)
                #     # You may also save the optimizer's state
                # elif patience < int(args.patience):
                #     patience += 1
                #     print('hit patience %d' % patience, file=sys.stderr)
                #     if patience == int(args.patience):
                #         num_trial += 1
                #         print('hit #%d trial' % num_trial, file=sys.stderr)
                #         if num_trial == int(args.max_num_trail):
                #             print('early stop!', file=sys.stderr)
                #             exit(0)
                #         # decay learning rate, and restore from previously best checkpoint
                #         lr = lr * float(args.lr_decay)
                #         print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)
                #         # load model
                #         model_save_path
                #         print('restore parameters of the optimizers', file=sys.stderr)
                #         # You may also need to load the state of the optimizer saved before
                #         # reset patience
                #         patience = 0

                if epoch == int(args.max_epoch):
                    print('reached maximum number of epochs!',
                          file=sys.stderr)
                    exit(0)
# Similarity of "love" / "爱" before alignment.
en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]
# going to print 0.0004326613965749648
print(FastVector.cosine_similarity(en_vector, zh_vector))

# Strings present in both vocabularies are paired with themselves and used
# as the bilingual dictionary.
zh_words = set(zh_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())
overlap = list(zh_words.intersection(en_words))
bilingual_dictionary = list(zip(overlap, overlap))

# Build the training matrices (English is the source, Chinese the target).
source_matrix, target_matrix = make_training_matrices(
    en_dictionary,
    zh_dictionary,
    bilingual_dictionary,
)

# Learn the transformation and apply it to the English vectors only.
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)

# Similarity of the same pair after alignment.
en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]
# going to print 0.18727020978991674
print(FastVector.cosine_similarity(en_vector, zh_vector))

# Export the aligned English vectors as text, then re-load that file with
# gluonnlp and serialize it.
en_dictionary.export("cc.en.aligned.to.zh.vec")
embedding = gluonnlp.embedding.FastText.from_file(
    'cc.en.aligned.to.zh.vec')
embedding.serialize('cc.en.300.aligned.to.zh.vec.npz')
# Single-argument print(...) calls behave identically under Python 2 (the
# dialect this script was written in) and also parse under Python 3.
print("Readling Dictionary")
# Read the bilingual dictionary; the context manager closes the file handle.
with codecs.open("o.s2t_f", "r") as dict_file:
    BI_DICT = parse_BI(dict_file.readlines())
print("Readling Dictionary (END)")

# Probe pair used to compare the two spaces before and after alignment.
# SRC_WORD = "昨天"
# TGT_WORD = "yesterday"
SRC_WORD = "钥匙"
TGT_WORD = "keys"

en_dictionary = FastVector(vector_file='en.emb.orig.vec')
other_dictionary = FastVector(vector_file='tizh.emb.orig.vec')

# Baseline check, before any transformation is applied.
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

# form the training matrices
print("Learning SVD")
source_matrix, target_matrix = make_training_matrices(other_dictionary,
                                                      en_dictionary, BI_DICT)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
other_dictionary.apply_transform(transform)

# zh
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

# ti (label as used by the vector file name; the probe word below is written
# in Thai script)
SRC_WORD = "กุญ"
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

print("Writing transform Qe out")
other_dictionary.export("Qe")
if __name__ == "__main__":
    # Load the pickled English/French vocabularies, truncated to the first
    # 100 entries each.  (`data_dir` replaces the former `dir`, which
    # shadowed the builtin of the same name.)
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    data_dir = os.path.join(os.getcwd(), "expcode", "numerical_code")
    with open(os.path.join(data_dir, 'english_vocab.pkl'),
              'rb') as vocab_file:
        en_corpus = pickle.load(
            vocab_file)[:100]  # CHANGE THIS WHEN WE HAVE DB
    with open(os.path.join(data_dir, 'french_vocab.pkl'),
              'rb') as vocab_file:
        fr_corpus = pickle.load(
            vocab_file)[:100]  # CHANGE THIS WHEN WE HAVE DB

    # Load the fastText vectors and apply each language's alignment matrix.
    # TODO: these absolute paths are specific to one machine.
    en_dict = FastVector(
        vector_file='/Users/williamst-arnaud/Downloads/cc.en.300.vec')
    fr_dict = FastVector(
        vector_file='/Users/williamst-arnaud/Downloads/cc.fr.300.vec')
    en_dict.apply_transform(
        '/Users/williamst-arnaud/Downloads/fastText_multilingual-master/alignment_matrices/en.txt'
    )
    fr_dict.apply_transform(
        '/Users/williamst-arnaud/Downloads/fastText_multilingual-master/alignment_matrices/fr.txt'
    )

    # number of items in dataset
    n = len(en_corpus)

    # Train and print the elapsed wall-clock time in seconds.
    start = time.time()
    w, l, losses = train(en_corpus,
                         fr_corpus,
                         2 * 300,
                         lamb=1. / n,
                         nb_epochs=5 * n)
    print(time.time() - start)