def main(self):
    self.model = {}
    # Loading fastText
    self.model['eng'] = FastVector(vector_file='/Users/arai9814/model/wiki.en.vec')
    self.model['jpn'] = FastVector(vector_file='/Users/arai9814/model/wiki.ja.vec')
    self.model['fra'] = FastVector(vector_file='/Users/arai9814/model/wiki.fr.vec')

    # Transform multi-lingual vector to same vector space
    self.model['eng'].apply_transform('alignment_matrices/en.txt')
    self.model['jpn'].apply_transform('alignment_matrices/ja.txt')
    self.model['fra'].apply_transform('alignment_matrices/fr.txt')

    ver = wn.get_version()
    print("RESOURCE: WN " + str(ver) + "\n")
    print("LANGUAGE: " + str(self.langs) + "\n")
    print("VECTORS: " + self.folder + "\n")
    print("TARGET: " + self.folder + "\n")

    self.extractWordsAndSynsets(self.folder + "words.txt",
                                self.folder + "synsets.txt",
                                self.folder + "lemmas.txt")
    self.extractSynsetRelations(self.folder + "hypernym.txt", '@')
    self.extractSynsetRelations(self.folder + "similar.txt", '&')
    self.extractSynsetRelations(self.folder + "verbGroup.txt", '$')
    self.extractSynsetRelations(self.folder + "antonym.txt", '!')
    print("DONE")

def main():
    ja_dic = FastVector(vector_file='../vecmap/data/wiki.ja.vec')
    en_dic = FastVector(vector_file='../vecmap/data/wiki.en.vec')
    print("loaded the dictionaries")

    ja_dic.apply_transform('alignment_matrices/ja.txt')
    en_dic.apply_transform('alignment_matrices/en.txt')
    print("transformed the dictionaries")

    idx = 0
    result = {}
    result_f = open("en_ja_multifast.txt", "w")
    en_word_list = list(en_dic.word2id.keys())
    print("The total length of English pretrained vector : " + str(len(en_word_list)))

    for en_word in tqdm(en_word_list):
        ja_words = ja_dic.translate_k_nearest_neighbour(en_dic[en_word], k=15)
        result[en_word] = ja_words
        idx += 1
        result_str = ",".join(result[en_word])
        result_f.write(str(idx) + "," + en_word + "," + result_str + "\n")
        # stop after the first 5000 words; break (rather than exit()) so the file gets closed
        if idx > 5000:
            break
    result_f.close()

def ground_truth(en_sent, fr_sent):
    """
    Extracts the ground-truth alignment for a pair of sentences in English and French.
    :param en_sent: The sentence in English
    :param fr_sent: The sentence in French
    :return: The flattened ground-truth alignment matrix
    """
    # keys = set(fr_sent)
    # score matrix
    score = np.empty([len(en_sent), len(fr_sent)], dtype=np.float32)
    # label matrix
    truth = np.zeros([len(en_sent), len(fr_sent)], dtype=np.float32)

    # compute the pairwise cosine similarities
    for j in range(len(en_sent)):
        for k in range(len(fr_sent)):
            score[j, k] = FastVector.cosine_similarity(en_dict[en_sent[j]],
                                                       fr_dict[fr_sent[k]])

    # find the ground truth: each English word is aligned to its most similar French word
    for j in range(len(en_sent)):
        argmax = int(score[j].argmax())
        truth[j, argmax] = 1.

    return truth.reshape(-1)

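# A small, hypothetical usage sketch for ground_truth above, assuming en_dict and
# fr_dict are aligned FastVector instances that contain these tokens; it only
# illustrates the expected input/output shape.
en_sent = ["the", "cat", "sleeps"]
fr_sent = ["le", "chat", "dort"]
labels = ground_truth(en_sent, fr_sent)
# labels is a flat vector of length len(en_sent) * len(fr_sent), one 1 per English word
print(labels.shape)
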
def load_data():
    global skt_dictionary, wdict_skt, skt_words, skt_stop
    print("Loading data")
    skt_dictionary = FastVector(vector_file='../data/skt_vectors.vec')
    wdict_skt = read_weight_dictionary("../data/word_count_skt.txt")
    skt_words = set(skt_dictionary.word2id.keys())
    skt_stop = read_stopwords(
        "/home/basti/deeplearning/bilingual/skt2tib/data/skt_stop.txt")

def loadfasttextmodel(filename):
    filename = '/home/ahmad/fastText_multilingual/'
    w2v = dict()
    # ['en','es','zh','hr','de','fa','ar','fr']  ['es','en','de']
    for lng in ['en', 'es', 'de', 'fa', 'ar', 'fr']:
        w2v[lng] = FastVector(vector_file=filename + 'wiki.' + lng + '.vec')
        w2v[lng].apply_transform(filename + 'alignment_matrices/' + lng + '.txt')
    return w2v

def cached_load_vecs(filename):
    if os.path.isfile(filename + '.pickle'):
        return pickle.load(open(filename + '.pickle', 'rb'))
    else:
        print(' slow read for', filename)
        vecs = FastVector(vector_file=filename)
        print(' caching pickle for', filename)
        try:
            # note the '.' so the cache file matches the name checked above
            pickle.dump(vecs, open(filename + '.pickle', 'wb'))
        except:
            print(' ..failed')
        return vecs

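# A minimal usage sketch for the caching helper above; the vector path is hypothetical.
# The first call parses the .vec file and writes "<file>.pickle" next to it, so later
# calls can load the pickle instead of re-parsing the text file.
vecs = cached_load_vecs('wiki.en.vec')
print(len(vecs.word2id), 'words loaded')
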
def loadmultilingualw2vmodel(filename):
    filename = '/home/ahmad/fastText_multilingual/'
    w2v = dict()
    w2v['fr'] = FastVector(vector_file=filename + 'wiki.fr.vec')
    w2v['fr'].apply_transform(filename + 'alignment_matrices/fr.txt')
    w2v['en'] = FastVector(vector_file=filename + 'wiki.en.vec')
    w2v['en'].apply_transform(filename + 'alignment_matrices/en.txt')
    w2v['es'] = FastVector(vector_file=filename + 'wiki.es.vec')
    w2v['es'].apply_transform(filename + 'alignment_matrices/es.txt')
    w2v['zh'] = FastVector(vector_file=filename + 'wiki.zh.vec')
    w2v['zh'].apply_transform(filename + 'alignment_matrices/zh.txt')
    w2v['hr'] = FastVector(vector_file=filename + 'wiki.hr.vec')
    w2v['hr'].apply_transform(filename + 'alignment_matrices/hr.txt')
    w2v['de'] = FastVector(vector_file=filename + 'wiki.de.vec')
    w2v['de'].apply_transform(filename + 'alignment_matrices/de.txt')

    # en_vector = w2v['en']["cat"]
    # es_vector = w2v['es']["gato"]
    # print(FastVector.cosine_similarity(es_vector, en_vector))
    return w2v

def loadfasttextmodel(filename):
    filename = '/home/ahmad/fastText_multilingual/'
    w2v = dict()
    # ['en','es','zh','hr','de','fa','ar','fr']
    for lng in ['es', 'en', 'de']:
        w2v[lng] = FastVector(vector_file=filename + 'wiki.' + lng + '.vec')
        w2v[lng].apply_transform(filename + 'alignment_matrices/' + lng + '.txt')

    # en_vector = w2v['en']["cat"]
    # es_vector = w2v['es']["gato"]
    # print(FastVector.cosine_similarity(es_vector, en_vector))
    return w2v

def IRI_encoder(IRIs):
    # loading word vectors
    vec = FastVector(vector_file='thin_2018-11-23_d100_e5.bin.vec')
    wordvector = []
    for IRI in IRIs:
        if IRI in vec.word2id.keys():
            idx = vec.word2id[IRI]
            wordvector.append(vec.embed[idx])
        else:
            print(IRI)
            wordvector.append([0] * 100)
    wordvector = np.array(wordvector)
    return wordvector

def main2():
    for zzz in LANGUAGE_LIST:
        lang = zzz[0]
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang), pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" % (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))

def main():
    # first get the English one
    lang = "en"
    system(
        "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
        % (OUT_DIR, lang, lang), pp=True)
    # en_dict = FastVector(vector_file='%s/wiki.en.vec' % OUT_DIR)
    for zzz in LANGUAGE_LIST:
        lang, fnames = zzz[0], zzz[1]
        printing("Dealing with lang %s." % lang)
        for curf in ["train", "dev", "test"]:
            out_fname = "%s/%s_%s.conllu" % (OUT_DIR, lang, curf)
            fout = zopen(out_fname, "w")
            for fname in fnames:
                last_name = fname.split("-")[-1].lower()
                path_name = "%s/%s/%s_%s-ud-%s.conllu" % (UD2_DIR, fname, lang, last_name, curf)
                if os.path.exists(path_name):
                    with zopen(path_name) as fin:
                        deal_conll_file(fin, fout)
            fout.close()
            # stat
            system('cat %s | grep -E "^$" | wc' % out_fname, pp=True)
            system('cat %s | grep -Ev "^$" | wc' % out_fname, pp=True)
            system(
                "cat %s | grep -Ev '^$' | cut -f 5 -d $'\t'| grep -Ev 'PUNCT|SYM' | wc"
                % out_fname, pp=True)
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang), pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" % (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))

""" # optionally normalize the training vectors if normalize_vectors: source_matrix = normalized(source_matrix) target_matrix = normalized(target_matrix) # perform the SVD product = np.matmul(source_matrix.transpose(), target_matrix) U, s, V = np.linalg.svd(product) # return orthogonal transformation which aligns source language to the target return np.matmul(U, V) # copy embedding files from https://fasttext.cc/docs/en/crawl-vectors.html#models en_dictionary = FastVector(vector_file='cc.en.300.vec') zh_dictionary = FastVector(vector_file='cc.zh.300.vec') en_vector = en_dictionary["love"] zh_vector = zh_dictionary["爱"] # going to print 0.0004326613965749648 print(FastVector.cosine_similarity(en_vector, zh_vector)) zh_words = set(zh_dictionary.word2id.keys()) en_words = set(en_dictionary.word2id.keys()) overlap = list(zh_words & en_words) bilingual_dictionary = [(entry, entry) for entry in overlap] # form the training matrices source_matrix, target_matrix = make_training_matrices(en_dictionary,
def prepare_data(data_raw, labels_raw, params, data_path):
    # get embeddings, prepare data
    print("building dictionary")
    data_dict = Dictionary(data_raw, labels_raw, params.vocab_drop)
    save_data(data_dict.sentences,
              "./trained_embeddings_" + params.name + "/sentences_mod.pickle",
              os.path.join(data_path, 'data_mod.txt'))
    save_data(data_dict.labels,
              "./trained_embeddings_" + params.name + "/labels_mod.pickle",
              os.path.join(data_path, 'labels_mod.txt'))

    sizes = data_dict.sizes
    b1 = sizes[0]
    b2 = sizes[0] + sizes[1]
    b3 = sizes[0] + sizes[1] + sizes[2]

    model_path = "./trained_embeddings_" + params.name
    filename = os.path.join(model_path, "embedding_file.pkl")
    if os.path.exists(filename):
        with open(filename, 'rb') as rf:
            embed_arr = pickle.load(rf)
    else:
        hi_align_dictionary = FastVector(
            vector_file='/home/bidisha/sharmila/wiki.hi.align.vec')
        en_align_dictionary = FastVector(
            vector_file='/home/bidisha/sharmila/wiki.en.align.vec')
        print("loaded the files..")

        embed_arr = np.zeros([data_dict.vocab_size, params.embed_size])
        for i in range(embed_arr.shape[0]):
            print(i)
            if i == 0:
                continue
            elif (i > 0 and i < b1):
                try:
                    embed_arr[i] = en_align_dictionary[data_dict.idx2word[i]]
                    print(str(i), "english")
                except:
                    pass
                    try:
                        embed_arr[i] = hi_align_dictionary[data_dict.idx2word[i]]
                        print(str(i), "hindi")
                    except:
                        embed_arr[i] = hi_align_dictionary["unk"]
                        print(str(i), "unk")
            elif (i >= b1 and i < b2):
                try:
                    embed_arr[i] = en_align_dictionary[data_dict.idx2word[i]]
                    print(str(i), "english")
                except:
                    embed_arr[i] = hi_align_dictionary["unk"]
                    print(str(i), "unk")
            elif (i >= b2 and i < b3):
                try:
                    embed_arr[i] = hi_align_dictionary[data_dict.idx2word[i]]
                    print(str(i), "hindi")
                except:
                    embed_arr[i] = hi_align_dictionary["unk"]
                    print(str(i), "unk")
        print("Embedding created")

        if not os.path.exists(model_path):
            os.makedirs(model_path)
        with open(filename, 'wb') as wf:
            pickle.dump(embed_arr, wf)

    # if params.pre_trained_embed:
    #     w2_vec = train_w2vec(params.input, params.embed_size,
    #                          w2vec_it=5,
    #                          sentences=data_dict.sentences,
    #                          model_path="./trained_embeddings_"+params.name)
    #     embed_arr = np.zeros([data_dict.vocab_size, params.embed_size])
    #     for i in range(embed_arr.shape[0]):
    #         if i == 0:
    #             continue
    #         try:
    #             embed_arr[i] = w2_vec.word_vec(unicode(data_dict.idx2word[i], "utf-8"))
    #             # print(data_dict.idx2word[i])
    #         except:
    #             ax = 2
    #             # embed_arr[i] = w2_vec.word_vec('<unk>')

    data = [[data_dict.word2idx[word]
             for word in sent[:-1]] for sent in data_dict.sentences
            if len(sent) < params.sent_max_size - 2]
    encoder_data = [[data_dict.word2idx[word]
                     for word in sent[1:]] for sent in data_dict.sentences
                    if len(sent) < params.sent_max_size - 2]

    decoder_labels = []
    for sent in data_dict.sentences:
        a = []
        for word in sent[1:]:
            index = data_dict.word2idx[word]
            if (index >= b1 and index < b2):
                a.append(index - b1)
            elif (index >= b2):
                a.append(index - b2)
            else:
                a.append(index)
        decoder_labels.append(a)

    # for i in range(5):
    #     print(encoder_data[i])
    #     print(decoder_labels[i])
    #     print("------------------")
    # exit()

    filename = os.path.join(model_path, "data_dict.pkl")
    with open(filename, 'wb') as wf:
        pickle.dump(data_dict, wf)

    print("----Corpus_Information--- \n "
          "Raw data size: {} sentences \n Vocabulary size {}"
          "\n Limited data size {} sentences \n".format(
              len(data_raw), data_dict.vocab_size, len(data)))
    return data, encoder_data, decoder_labels, embed_arr, data_dict

embeddingsmodel0 = loadtransfasttextmodel('Path To Vectors')

vecten = []
lng = 'en'
for word in embeddingsmodel0[lng].id2word:
    vecten.append(embeddingsmodel0[lng][word])  # .reshape(-1,300)[0]

vectes = []
lng = 'es'
for word in embeddingsmodel0[lng].id2word:
    vectes.append(embeddingsmodel0[lng][word])

lng = 'ar'
vectar = []
embeddingsmodel0[lng] = FastVector(vector_file=filename + 'wiki.' + lng + '.vec')
for word in embeddingsmodel0[lng].id2word:
    vectar.append(embeddingsmodel0[lng][word])

vectar = np.asarray(vectar)
vecten = np.asarray(vecten)
vectes = np.asarray(vectes)

# stanford_ner_path = '/home/ahmad/nltk_data/stanford/stanford-ner.jar'
# os.environ['CLASSPATH'] = stanford_ner_path
# stanford_classifier = "/home/ahmad/nltk_data/stanford/es/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
# stes = StanfordNERTagger(stanford_classifier)
# stanford_classifier = '/home/ahmad/nltk_data/stanford/english.all.3class.distsim.crf.ser.gz'
# sten = StanfordNERTagger(stanford_classifier)
# stanford_classifier = "/home/ahmad/nltk_data/stanford/de/edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz"
# stde = StanfordNERTagger(stanford_classifier)

    source_matrix = normalized(source_matrix)
    target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)


# Now we load the French and Russian word vectors, and evaluate the similarity of
# "chat" and "кот":

# In[2]:

fr_dictionary = FastVector(vector_file='zh_vec.txt')
ru_dictionary = FastVector(vector_file='en_vec.txt')

fr_vector = fr_dictionary["chat"]
ru_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))

# "chat" and "кот" both mean "cat", so they should be highly similar; clearly the
# two word vector spaces are not yet aligned. To align them, we need a bilingual
# dictionary of French and Russian translation pairs. As it happens, this is a
# great opportunity to show you something truly amazing...
#
# Many words appear in the vocabularies of more than one language; words like
# "alberto", "london" and "presse". These words usually mean similar things in
# each language. Therefore we can form a bilingual dictionary, by simply extracting
# every word that appears in both the French and Russian vocabularies.

# In[3]:

ru_words = set(ru_dictionary.word2id.keys())
fr_words = set(fr_dictionary.word2id.keys())
overlap = list(ru_words & fr_words)

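# The cells above stop at the vocabulary overlap. A hedged sketch of how the recipe
# would typically continue from here: treat each shared word as its own translation,
# learn the orthogonal transform, apply it, and re-check the similarity. It assumes
# `make_training_matrices` and `learn_transformation` as excerpted elsewhere in this
# file; it is an illustration, not this notebook's own code.
bilingual_dictionary = [(entry, entry) for entry in overlap]

source_matrix, target_matrix = make_training_matrices(
    fr_dictionary, ru_dictionary, bilingual_dictionary)
transform = learn_transformation(source_matrix, target_matrix)

# align the French vectors to the Russian space and re-check "chat" vs "кот"
fr_dictionary.apply_transform(transform)
print(FastVector.cosine_similarity(fr_dictionary["chat"], ru_dictionary["кот"]))
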
from fasttext import FastVector
import json

ja_dic = FastVector(vector_file='../vecmap/data/wiki.ja.vec')
en_dic = FastVector(vector_file='../vecmap/data/wiki.en.vec')
print("loaded the dictionaries")

ja_dic.apply_transform('alignment_matrices/ja.txt')
en_dic.apply_transform('alignment_matrices/en.txt')
print("transformed the dictionaries")

en_word_list = [
    "cat", "dog", "apple", "car", "train", "school", "student", "teacher"
]
ja_word_list = ["猫", "犬", "りんご", "車", "電車", "学校", "生徒", "先生"]

result_f = open("multi_fast.txt", "w")
result = {}

# 20 nearest English neighbours for each word in ja_word_list
for ja_word in ja_word_list:
    en_words = en_dic.translate_k_nearest_neighbour(ja_dic[ja_word], k=20)
    result[ja_word] = en_words
    result_str = ",".join(result[ja_word])
    result_f.write(ja_word + "," + result_str + "\n")

# 20 nearest Japanese neighbours for each word in en_word_list
for en_word in en_word_list:
    ja_words = ja_dic.translate_k_nearest_neighbour(en_dic[en_word], k=20)
    result[en_word] = ja_words
    result_str = ",".join(result[en_word])
    result_f.write(en_word + "," + result_str + "\n")

result_f.close()

    else:
        doc_emb = np.zeros([len(df), 300])
        doc = df['data'].apply(lambda x: x.lower().split())
        for idx, sent in enumerate(doc):
            for word in sent:
                if word in lang_vec.word2id.keys():
                    doc_emb[idx] += lang_vec[word]
    return doc_emb, tfidfvec


if __name__ == '__main__':
    args = parse_args()

    print('loading vectors')
    en_dictionary = FastVector(vector_file=args.en_embedding)
    fr_dictionary = FastVector(vector_file=args.fr_embedding)

    # print('transforming vectors')
    # fr_dictionary.apply_transform('alignment_matrices/fr.txt')

    # print('CCA...')
    # en_fr = read_dictionary(args.embedding_path+'en_fr.txt')
    # en_dictionary.embed, fr_dictionary.embed = cca(en_dictionary, fr_dictionary, en_fr, dim=250)

    print("Hello score:",
          FastVector.cosine_similarity(en_dictionary["hello"],
                                       fr_dictionary["bonjour"]))
    print('processing data')

synonyms_file = args.synonyms
lang = args.lang
skip_lang = args.skip_lang
out_folder = args.output
pairs_file = args.test

### Create synonyms dictionary
synonyms_dict = dict()
syn_file = open(synonyms_file, "r")
lines = syn_file.readlines()
for line in lines:
    line = re.sub(r'\n', '', line)
    w1, w2 = line.split('\t')
    synonyms_dict[w1] = w2.split(',')

fr_dictionary = FastVector(vector_file=args.lang_vec)
it_dictionary = FastVector(vector_file=args.lang_p_vec)

# Start
out_file_name = out_folder + "/skip" + skip_lang + "_" + lang + ".txt"
if not os.path.exists(out_folder):
    os.makedirs(out_folder)
out_file = codecs.open(out_file_name, 'w', "utf-8")

vec_file_name = re.sub(r'\.txt', '', out_file_name)
vec_file_name = vec_file_name + "_vec.txt"
vec_file = codecs.open(vec_file_name, 'w', "utf-8")

def test_word(en_dictionary, other_dictionary, SRC_WORD, TGT_WORD):
    print("Testing WORD[%s->%s]" % (SRC_WORD, TGT_WORD))
    en_vector = en_dictionary[SRC_WORD]
    other_vector = other_dictionary[TGT_WORD]
    print(FastVector.cosine_similarity(en_vector, other_vector))

f = codecs.open("Qe", "w")
for word in en_dict:
    pro = " ".join(en_dict[word])
    f.write(word + " " + pro + '\n')

print("Reading Dictionary")
BI_DICT = codecs.open("o.s2t_f", "r").readlines()
BI_DICT = parse_BI(BI_DICT)
print("Reading Dictionary (END)")

# SRC_WORD = "昨天"
# TGT_WORD = "yesterday"
SRC_WORD = "钥匙"
TGT_WORD = "keys"

en_dictionary = FastVector(vector_file='en.emb.orig.vec')
other_dictionary = FastVector(vector_file='tizh.emb.orig.vec')
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

# form the training matrices
print("Learning SVD")
source_matrix, target_matrix = make_training_matrices(other_dictionary,
                                                      en_dictionary, BI_DICT)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
other_dictionary.apply_transform(transform)  # zh
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)  # ti

# coding=utf-8
from fasttext import FastVector

fr_dictionary = FastVector(vector_file='wiki.en.vec')
fr_dictionary.export('fr.vec.txt')

        losses.append(gap)

    return w, l, losses


if __name__ == "__main__":
    # load the datasets and perform split into training and test set
    dir = os.path.join(os.getcwd(), "expcode", "numerical_code")
    en_corpus = pickle.load(open(os.path.join(dir, 'english_vocab.pkl'), 'rb'))[:100]  # CHANGE THIS WHEN WE HAVE DB
    fr_corpus = pickle.load(open(os.path.join(dir, 'french_vocab.pkl'), 'rb'))[:100]  # CHANGE THIS WHEN WE HAVE DB

    # load the counts and co-occurences
    en_dict = FastVector(
        vector_file='/Users/williamst-arnaud/Downloads/cc.en.300.vec')
    fr_dict = FastVector(
        vector_file='/Users/williamst-arnaud/Downloads/cc.fr.300.vec')
    en_dict.apply_transform(
        '/Users/williamst-arnaud/Downloads/fastText_multilingual-master/alignment_matrices/en.txt')
    fr_dict.apply_transform(
        '/Users/williamst-arnaud/Downloads/fastText_multilingual-master/alignment_matrices/fr.txt')

    # number of items in dataset
    n = len(en_corpus)

    start = time.time()
    w, l, losses = train(en_corpus,

    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)


lang1_dictionary = FastVector(vector_file=args.lang1)
lang2_dictionary = FastVector(vector_file=args.lang2)

bilingual_dictionary = []
file_object = open(args.dict, "r")
lines = file_object.readlines()
for line in lines:
    line = re.sub(r'\n', '', line)
    w_lang2, w_lang1 = line.split('\t')
    if w_lang1 in lang1_dictionary.word2id.keys() and w_lang2 in lang2_dictionary.word2id.keys():
        bilingual_dictionary.append(tuple((w_lang2, w_lang1)))
print("Dic Size: " + str(len(bilingual_dictionary)))

# form the training matrices

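# The excerpt above stops right before the training matrices are formed, and, like the
# other alignment excerpts in this file, it relies on helpers whose bodies are not shown.
# A minimal sketch of what `normalized` and `make_training_matrices` are usually expected
# to do (row-normalise a matrix, and stack the vectors of each bilingual pair); this is
# an illustration under those assumptions, not the excerpt's own implementation.
import numpy as np

def normalized(a, axis=-1, order=2):
    # L2-normalise the rows of a numpy array, guarding against all-zero rows
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    # collect the vectors of every (source, target) pair present in both vocabularies
    source_matrix, target_matrix = [], []
    for source, target in bilingual_dictionary:
        if source in source_dictionary.word2id and target in target_dictionary.word2id:
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])
    return np.array(source_matrix), np.array(target_matrix)
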
def train(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tar, source='tgt')
    train_data_src_mono = read_corpus(args.train_src_mono, source='src')
    train_data_tgt_mono = read_corpus(args.train_tar_mono, source='tgt')
    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tar, source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt,
                          train_data_src_mono, train_data_tgt_mono))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args.batch_size)
    valid_niter = int(args.valid_iter)
    log_every = int(args.log_every)
    model_save_path = args.save_path

    "Vocab dict"
    vocab_src = pickle.load(open(args.vocab_src, 'rb'))
    vocab_tar = pickle.load(open(args.vocab_tar, 'rb'))

    "Optimizer params"
    s2s_param = []
    t2t_param = []
    s2t_param = []
    t2s_param = []

    "Embed"
    # pretrained (and fixed), cross-lingual embeddings
    args.embed_size = 300
    from fasttext import FastVector
    src_embed_path = 'embed/' + args.embed_src
    tar_embed_path = 'embed/' + args.embed_tar
    try:
        vectors_src = pickle.load(open(src_embed_path, 'rb'))
    except FileNotFoundError:
        vectors_src = FastVector(vector_file=args.embed_src)
        vectors_src.apply_transform(args.embed_alignment)
        pickle.dump(vectors_src, open(src_embed_path, 'wb+'))
    try:
        vectors_tar = pickle.load(open(tar_embed_path, 'rb'))
    except FileNotFoundError:
        vectors_tar = FastVector(vector_file=args.embed_tar)  # tar is en, no alignment required
        pickle.dump(vectors_tar, open(tar_embed_path, 'wb+'))

    src2embed = lambda word: torch.FloatTensor(vectors_src[word]) if word in vectors_src else torch.zeros(300)
    tar2embed = lambda word: torch.FloatTensor(vectors_tar[word]) if word in vectors_tar else torch.zeros(300)

    embedder_src = Embedder(
        vocab_src.dict_size(), args.embed_size,
        nn.Embedding.from_pretrained(torch.stack([
            src2embed(word.lower() if word is not None else word)
            for word in vocab_src.id2word
        ], dim=0), freeze=True))
    embedder_tar = Embedder(
        vocab_tar.dict_size(), args.embed_size,
        nn.Embedding.from_pretrained(torch.stack([
            tar2embed(word.lower() if word is not None else word)
            for word in vocab_tar.id2word
        ], dim=0), freeze=True))

    "Generator"
    gen_src = EmbeddingGenerator(args.hidden_size, args.embed_size).cuda()
    gen_src_wrapper = WrapperEmbeddingGenerator(gen_src, embedder_src).cuda()
    gen_tar = EmbeddingGenerator(args.hidden_size, args.embed_size).cuda()
    gen_tar_wrapper = WrapperEmbeddingGenerator(gen_tar, embedder_tar).cuda()
    if args.gen_src != "":
        gen_src_wrapper.load_weight(args.gen_src)
    else:
        [s2s_param, s2t_param, t2s_param] = add_to_optimizer(
            gen_src_wrapper, [s2s_param, s2t_param, t2s_param])
    if args.gen_tar != "":
        gen_tar_wrapper.load_weight(args.gen_tar)
    else:
        [s2t_param, t2s_param, t2t_param] = add_to_optimizer(
            gen_tar_wrapper, [s2t_param, t2s_param, t2t_param])
    if args.multi_gpu:
        gen_src_wrapper = nn.DataParallel(gen_src_wrapper, device_ids=[0, 1])
        gen_tar_wrapper = nn.DataParallel(gen_tar_wrapper, device_ids=[0, 1])

    "encoder"
    # shared encoder
    encoder = GRUEncoder(args.embed_size, args.hidden_size,
                         bidirectional=args.encoder_bidir,
                         layers=args.encoder_layer,
                         dropout=args.dropout).cuda()
    if args.multi_gpu:
        encoder = nn.DataParallel(encoder, device_ids=[0, 1])
    [s2s_param, s2t_param, t2s_param, t2t_param] = add_to_optimizer(
        encoder, [s2s_param, s2t_param, t2s_param, t2t_param])

    "Decoder"
    decoder_src = AttentionDecoder(args.embed_size, args.hidden_size, 1,
                                   args.dropout, input_feed=True).cuda()
    decoder_tar = AttentionDecoder(args.embed_size, args.hidden_size, 1,
                                   args.dropout, input_feed=True).cuda()
    if args.multi_gpu:
        decoder_src = nn.DataParallel(decoder_src, device_ids=[0, 1])
        decoder_tar = nn.DataParallel(decoder_tar, device_ids=[0, 1])
    [s2s_param, s2t_param, t2s_param] = add_to_optimizer(
        decoder_src, [s2s_param, s2t_param, t2s_param])
    [s2t_param, t2s_param, t2t_param] = add_to_optimizer(
        decoder_tar, [s2t_param, t2s_param, t2t_param])

    "Translators"
    s2s_model = MT(vocab_src, vocab_src, embedder_src, embedder_src,
                   gen_src_wrapper, encoder, decoder_src,
                   denoising=True, multi_gpu=args.multi_gpu)
    t2t_model = MT(vocab_tar, vocab_tar, embedder_tar, embedder_tar,
                   gen_tar_wrapper, encoder, decoder_tar,
                   denoising=True, multi_gpu=args.multi_gpu)
    s2t_model = MT(vocab_src, vocab_tar, embedder_src, embedder_tar,
                   gen_tar_wrapper, encoder, decoder_tar,
                   denoising=False, multi_gpu=args.multi_gpu)
    t2s_model = MT(vocab_tar, vocab_src, embedder_tar, embedder_src,
                   gen_src_wrapper, encoder, decoder_src,
                   denoising=False, multi_gpu=args.multi_gpu)

    "optimizers"
    s2s_optimizer = torch.optim.Adam(s2s_param, lr=args.lr)
    t2t_optimizer = torch.optim.Adam(t2t_param, lr=args.lr)
    s2t_optimizer = torch.optim.Adam(s2t_param, lr=args.lr)
    t2s_optimizer = torch.optim.Adam(t2s_param, lr=args.lr)

    def save_model():
        # save embedder
        if args.embed_src == "":
            embedder_src.save_weight(args.save_path + "/embed_src.bin")
        if args.embed_tar == "":
            embedder_tar.save_weight(args.save_path + "/embed_tar.bin")
        # save generator
        if args.gen_src == "":
            gen_src_wrapper.save_weight(args.save_path + "/gen_src.bin")
        if args.gen_tar == "":
            gen_tar_wrapper.save_weight(args.save_path + "/gen_tar.bin")
        # save encoder
        encoder.save_weight(args.save_path + "/encoder.bin")
        # save decoder
        decoder_src.save_weight(args.save_path + "/decoder_src.bin")
        decoder_tar.save_weight(args.save_path + "/decoder_tar.bin")
        # save optimizer
        print("all models saved")

    def train_step(mt, optimizer, src_sents, tar_sents):
        optimizer.zero_grad()
        loss = mt.get_loss(src_sents, tar_sents, train=True)
        loss += loss.data[0]
        res = loss.cpu().detach().item()
        loss.div(args.batch_size).backward()
        optimizer.step()
        return res

    def train_step_backtranslate(mt, optimizer, src_sents, max_ratio):
        tar_sents = mt.greedy(src_sents, max_ratio, mode=False)
        res = train_step(mt, optimizer, src_sents, tar_sents)
        return res

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cumulative_tgt_words = report_tgt_words = 0
    cumulative_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    while True:
        epoch += 1
        for src_sents, tgt_sents, src_mono_sents, tgt_mono_sents in batch_iter(
                train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1
            batch_size = len(src_sents)
            srclen = max(map(len, src_sents))
            tar_len = max(map(len, tgt_sents))
            print("SRCLEN {} TARLEN {}".format(srclen, tar_len))

            model = s2t_model
            # (batch_size)
            train_step(s2s_model, s2s_optimizer, src_sents, src_sents)
            print("finish s2s")
            train_step(t2t_model, t2t_optimizer, tgt_sents, tgt_sents)
            print("finish t2t")
            train_step(s2s_model, s2s_optimizer, src_mono_sents, src_sents)
            print("finish s2s mono")
            train_step(t2t_model, t2t_optimizer, tgt_mono_sents, tgt_sents)
            print("finish t2t mono")
            train_step(t2s_model, t2s_optimizer, tgt_sents, src_sents)
            print("finish t2s")
            loss = train_step(model, s2t_optimizer, src_sents, tgt_sents)
            print("finish s2t")
            train_step_backtranslate(s2t_model, s2t_optimizer, src_sents,
                                     (tar_len / srclen))
            print("finish s2t back")
            train_step_backtranslate(t2s_model, t2s_optimizer, tgt_sents,
                                     (srclen / tar_len))
            print("finish t2s back")
            train_step_backtranslate(s2t_model, s2t_optimizer, src_mono_sents,
                                     (tar_len / srclen))
            print("finish s2t back mono")
            train_step_backtranslate(t2s_model, t2s_optimizer, tgt_mono_sents,
                                     (srclen / tar_len))
            print("finish t2s back mono")
            os.system("nvidia-smi")

            report_loss += loss
            cum_loss += loss

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cumulative_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cumulative_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
                      (epoch, train_iter,
                       report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words),
                       cumulative_examples,
                       report_tgt_words / (time.time() - train_time),
                       time.time() - begin_time), file=sys.stderr)
                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # the following code performs validation on dev set, and controls the learning schedule
            # if the dev score is better than the last check point, then the current model is saved.
            # otherwise, we allow for that performance degeneration for up to `--patience` times;
            # if the dev score does not increase after `--patience` iterations, we reload the previously
            # saved best model (and the state of the optimizer), halve the learning rate and continue
            # training. This repeats for up to `--max-num-trial` times.
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' %
                      (epoch, train_iter,
                       cum_loss / cumulative_examples,
                       np.exp(cum_loss / cumulative_tgt_words),
                       cumulative_examples), file=sys.stderr)

                cum_loss = cumulative_examples = cumulative_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = model.evaluate_ppl(dev_data, batch_size=args.batch_size)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                save_model()

                # if is_better:
                #     patience = 0
                #     print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                #     model.save(model_save_path)
                #     # You may also save the optimizer's state
                # elif patience < int(args.patience):
                #     patience += 1
                #     print('hit patience %d' % patience, file=sys.stderr)
                #     if patience == int(args.patience):
                #         num_trial += 1
                #         print('hit #%d trial' % num_trial, file=sys.stderr)
                #         if num_trial == int(args.max_num_trail):
                #             print('early stop!', file=sys.stderr)
                #             exit(0)
                #         # decay learning rate, and restore from previously best checkpoint
                #         lr = lr * float(args.lr_decay)
                #         print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)
                #         # load model
                #         model_save_path
                #         print('restore parameters of the optimizers', file=sys.stderr)
                #         # You may also need to load the state of the optimizer saved before
                #         # reset patience
                #         patience = 0

                if epoch == int(args.max_epoch):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)

def main():
    vec = FastVector('thin_2018-11-23_d100_e5.bin.vec')
    iris = vec.word2id.keys()
    cond2vec = MedicalCond2Vector(iris)
    cond2vec.save_model('thin_to_word2vec.bin.vec')
    print(cond2vec.vector)

def load_embeddings_dict(self, language):
    vector_file = path.join(self.embs_dir + language, language + '.vec')
    dictionary = FastVector(vector_file=vector_file)
    return dictionary