import io
import os

import numpy as np


def load_bin_embeddings(args, source):
    """
    Reload pretrained embeddings from a fastText binary file.
    """
    # load fastText binary file
    lang = args.src_lang if source else args.tgt_lang
    # remove stop words out of these top words
    max_vocab = args.vocab_size
    model = load_fasttext_model(args.src_emb_path if source else args.tgt_emb_path)
    words, freqs = model.get_labels(include_freq=True)
    assert model.get_dimension() == args.emb_dim
    print("Loaded binary model. Generating embeddings ...")
    embeddings = np.concatenate([model.get_word_vector(w)[None] for w in words], 0)
    print("Generated embeddings for %i words." % len(words))
    assert embeddings.shape == (len(words), args.emb_dim)

    # select a subset of word embeddings (to deal with casing)
    # stop words might have been removed from freqs and train_indexes
    word2id, indexes, freqs = select_subset(words, max_vocab, freqs)
    embeddings = embeddings[indexes]
    id2word = {i: w for w, i in word2id.items()}
    dico = Dictionary(id2word, word2id, lang)

    assert embeddings.shape == (len(dico), args.emb_dim)
    print("Number of words in {} = {}".format(lang, len(dico)))
    return embeddings, dico
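# The helper `load_fasttext_model` is not defined in this file. A minimal sketch,
# assuming the official `fasttext` Python package, which exposes `load_model`,
# `get_labels(include_freq=True)`, `get_dimension()` and `get_word_vector()`;
# the actual helper used here may differ.
def load_fasttext_model(path):
    """Load a fastText .bin model from disk (sketch)."""
    import fasttext
    return fasttext.load_model(path)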
def count_vocab(data_file, max_vcb_size):
    vocab = Dictionary()
    with open(data_file, 'r') as f:
        for sent in f.readlines():
            sent = sent.strip()
            for word in sent.split():
                vocab.add(word)
    # vocab.write_into_file('all.vocab')

    words_cnt = sum(vocab.freq.values())
    new_vocab, new_words_cnt = vocab.keep_vocab_size(max_vcb_size)
    wlog('|Final vocabulary| / |Original vocabulary| = {} / {} = {:4.2f}%'
         .format(new_words_cnt, words_cnt, (new_words_cnt / words_cnt) * 100))

    return new_vocab
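# `wlog` is an external logging helper used throughout these functions. A minimal
# stand-in, assuming it simply writes a message and flushes immediately; the real
# helper may add timestamps or write to a log file.
import sys


def wlog(obj, newline=True):
    """Write a log message to stdout and flush (sketch)."""
    sys.stdout.write('{}{}'.format(obj, '\n' if newline else ''))
    sys.stdout.flush()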
def read_txt_embeddings(args, source, full_vocab):
    """
    Reload pretrained embeddings from a text file.
    """
    word2id = {}
    vectors = []

    # load pretrained embeddings
    lang = args.src_lang if source else args.tgt_lang
    emb_path = args.src_emb_path if source else args.tgt_emb_path
    _emb_dim_file = args.emb_dim
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                split = line.split()
                assert len(split) == 2
                assert _emb_dim_file == int(split[1])
            else:
                word, vect = line.rstrip().split(' ', 1)
                if not full_vocab:
                    word = word.lower()
                vect = np.fromstring(vect, sep=' ')
                if np.linalg.norm(vect) == 0:  # avoid having null embeddings
                    vect[0] = 0.01
                if word in word2id:
                    if full_vocab:
                        print("Word '%s' found twice in %s embedding file"
                              % (word, 'source' if source else 'target'))
                else:
                    if not vect.shape == (_emb_dim_file,):
                        print("Invalid dimension (%i) for %s word '%s' in line %i."
                              % (vect.shape[0], 'source' if source else 'target', word, i))
                        continue
                    assert vect.shape == (_emb_dim_file,), i
                    word2id[word] = len(word2id)
                    vectors.append(vect[None])
            if args.max_vocab > 0 and len(word2id) >= args.max_vocab and not full_vocab:
                break

    assert len(word2id) == len(vectors)
    print("Loaded %i pre-trained word embeddings." % len(vectors))

    # compute new vocabulary / embeddings
    id2word = {v: k for k, v in word2id.items()}
    dico = Dictionary(id2word, word2id, lang)
    embeddings = np.concatenate(vectors, 0)
    # embeddings = torch.from_numpy(embeddings).float()

    assert embeddings.shape == (len(dico), args.emb_dim)
    return dico, embeddings
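# Example usage of `read_txt_embeddings` (the paths and argument values below are
# hypothetical, shown only to illustrate the expected word2vec text format: a
# "<count> <dim>" header line followed by one "<word> <v1> ... <v_dim>" line per word):
#
#     from types import SimpleNamespace
#     args = SimpleNamespace(src_lang='en', tgt_lang='de',
#                            src_emb_path='wiki.en.vec', tgt_emb_path='wiki.de.vec',
#                            emb_dim=300, max_vocab=200000)
#     dico, embeddings = read_txt_embeddings(args, source=True, full_vocab=False)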
def load_embeddings(args, source: bool):
    """
    Reload pretrained embeddings from a fastText binary file or word2vec file.
    """
    lang = args.src_lang if source else args.tgt_lang
    max_vocab = args.max_vocab
    emb_path = args.src_emb_path if source else args.tgt_emb_path

    if emb_path.endswith('.bin'):
        model = load_fasttext_model(emb_path)
        words, freqs = model.get_labels(include_freq=True)
        assert model.get_dimension() == args.emb_dim
        print("Loaded binary model. Generating embeddings ...")
        embeddings = np.concatenate(
            [model.get_word_vector(w)[None] for w in words], 0)
    elif emb_path.endswith(('.vec', '.vec.gz')):
        words, embeddings = read_txt_embeddings(args, source, True)
        freqs = load_txt_counts(args, source, words)
    else:
        raise Exception('Unknown embeddings file format: "%s"' % emb_path)

    print("Generated embeddings for %i words." % len(words))
    assert embeddings.shape == (len(words), args.emb_dim)

    # select a subset of word embeddings (to deal with casing)
    # stop words might have been removed from freqs and train_indexes
    word2id, indexes, freqs = select_subset(words, max_vocab, freqs, lang=lang)

    word_dist = None
    if 'smooth_c' in args:
        # smooth the frequency
        word_dist = cal_empiral_freqs(np.array(freqs), args.smooth_c)
        # remove stop words out of these top words
        if 'src_train_most_frequent' in args and 'tgt_train_most_frequent' in args:
            mf = args.src_train_most_frequent if source else args.tgt_train_most_frequent
            if mf > 0:
                word_dist = word_dist[:mf] / word_dist[:mf].sum()

    embeddings = embeddings[indexes]
    id2word = {i: w for w, i in word2id.items()}
    # create the dictionary
    dico = Dictionary(id2word, word2id, lang, word_dist)

    assert embeddings.shape == (len(dico), args.emb_dim)
    print(f"Number of words in {lang} = {len(dico)}")
    if 'smooth_c' in args:
        print("Max frequency = %.7f, min frequency = %.7f"
              % (max(word_dist), min(word_dist)))
    return dico, embeddings, word_dist
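# `cal_empiral_freqs` is not shown in this file. A plausible sketch, assuming the
# usual smoothed unigram distribution p_i proportional to count_i ** smooth_c,
# renormalised to sum to 1; the actual smoothing used here may differ.
def cal_empiral_freqs(freqs, smooth_c):
    """Turn raw word counts into a smoothed empirical frequency distribution (sketch)."""
    probs = freqs.astype(np.float64) ** smooth_c
    return probs / probs.sum()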
def count_vocab(data_file, max_vcb_size, max_seq_len=50, char=False):
    assert data_file and os.path.exists(data_file), 'need file to extract vocabulary ...'
    vocab = Dictionary()
    # with open(data_file, 'r') as f:
    with io.open(data_file, encoding='utf-8') as f:
        for sent in f.readlines():
            sent = sent.strip()
            if char is True:
                words = zh_to_chars(sent)
            else:
                words = sent.split()
            if len(words) > max_seq_len:
                continue
            for word in words:
                vocab.add(word)

    words_cnt = sum(vocab.freq.values())
    new_vocab, new_words_cnt = vocab.keep_vocab_size(max_vcb_size)
    wlog('|Final vocabulary| / |Original vocabulary| = {} / {} = {:4.2f}%'
         .format(new_words_cnt, words_cnt, (new_words_cnt / words_cnt) * 100))

    return new_vocab
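# `zh_to_chars` is an external helper for character-level segmentation of Chinese
# text. A minimal sketch, assuming it simply splits a sentence into individual
# non-whitespace characters; the real helper may keep Latin tokens intact.
def zh_to_chars(sent):
    """Split a sentence into a list of single characters, dropping whitespace (sketch)."""
    return [ch for ch in sent if not ch.isspace()]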
def extract_vocab(data_file, vocab_file, max_vcb_size=30000, max_seq_len=50):
    if os.path.exists(vocab_file) is True:
        # if the vocab file already exists, load the word dictionary from it
        wlog('Load dictionary from file {}'.format(vocab_file))
        vocab = Dictionary()
        vocab.load_from_file(vocab_file)
    else:
        vocab = count_vocab(data_file, max_vcb_size, max_seq_len)
        vocab.write_into_file(vocab_file)
        wlog('Save dictionary file into {}'.format(vocab_file))

    return vocab
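# Example usage of `extract_vocab` (hypothetical file names): build or reload a
# 30k-word vocabulary from a tokenised training file, skipping sentences longer
# than 50 tokens, and cache it on disk for later runs.
#
#     vocab = extract_vocab('train.zh', 'train.zh.vocab', max_vcb_size=30000, max_seq_len=50)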
def load_bin_embeddings(args, source: bool):
    """
    Reload pretrained embeddings from a fastText binary file.
    """
    # reload fastText binary file
    lang = args.src_lang if source else args.tgt_lang
    # remove stop words out of these top words
    mf = args.src_train_most_frequent if source else args.tgt_train_most_frequent
    max_vocab = args.max_vocab
    model = load_fasttext_model(
        args.src_emb_path if source else args.tgt_emb_path)
    words, freqs = model.get_labels(include_freq=True)
    assert model.get_dimension() == args.emb_dim
    print("Loaded binary model. Generating embeddings ...")
    embeddings = np.concatenate(
        [model.get_word_vector(w)[None] for w in words], 0)
    print("Generated embeddings for %i words." % len(words))
    assert embeddings.shape == (len(words), args.emb_dim)

    # select a subset of word embeddings (to deal with casing)
    # stop words might have been removed from freqs and train_indexes
    word2id, indexes, freqs = select_subset(words, max_vocab, freqs, lang=lang)
    # smooth the frequency
    word_dist = cal_empiral_freqs(np.array(freqs), args.smooth_c)
    embeddings = embeddings[indexes]
    id2word = {i: w for w, i in word2id.items()}
    if mf > 0:
        word_dist = word_dist[:mf] / word_dist[:mf].sum()
    dico = Dictionary(id2word, word2id, lang, word_dist)

    assert embeddings.shape == (len(dico), args.emb_dim)
    print(f"Number of words in {lang} = {len(dico)} (|word_dist| = {len(word_dist)})")
    print("Max frequency = %.7f, min frequency = %.7f"
          % (max(word_dist), min(word_dist)))
    return dico, embeddings, word_dist
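# `select_subset` is not defined in this file. A rough sketch, assuming it keeps
# the first (most frequent) surface form per lowercased word to deal with casing,
# truncates the result to `max_vocab` entries, and returns the kept indexes and
# frequencies; `lang` is accepted but unused here. The real helper may also drop
# stop words, as the comments above suggest.
def select_subset(words, max_vocab, freqs, lang=None):
    """Return (word2id, kept_indexes, kept_freqs) for a case-deduplicated subset (sketch)."""
    word2id, indexes, kept_freqs = {}, [], []
    seen_lower = set()
    for idx, (word, freq) in enumerate(zip(words, freqs)):
        lower = word.lower()
        if lower in seen_lower:
            continue
        seen_lower.add(lower)
        word2id[word] = len(word2id)
        indexes.append(idx)
        kept_freqs.append(freq)
        if max_vocab > 0 and len(word2id) >= max_vocab:
            break
    return word2id, indexes, kept_freqs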