def embedding(emb_type):
    if emb_type == 'w2v':
        model = KeyedVectors.load_word2vec_format(path_to_w2v)
    if emb_type == 'fasttext':
        model = FastText.load_fasttext_format(path_to_fasttext_emb)
    if emb_type == 'fasttext_2':
        print('loading fasttext embedding...')
        model = FastText.load_fasttext_format(path_to_fasttext_emb_2)
        print('Done!')
    if emb_type == 'fasttext_unlem':
        model = FastText.load_fasttext_format(path_to_fasttext_unlem)
    return model
def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
              neighbors: int = DEFAULT_NEIGHBORS_NUMBER,
              edit_candidates: int = DEFAULT_EDIT_DISTANCE,
              max_distance: int = DEFAULT_MAX_DISTANCE,
              radius: int = DEFAULT_RADIUS,
              max_corrected_length: int = 12) -> None:
    """
    Construct correction candidates generator.

    :param vocabulary_file: Text file used to generate vocabulary of correction candidates. \
                            First token in every line split is added to the vocabulary.
    :param frequencies_file: Path to the text file with frequencies. Each line must be two \
                             values separated with a whitespace: "token count".
    :param embeddings_file: Path to the dump of FastText model.
    :param neighbors: Number of neighbors of context and typo embeddings \
                      to consider as candidates.
    :param edit_candidates: Number of the most frequent tokens among tokens at \
                            equal edit distance from the typo to consider as candidates.
    :param max_distance: Maximum edit distance for symspell lookup for candidates.
    :param radius: Maximum edit distance from the typo allowed for candidates.
    :param max_corrected_length: Maximum length of the prefix in which the symspell \
                                 lookup for typos is conducted.
    """
    self.checker = SymSpell(max_dictionary_edit_distance=max_distance,
                            prefix_length=max_corrected_length)
    self.checker.load_dictionary(vocabulary_file)
    self.wv = FastText.load_fasttext_format(embeddings_file).wv
    self.neighbors_number = neighbors
    self.edit_candidates_number = edit_candidates
    self.max_distance = max_distance
    self.radius = radius
    self.tokens = read_vocabulary(vocabulary_file)
    self.frequencies = read_frequencies(frequencies_file)
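# A minimal usage sketch for the method above. The enclosing class is not shown in
# the snippet, so the name CandidatesGenerator and the file names are hypothetical;
# the keyword arguments mirror the documented parameters.
generator = CandidatesGenerator()
generator.construct(vocabulary_file="vocabulary.csv",
                    frequencies_file="frequencies.csv",
                    embeddings_file="fasttext.bin",
                    neighbors=20,
                    edit_candidates=20,
                    max_distance=2,
                    radius=3)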
def load(self, path, model_type='word2vec'):
    """
    Load a pre-trained word embedding model and save it into embed_words.__embedding.

    Args:
        path (str): relative path to the file containing the pre-trained model.
        model_type (str): type of the model - must be one of the following:
            'word2vec' for '.vec' files or 'fasttext' for '.bin' files.
            (Default = 'word2vec')
    """
    # Code for loading a word2vec model:
    if model_type == 'word2vec':
        self.__model = KeyedVectors.load_word2vec_format(path)
        self.__embedding = self.__model.wv
    # Code for loading a fastText model:
    elif model_type == 'fasttext':
        self.__model = FastText.load_fasttext_format(path)
        self.__embedding = self.__model.wv
    # In case we're trying to load an unsupported model type:
    else:
        raise Exception(
            "Model '{}' not supported (must be 'word2vec' or 'fasttext').".format(model_type)
            + " Cannot load word embedding model.")
def make_word_embedding(word_dict,
                        word_emb_pkl_path=params['default_word_emb_pkl_path'],
                        fasttext_path=params['default_fasttext_path']):
    word_emb = np.zeros([len(word_dict), params['word_emb_dim']])
    if os.path.isfile(word_emb_pkl_path):
        with open(word_emb_pkl_path, 'rb') as f:
            word_emb = pickle.load(f)
        print('Existing trained word embedding loaded')
    else:
        # load fasttext model
        fasttext_model = FastText.load_fasttext_format(fasttext_path, encoding='utf8')
        print('No word_emb pkl file, start making word_emb ...')
        for word, idx in word_dict.items():
            if idx == 0:  # PAD = 0
                continue
            else:
                try:
                    word_emb[idx] = np.asarray(fasttext_model.wv[word])
                except KeyError:
                    # if there is no word vector for a certain word, just assign a random vector
                    word_emb[idx] = np.random.uniform(-0.25, 0.25, params['word_emb_dim'])
        with open(word_emb_pkl_path, 'wb') as f:
            pickle.dump(word_emb, f)
        print('Making word_emb ... Done and Saved')
    return word_emb
def create_data_and_labels_using_fasttext_embeddings(dataset_path, fasttext_model_path,
                                                     embedding_size=300, language='turkish'):
    print("Loading raw sentences and one hot encoded labels")
    sentences, labels = load_data_and_labels(dataset_path)

    print("Finding maximum sentence length in the dataset")
    max_sentence_length = max([len(sentence.split(" ")) for sentence in sentences])

    print("Loading Fasttext model")
    model = FastText.load_fasttext_format(fasttext_model_path)

    sentence_embedding_list = list()
    print("Transform raw sentences to Fasttext embeddings")
    count = 0
    for sentence_idx, sentence in enumerate(sentences):
        tokens = sentence.split(" ")
        sentence_embedding = np.zeros(shape=(max_sentence_length, embedding_size))
        for idx, word in enumerate(tokens):
            try:
                sentence_embedding[idx] = model[word]
            except KeyError:
                print("Sentence:", sentence, "- Error word:", word)
        sentence_embedding_list.append(sentence_embedding)
        if sentence_idx % 100000 == 0:
            outputPath = "D:/PycharmProjects/TextCategorization/FasttextEmbeddingsWithNGrams/TWNERTC_TC_Coarse Grained NER_No_NoiseReduction_" + str(count)
            np.save(outputPath, np.asarray(sentence_embedding_list))
            sentence_embedding_list.clear()
            count += 1
def load_model(self):
    if self.backend == 'spacy':
        if self.trained_vectors == 'en':
            import en_core_web_sm
            self.model = en_core_web_sm.load()
        else:
            self.model = sp.load(self.trained_vectors)
    elif self.backend == 'gensim':
        if self.mode is None:
            self.mode = 'glove'
        if self.mode == 'glove':
            glove2word2vec(self.trained_vectors, self.temp)
            self.model = KeyedVectors.load_word2vec_format(self.temp, binary=self.binary)
            os.remove(self.temp)
        elif self.mode == 'word2vec':
            self.model = KeyedVectors.load_word2vec_format(self.trained_vectors,
                                                           binary=self.binary,
                                                           encoding='latin-1')
        elif self.mode == 'fasttext':
            self.model = FastText.load_fasttext_format(self.trained_vectors, encoding='latin-1')
def read_fasttext(self, file):
    """
    Create an embeddings matrix, in which each row corresponds to the word vector
    from the pretrained word embeddings. If a word is missing, a representation
    is obtained on the fly using fastText's subword n-grams.

    Args:
        file: path to the pretrained fastText model.

    Returns:
        embeddings: numpy array of shape (vocab_size, vector_size).
        missing: list of tokens absent from the pretrained vocabulary.
    """
    model = FastText.load_fasttext_format(file)
    embeddings = numpy.zeros((len(self), model.vector_size))
    missing = []
    for token_id, token in tqdm(self.id2tok.items(),
                                desc="Reading embeddings...",
                                total=len(self.id2tok.items())):
        if token not in model.wv.vocab:
            missing.append(token)
        embeddings[token_id] = model[token]
    print(f"Missing tokens from the pretrained embeddings: {len(missing)}")
    return embeddings, missing
def load_wordvectors(load_vec_file=False):
    """
    Loads and returns fastText word embeddings.

    :param load_vec_file: Bool indicating whether to load the `.vec` or the `.bin` file.
    :return word_embeddings: fastText word embeddings.
    :rtype: map(str): array_type
    """
    if load_vec_file:
        # From https://fasttext.cc/docs/en/english-vectors.html
        fname = './Data/wiki-news-300d-1M.vec'
        fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        n, d = map(int, fin.readline().split())
        word_embeddings = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialize the vector; a bare `map` object would be exhausted after one use.
            word_embeddings[tokens[0]] = list(map(float, tokens[1:]))
    else:
        from gensim.models import FastText
        word_embeddings = FastText.load_fasttext_format('./Data/cc.en.300.bin')
    return word_embeddings
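# A minimal usage sketch for the function above (assumes both files exist under
# ./Data/): note that the two code paths return different types, so words are
# looked up differently.
vec_embeddings = load_wordvectors(load_vec_file=True)   # dict: word -> list of floats
print(vec_embeddings['the'][:5])

bin_embeddings = load_wordvectors(load_vec_file=False)  # gensim FastText model
print(bin_embeddings.wv['the'][:5])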
def __getitem__(self, key):
    if key not in self:
        path = self.wv_path.format(key)
        print("Loading FastText for", key)
        value = ft.load_fasttext_format(path)
        self[key] = value
    return super().__getitem__(key)
def build_emb(self, vectors, vocab, embdim=300, model_path=None):
    if model_path:
        ft = FastText.load_fasttext_format(model_path)
    mat = []
    no_vectors = {}
    embedding_matrix = np.zeros((len(vocab.keys()) + 1, embdim))
    c = 0
    for i, (word, idx) in enumerate(vocab.items()):
        if model_path:
            try:
                vect = ft[word]
            except:
                vect = None
        else:
            vect = vectors.get(word)
        if vect is not None and len(vect) > 0:
            embedding_matrix[i] = vect
            c += 1
        else:
            no_vectors[word] = idx
    print("{} words were found over a vocab of {} which is a ratio of {}"
          .format(c, len(vocab.items()), round(c / len(vocab), 2)))
    return embedding_matrix, no_vectors
def test_get_fasttext_model(self):
    data = pandas.read_csv(str(TEST_DATA_DIR / "prepared_data.csv.xz"),
                           index_col=0, keep_default_na=False)
    with tempfile.TemporaryDirectory(prefix="lookout_typos_fasttext_") as temp_dir:
        config = {"size": 100, "path": os.path.join(temp_dir, "ft.bin"), "dim": 5}
        train_fasttext(data, config)
        model = FastText.load_fasttext_format(config["path"])
        self.assertTupleEqual(model.wv["get"].shape, (5,))
def loadfasttext(embfile):
    """
    Load fastText embeddings.

    :param embfile: path to the fastText model file.
    :return: the loaded gensim FastText model.
    """
    model = fText.load_fasttext_format(embfile)
    return model
def make_embedding(extra_word):
    with open('./data/data/save_data.test.dict.trans.cz', 'r', encoding='utf-8') as f:
        lines_test = f.readlines()
    # w2vModel = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
    fastTextModel = FastText.load_fasttext_format('./wiki.cs.bin')
    vocab_sz = len(lines_test)
    emb_dim = 300
    weight_matrix_test = np.zeros((vocab_sz, 300))
    count = 0
    failed = []
    for i, line in enumerate(lines_test):
        # do not take split()[0] here: this file's format differs from the dict file
        label = line.strip()
        try:
            weight_matrix_test[i] = fastTextModel[label]
        except:
            count += 1
            failed.append(label)
            weight_matrix_test[i] = np.random.normal(size=(emb_dim, ))
    print('failed count {}, {}'.format(count, failed))
    weight_matrix_tgt_np = torch.from_numpy(weight_matrix_test[:-1 * extra_word]).float()
    weight_matrix_test_np = torch.from_numpy(weight_matrix_test).float()

    with open('./data/data/save_data.src.dict', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    vocab_sz = len(lines)
    emb_dim = 300
    weight_matrix_src = np.zeros((vocab_sz, 300))
    count = 0
    failed = []
    for i, line in enumerate(lines):
        label = line.strip().split()[0]
        try:
            weight_matrix_src[i] = fastTextModel[label]
        except:
            count += 1
            failed.append(label)
            weight_matrix_src[i] = np.random.normal(size=(emb_dim, ))
    print('failed count {}, {}'.format(count, failed))
    weight_matrix_src_np = torch.from_numpy(weight_matrix_src).float()

    weight_matrix_train = {}
    weight_matrix_train['src_emb'] = weight_matrix_src_np
    weight_matrix_train['tgt_emb'] = weight_matrix_tgt_np

    weight_matrix_test = {}
    weight_matrix_test['src_emb'] = weight_matrix_src_np
    weight_matrix_test['tgt_emb'] = weight_matrix_test_np

    torch.save(weight_matrix_train, './data/data/weight_matrix_train')
    torch.save(weight_matrix_test, './data/data/weight_matrix_test')
def convert_fasttext(model_path: str, output_path: str):
    print("Loading ...")
    model = FastText.load_fasttext_format(model_path)
    print("Saving ...")
    model.wv.save(output_path)
    print("Sanity check ...", end=" ")
    saved_model = KeyedVectors.load(output_path, mmap="r")
    np.testing.assert_allclose(saved_model["okej"], model.wv["okej"])
    print("\u2713")  # tick mark
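# A minimal usage sketch for the converter above (hypothetical file names):
# convert the heavy fastText .bin dump once, then reload the lighter saved
# vectors with mmap on subsequent startups.
from gensim.models import KeyedVectors

convert_fasttext("cc.cs.300.bin", "cc.cs.300.kv")
wv = KeyedVectors.load("cc.cs.300.kv", mmap="r")
print(wv.most_similar("okej", topn=3))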
def load_embeddings(args):
    if args.fasttext:
        embeddings_dict = FastText.load_fasttext_format(args.fasttext)
    elif args.emb:
        embeddings_dict = np.load(args.emb).item()
    else:
        print("Error - No embeddings specified")
    return embeddings_dict, len(embeddings_dict['the'])
def load_fasttext():
    _fasttext = FastText.load_fasttext_format(path_fasttext)
    fasttext_dict = {}
    for word in tqdm(_fasttext.wv.vocab):
        fasttext_dict[word] = _fasttext[word]
    del _fasttext
    return fasttext_dict
def load_fasttext_embeddings(file: Union[Path, str]) -> FastTextKeyedVectors:
    """Load embeddings from file and unit normalize vectors."""
    if isinstance(file, str):
        file = Path(file)
    # Detect the model format by its extension:
    if '.bin' in file.suffixes or '.vec' in file.suffixes:
        # Native fastText format
        emb_model = FastText.load_fasttext_format(str(file))
    elif file.suffix == '.zip':
        # ZIP archive from the NLPL vector repository
        with zipfile.ZipFile(str(file), "r") as archive:
            model_file = archive.extract('parameters.bin')
            emb_model = FastText.load_fasttext_format(model_file)
    else:
        # Native Gensim format?
        emb_model = FastText.load(str(file))
    # Unit-normalize the vectors (if they aren't already)
    emb_model.init_sims(replace=True)
    return emb_model.wv
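# A minimal usage sketch for the loader above (hypothetical model path): the
# returned FastTextKeyedVectors can answer similarity queries, including for
# out-of-vocabulary forms thanks to subword n-grams.
wv = load_fasttext_embeddings("model.bin")
print(wv.most_similar("house", topn=3))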
def load_model(self): print("Loading model...") if self.representation=='fasttext': model_path = "../crawl-300d-2M-subword.bin" self.word_model = FastText.load_fasttext_format(model_path,encoding='utf-8') if self.representation=='GloVe': from gensim.models import KeyedVectors model_path = "../glove.27B.100d.word2vec.txt" self.word_model = KeyedVectors.load_word2vec_format(model_path) self.w2v = dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))
def get_wordvector(self):
    print("Downloading...")
    r = urlopen(self.zip_url)
    with BytesIO(r.read()) as b:
        print("Extracting...")
        with ZipFile(b) as z:
            with z.open(self.filename) as zf, open(self.path, "wb") as f:
                shutil.copyfileobj(zf, f)
    print("Loading...")
    model = ft.load_fasttext_format(self.path)
    os.remove(self.path)
    return model
def load(self, path): print("loading resources, this may take some minutes ... ") x = invertedIndex() self.idf_table = x.init(path) self.lexicon = x.lex self.n_docs = x.n_docs self.max_len = x.max_len y = Sentiment('../vader.txt') self.sent_table = y.sentiments self.senti_lex = y.sent_lex self.embeds = FastText.load_fasttext_format("../cc.en.300.bin") self.stopwords = set(stopwords.words('english'))
def fasttext_model_train(data, from_scratch):
    # Preprocessing like stopword removal @TODO
    ge_sentences = [list(tokenize(s)) for s in data['text'].to_list()]
    if from_scratch:
        model = FastText(bucket=1000000, window=3, min_count=1, size=300)
        model.build_vocab(sentences=ge_sentences)
        model.train(sentences=ge_sentences, total_examples=len(ge_sentences), epochs=10)
    else:
        model = FastText.load_fasttext_format('content/cc.en.300')
        model.build_vocab(ge_sentences, update=True)
        # continue training the pretrained model on the tokenized sentences
        model.train(sentences=ge_sentences, total_examples=len(ge_sentences), epochs=5)
    return model
def create_preloaded_fasttext_embeddings_and_vocabulary(
        fasttext_model_path="D:/PycharmProjects/TextCategorization/wiki.tr",
        vocab_file_output_path="fasttext_tr_vocab_cache.dat",
        embedding_cache_output_path="fasttext_tr_embedding_cache.npy"):
    model = FastText.load_fasttext_format(fasttext_model_path)
    vocabulary = model.wv.vocab
    embeddings = np.array([model.wv.word_vec(word) for word in vocabulary.keys()])
    with open(vocab_file_output_path, 'wb') as fw:
        pickle.dump(vocabulary, fw, protocol=pickle.HIGHEST_PROTOCOL)
    np.save(embedding_cache_output_path, embeddings)
    print("Preloading ends!")
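# A minimal sketch of reading the caches back (assumes the two files written above
# exist): row i of the cached array is the vector of the i-th key of the pickled
# vocabulary, since both were produced in the same iteration order.
import pickle
import numpy as np

with open("fasttext_tr_vocab_cache.dat", "rb") as fr:
    vocabulary = pickle.load(fr)
embeddings = np.load("fasttext_tr_embedding_cache.npy")
word2row = {word: i for i, word in enumerate(vocabulary)}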
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fasttext-model', metavar='<model>', required=True, type=str)
    parser.add_argument('--output-file', metavar='<output>', required=True,
                        type=argparse.FileType('wb'))
    parser.add_argument('--bert-tokens', action='store_true')
    parser.add_argument('files', metavar='<textfile>', nargs='+')
    args = parser.parse_args()

    if args.bert_tokens:
        print('Using bert tokens...')

    print('Loading model...')
    ft = FastText.load_fasttext_format(args.fasttext_model)

    print('Building vocab...', end=' ')
    tokenizer = Tokenizer(bert_tokenization=args.bert_tokens)
    vocab = []
    for fname in args.files:
        print(f'Reading {fname}...')
        with open(fname, 'r', encoding='utf-8') as f:
            for line in f:
                for word in line.split():
                    vocab.extend(tokenizer.tokenize(word))
    vocab = list(set(vocab))  # remove duplicates
    vocab = sorted(vocab)
    word2idx = {word: i for i, word in enumerate(vocab)}
    print(f'Vocab size is {len(vocab)}...')

    print('Precalculating embeddings...')
    vecs = []
    for word in vocab:
        try:
            vecs.append(ft.wv[word])
        except:
            print('Unknown word:', word)
            vecs.append(ft.wv['_'])

    print('Saving embeddings...')
    pickle.dump({
        'word2idx': word2idx,
        'idx2word': vocab,
        'idx2vec': vecs,
    }, args.output_file)
    print('Done.')
def __init__(self, modelVersion):
    super().__init__(ModelType.FASTTEXT, modelVersion)
    try:
        self.model = FastText.load(self.modelFullPath)
    except FileNotFoundError:
        warnings.warn("Haven't found a pretrained model in {}".format(self.modelFullPath))
        if "pretrained" == modelVersion:
            cap_path = os.path.join(modelStoragePath, "pretrained/cc.lv.300")
            print("Loading original pretrained vectors")
            self.model = FastText.load_fasttext_format(cap_path, full_model=False)
def get_fasttext_embedding_matrix(word_index, max_nb_words):
    model = fText.load_fasttext_format(FASTTEXT_FILE)
    nb_words = max_nb_words
    word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > max_nb_words:
            continue
        embedding_vector = model.wv[word]
        if embedding_vector is not None:
            word_embedding_matrix[i] = embedding_vector
    print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
    return word_embedding_matrix, nb_words
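# A minimal sketch of feeding the matrix above into a frozen Keras Embedding layer.
# This assumes TensorFlow/Keras is installed and that word_index and MAX_NB_WORDS
# come from a tokenizer set up elsewhere; neither appears in the original snippet.
from tensorflow.keras.layers import Embedding

word_embedding_matrix, nb_words = get_fasttext_embedding_matrix(word_index, MAX_NB_WORDS)
embedding_layer = Embedding(input_dim=nb_words + 1,
                            output_dim=EMBEDDING_DIM,
                            weights=[word_embedding_matrix],
                            trainable=False)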
def __init__(self, type="zeros", range=(-0.25, 0.25), fasttext_model_path="None"): assert type == "zeros" or type == "ones" or type == "random" or type == "uniform" or type == "fasttext_oov" self.type = type self.range = range self.fasttext_model_path = fasttext_model_path self.random_emb = None self.uniform_emb = None logger.info("> OOV Embedding mode: %s", self.type) if self.type == "fasttext_oov": assert self.fasttext_model_path is not None logger.info(">> Fasttext model will be loaded and embeddings for OOV words will be calculated by using it!") logger.info(">> Beware that the process may take a while due to this process!") self.model = FastText.load_fasttext_format(self.fasttext_model_path)
def feature_model(X_train, X_test):
    global ft_model
    fastText_pretrained = 'crawl-300d-2M-subword.bin'
    ft_model = FastText.load_fasttext_format(fastText_pretrained, encoding='latin1')
    X_train_transformed = np.asarray(X_train.map(sen_to_vec).values.tolist())
    X_test_transformed = np.asarray(X_test.map(sen_to_vec).values.tolist())
    X_train_transformed = X_train_transformed.astype(np.float64)
    X_test_transformed = X_test_transformed.astype(np.float64)
    return X_train_transformed, X_test_transformed
def load_fasttext(model_path):
    """
    Load a FastText model (loading takes a while).

    Args:
        model_path(str): /path/to/model.bin
            The fastText GitHub project provides pretrained models
            that can be imported into gensim directly.
    """
    logger.info("loading the FastText model...")
    from gensim.models import FastText
    model = FastText.load_fasttext_format(model_path)
    logger.info("loading the FastText model finished.")
    return model
def create_vectors():
    '''
    Create word embeddings without reducing the dimension.
    '''
    print('Load fastText bin file...')
    word_vectors = FastText.load_fasttext_format('./dataset/wordvector/wiki.en')
    vec_list = [word_vectors[i] for i in word_vectors.wv.vocab.keys()]
    print('Writing vectors to file...')
    vec_file = open('./dataset/wordvector/vec_300.txt', 'w')
    for word, vec in zip(word_vectors.wv.vocab.keys(), vec_list):
        vec_str = ' '.join(str(x) for x in vec)
        vec_file.write(vec_str + '\n')
    vec_file.close()
    print('Successfully saved vectors in ./dataset/wordvector directory')
def build_fasttext(filename, context_path, target_path, word_dict, verb_dict, dim):
    scale = np.sqrt(3.0 / dim)
    context_emb = np.random.uniform(-scale, scale, [len(word_dict), dim])
    target_emb = np.random.uniform(-scale, scale, [len(verb_dict), dim])
    fasttext_model = FastText.load_fasttext_format(filename, encoding='utf8')
    for word in word_dict:
        idx = word_dict[word]
        context_emb[idx] = fasttext_model.wv[word]
    for word in verb_dict:
        idx = verb_dict[word]
        target_emb[idx] = fasttext_model.wv[word]
    np.savez_compressed(context_path, embeddings=context_emb)
    np.savez_compressed(target_path, embeddings=target_emb)
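# A minimal sketch of reading the compressed arrays back (hypothetical file names,
# matching whatever context_path and target_path were passed above):
import numpy as np

context_emb = np.load("context_emb.npz")["embeddings"]
target_emb = np.load("target_emb.npz")["embeddings"]
print(context_emb.shape, target_emb.shape)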