Example #1
def embedding(emb_type):
    if emb_type == 'w2v':
        model = KeyedVectors.load_word2vec_format(path_to_w2v)
    elif emb_type == 'fasttext':
        model = FastText.load_fasttext_format(path_to_fasttext_emb)
    elif emb_type == 'fasttext_2':
        print('loading fasttext embedding...')
        model = FastText.load_fasttext_format(path_to_fasttext_emb_2)
        print('Done!')
    elif emb_type == 'fasttext_unlem':
        model = FastText.load_fasttext_format(path_to_fasttext_unlem)
    else:
        # Fail early instead of hitting an UnboundLocalError on `model` below.
        raise ValueError('unknown embedding type: {}'.format(emb_type))
    return model
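Note: load_fasttext_format was deprecated in later gensim 3.x releases and removed in gensim 4.0; the replacement loaders live in gensim.models.fasttext. Below is a minimal sketch of an equivalent loader against that newer API, reusing the path variables from the example above (it is an illustration, not the original author's code):

from gensim.models.fasttext import load_facebook_model, load_facebook_vectors

def embedding_v4(emb_type):
    # load_facebook_vectors returns only the lighter FastTextKeyedVectors;
    # load_facebook_model returns the full, further-trainable model.
    if emb_type == 'fasttext':
        return load_facebook_model(path_to_fasttext_emb)
    if emb_type == 'fasttext_vectors_only':
        return load_facebook_vectors(path_to_fasttext_emb)
    raise ValueError('unknown embedding type: {}'.format(emb_type))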
Example #2
    def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
                  neighbors: int = DEFAULT_NEIGHBORS_NUMBER,
                  edit_candidates: int = DEFAULT_EDIT_DISTANCE,
                  max_distance: int = DEFAULT_MAX_DISTANCE, radius: int = DEFAULT_RADIUS,
                  max_corrected_length: int = 12) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate vocabulary of correction candidates. \
                                The first whitespace-separated token of every line is added to the vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must be two \
                                 values separated with a whitespace: "token count".
        :param embeddings_file: Path to the dump of FastText model.
        :param neighbors: Number of neighbors of context and typo embeddings \
                          to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup for candidates.
        :param radius: Maximum edit distance from typo allowed for candidates.
        :param max_corrected_length: Maximum length of prefix in which symspell lookup \
                                     for typos is conducted.
        """
        self.checker = SymSpell(max_dictionary_edit_distance=max_distance,
                                prefix_length=max_corrected_length)
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.neighbors_number = neighbors
        self.edit_candidates_number = edit_candidates
        self.max_distance = max_distance
        self.radius = radius
        self.tokens = read_vocabulary(vocabulary_file)
        self.frequencies = read_frequencies(frequencies_file)
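A hypothetical call to construct() could look like the sketch below; the class name CorrectionCandidatesGenerator and all file paths are placeholders, not taken from the original code:

generator = CorrectionCandidatesGenerator()          # hypothetical owner class of construct()
generator.construct(vocabulary_file='vocabulary.csv',
                    frequencies_file='frequencies.csv',
                    embeddings_file='fasttext.bin',
                    neighbors=20,
                    edit_candidates=20,
                    max_distance=2,
                    radius=3)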
Example #3
    def load(self, path, model_type='word2vec'):
        """
        Load pre-trained word embedding model and save it into embed_words.__embedding.

        Args:
            path (str): relative path to the file containing the pre-trained model.
            model_type (str): type of the model; must be either 'word2vec' for '.vec' files
                or 'fasttext' for '.bin' files.
                (Default = 'word2vec')
        """

        # Code for loading Word2vec model:
        if model_type == 'word2vec':
            self.__model = KeyedVectors.load_word2vec_format(path)
            self.__embedding = self.__model.wv

        # Code for loading fastText model:
        elif model_type == 'fasttext':
            self.__model = FastText.load_fasttext_format(path)
            self.__embedding = self.__model.wv

        # In case we're trying to load an unsupported model type:
        else:
            raise ValueError(
                "Model '{}' not supported (must be 'word2vec' or 'fasttext'). "
                "Cannot load word embedding model.".format(model_type))
Example #4
def make_word_embedding(word_dict,
                        word_emb_pkl_path=params['default_word_emb_pkl_path'],
                        fasttext_path=params['default_fasttext_path']):
    word_emb = np.zeros([len(word_dict), params['word_emb_dim']])
    if os.path.isfile(word_emb_pkl_path):
        with open(word_emb_pkl_path, 'rb') as f:
            word_emb = pickle.load(f)
            print('Existing trained word embedding loaded')
    else:
        # load the fastText model
        fasttext_model = FastText.load_fasttext_format(fasttext_path,
                                                       encoding='utf8')
        print('No word_emb pkl file, start making word_emb ...')
        for word, idx in word_dict.items():
            if idx == 0:
                # PAD = 0
                continue
            else:
                try:
                    word_emb[idx] = np.asarray(fasttext_model.wv[word])
                except KeyError:
                    # if there is no word vector for a word, assign a random vector instead
                    word_emb[idx] = np.random.uniform(-0.25, 0.25,
                                                      params['word_emb_dim'])
        with open(word_emb_pkl_path, 'wb') as f:
            pickle.dump(word_emb, f)
        print('Making word_emb ... Done and Saved')
    return word_emb
Example #5
def create_data_and_labels_using_fasttext_embeddings(dataset_path, fasttext_model_path, embedding_size=300, language='turkish'):
    print("Loading raw sentences and one hot encoded labels")
    sentences, labels = load_data_and_labels(dataset_path)

    print("Finding maximum sentence length in the dataset")
    max_sentence_length = max([len(sentence.split(" ")) for sentence in sentences])
    print("Loading Fasttext model")
    model = FastText.load_fasttext_format(fasttext_model_path)
    sentence_embedding_list = list()
    print("Transform raw sentences to Fasttext embeddings")
    count = 0
    for sentence_idx, sentence in enumerate(sentences):
        tokens = sentence.split(" ")
        sentence_embedding = np.zeros(shape=(max_sentence_length, embedding_size))
        for idx, word in enumerate(tokens):
            try:
                sentence_embedding[idx] = model[word]
            except KeyError:
                print("Sentence:", sentence, "- Error word:", word)
        sentence_embedding_list.append(sentence_embedding)
        if sentence_idx % 100000 == 0:
            outputPath = "D:/PycharmProjects/TextCategorization/FasttextEmbeddingsWithNGrams/TWNERTC_TC_Coarse Grained NER_No_NoiseReduction_" + str(count)
            np.save(outputPath, np.asarray(sentence_embedding_list))
            sentence_embedding_list.clear()
            count += 1
Example #6
    def load_model(self):

        if self.backend == 'spacy':

            if self.trained_vectors == 'en':

                import en_core_web_sm
                self.model = en_core_web_sm.load()
            
            else:
                self.model = sp.load(self.trained_vectors)

        elif self.backend == 'gensim':

            if self.mode is None:
                self.mode = 'glove'

            if self.mode == 'glove':
                glove2word2vec(self.trained_vectors, self.temp)
                self.model = KeyedVectors.load_word2vec_format(self.temp,
                                                               binary=self.binary)
                os.remove(self.temp)

            elif self.mode == 'word2vec':
                self.model = KeyedVectors.load_word2vec_format(self.trained_vectors,
                                                               binary=self.binary,
                                                               encoding='latin-1')
            elif self.mode == 'fasttext':
                self.model = FastText.load_fasttext_format(self.trained_vectors,
                                                           encoding='latin-1')
Example #7
    def read_fasttext(self, file):
        """
        Create an Embeddings Matrix, in which each row corresponds to
        the word vector from the pretrained word embeddings.
        If a word is missing then obtain a representation on-the-fly
        using fasttext.

        Args:
            file: path to the pretrained fastText .bin file.

        Returns:
            the embeddings matrix and the list of missing tokens.

        """
        model = FastText.load_fasttext_format(file)

        embeddings = numpy.zeros((len(self), model.vector_size))

        missing = []

        for token_id, token in tqdm(self.id2tok.items(),
                                    desc="Reading embeddings...",
                                    total=len(self.id2tok.items())):
            if token not in model.wv.vocab:
                missing.append(token)
            embeddings[token_id] = model[token]

        print(f"Missing tokens from the pretrained embeddings: {len(missing)}")

        return embeddings, missing
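Because fastText builds vectors from character n-grams, model.wv can usually return an embedding even for tokens that are absent from model.wv.vocab; the missing list above only records which rows were assembled that way. A rough illustration of this behaviour (the probe word is arbitrary):

model = FastText.load_fasttext_format(file)   # same .bin file as in read_fasttext
probe = 'unseenword123'
print(probe in model.wv.vocab)                 # False: not in the stored vocabulary
print(model.wv[probe].shape)                   # still a vector, built from subword n-grams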
Example #8
def load_wordvectors(load_vec_file=False):
    """
    Loads and returns fastText word embedding.

    :param load_vec_file: Bool indicating whether to load the `.vec` text file (True)
                          or the `.bin` binary model (False).
    :return word_embeddings: fastText word embeddings: a dict mapping each word to its
                             vector when loading `.vec`, or a gensim FastText model for `.bin`.
    """
    if load_vec_file:
        """From https://fasttext.cc/docs/en/english-vectors.html"""
        fname = './Data/wiki-news-300d-1M.vec'
        with io.open(fname, 'r', encoding='utf-8', newline='\n',
                     errors='ignore') as fin:
            n, d = map(int, fin.readline().split())  # header line: vocab size and dimension
            word_embeddings = {}
            for line in fin:
                tokens = line.rstrip().split(' ')
                # Materialize the vector; a bare map() object can only be consumed once.
                word_embeddings[tokens[0]] = [float(x) for x in tokens[1:]]
    else:
        from gensim.models import FastText
        word_embeddings = FastText.load_fasttext_format('./Data/cc.en.300.bin')
    return word_embeddings
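For the `.vec` branch, gensim can also parse this text format directly and skip the manual header handling; a minimal sketch using the same file path as above:

from gensim.models import KeyedVectors

# load_word2vec_format understands the "<count> <dim>" header line of .vec files.
word_embeddings = KeyedVectors.load_word2vec_format('./Data/wiki-news-300d-1M.vec')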
Example #9
 def __getitem__(self, key):
     if key not in self:
         path = self.wv_path.format(key)
         print("Loading FastText for", key)
         value = ft.load_fasttext_format(path)
         self[key] = value
     return super().__getitem__(key)
Example #10
    def build_emb(self, vectors, vocab, embdim=300, model_path=None):
        if model_path:
            ft = FastText.load_fasttext_format(model_path)

        no_vectors = {}
        embedding_matrix = np.zeros((len(vocab.keys()) + 1, embdim))
        c = 0
        for i, (word, idx) in enumerate(vocab.items()):
            if model_path:
                try:
                    vect = ft.wv[word]
                except KeyError:
                    vect = None
            else:
                vect = vectors.get(word)

            if vect is not None and len(vect) > 0:
                embedding_matrix[i] = vect
                c += 1
            else:
                no_vectors[word] = idx
        print("{} words were found over a vocab of {} which is a ratio of {}"\
              .format(c, len(vocab.items()), round(c/len(vocab), 2) ))

        return embedding_matrix, no_vectors
Example #11
 def test_get_fasttext_model(self):
     data = pandas.read_csv(str(TEST_DATA_DIR / "prepared_data.csv.xz"),
                            index_col=0, keep_default_na=False)
     with tempfile.TemporaryDirectory(prefix="lookout_typos_fasttext_") as temp_dir:
         config = {"size": 100, "path": os.path.join(temp_dir, "ft.bin"), "dim": 5}
         train_fasttext(data, config)
         model = FastText.load_fasttext_format(config["path"])
         self.assertTupleEqual(model.wv["get"].shape, (5,))
Example #12
def loadfasttext(embfile):
    """
    Load fastText embeddings.

    :param embfile: path to the pretrained fastText .bin file
    :return: the loaded fastText model
    """
    model = fText.load_fasttext_format(embfile)
    return model
Example #13
def make_embedding(extra_word):

    with open('./data/data/save_data.test.dict.trans.cz',
              'r',
              encoding='utf-8') as f:
        lines_test = f.readlines()

    # w2vModel = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
    fastTextModel = FastText.load_fasttext_format('./wiki.cs.bin')

    vocab_sz = len(lines_test)
    emb_dim = 300
    weight_matrix_test = np.zeros((vocab_sz, 300))
    count = 0
    failed = []
    for i, line in enumerate(lines_test):
        label = line.strip()  # note: do not split()[0] here, the format differs from the dict file
        try:
            weight_matrix_test[i] = fastTextModel[label]
        except KeyError:
            count += 1
            failed.append(label)
            weight_matrix_test[i] = np.random.normal(size=(emb_dim, ))
    print('failed count {}, {}'.format(count, failed))
    weight_matrix_tgt_np = torch.from_numpy(
        weight_matrix_test[:-1 * extra_word]).float()
    weight_matrix_test_np = torch.from_numpy(weight_matrix_test).float()

    with open('./data/data/save_data.src.dict', 'r', encoding='utf-8') as f:
        lines = f.readlines()

    vocab_sz = len(lines)
    emb_dim = 300
    weight_matrix_src = np.zeros((vocab_sz, 300))
    count = 0
    failed = []
    for i, line in enumerate(lines):
        label = line.strip().split()[0]
        try:
            weight_matrix_src[i] = fastTextModel[label]
        except KeyError:
            count += 1
            failed.append(label)
            weight_matrix_src[i] = np.random.normal(size=(emb_dim, ))

    print('failed count {}, {}'.format(count, failed))

    weight_matrix_src_np = torch.from_numpy(weight_matrix_src).float()
    weight_matrix_train = {}
    weight_matrix_train['src_emb'] = weight_matrix_src_np
    weight_matrix_train['tgt_emb'] = weight_matrix_tgt_np

    weight_matrix_test = {}
    weight_matrix_test['src_emb'] = weight_matrix_src_np
    weight_matrix_test['tgt_emb'] = weight_matrix_test_np

    torch.save(weight_matrix_train, './data/data/weight_matrix_train')
    torch.save(weight_matrix_test, './data/data/weight_matrix_test')
Example #14
def convert_fasttext(model_path: str, output_path: str):
    print("Loading ...")
    model = FastText.load_fasttext_format(model_path)
    print("Saving ...")
    model.wv.save(output_path)
    print("Sanity check ...", end=" ")
    saved_model = KeyedVectors.load(output_path, mmap="r")
    np.testing.assert_allclose(saved_model["okej"], model.wv["okej"])
    print("\u2713")  # tick mark
Example #15
def load_embeddings(args):
    if args.fasttext:
        embeddings_dict = FastText.load_fasttext_format(args.fasttext)
    elif args.emb:
        embeddings_dict = np.load(args.emb).item()
    else:
        # Raising keeps the function from returning an unbound embeddings_dict.
        raise ValueError("No embeddings specified")

    return embeddings_dict, len(embeddings_dict['the'])
Example #16
def load_fasttext():
    _fasttext = FastText.load_fasttext_format(path_fasttext)
    fasttext_dict = {}
    for word in tqdm(_fasttext.wv.vocab):
        fasttext_dict[word] = _fasttext.wv[word]

    del _fasttext

    return fasttext_dict
Example #17
def load_fasttext_embeddings(file: Union[Path, str]) -> FastTextKeyedVectors:
    """Load embeddings from file and unit normalize vectors."""
    if isinstance(file, str):
        file = Path(file)
    # Detect the model format by its extension:
    if '.bin' in file.suffixes or '.vec' in file.suffixes:
        # Native fastText format distributed by Facebook
        emb_model = FastText.load_fasttext_format(str(file))
    elif file.suffix == '.zip':
        # ZIP archive from the NLPL vector repository
        with zipfile.ZipFile(str(file), "r") as archive:
            model_file = archive.extract('parameters.bin')
            emb_model = FastText.load_fasttext_format(model_file)
    else:
        # Native Gensim format?
        emb_model = FastText.load(str(file))

    # Unit-normalizing the vectors (if they aren't already)
    emb_model.init_sims(replace=True)
    return emb_model.wv
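On gensim 4 both load_fasttext_format and init_sims are gone; a rough equivalent of this loader's normalization step, assuming load_facebook_vectors from gensim.models.fasttext is available, might look like:

from gensim.models.fasttext import load_facebook_vectors

wv = load_facebook_vectors(str(file))   # FastTextKeyedVectors only, no training state
wv.fill_norms()                         # precompute vector norms once
normed = wv.get_normed_vectors()        # unit-length copies (the stored vectors stay unchanged)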
Example #18
	def load_model(self):
		print("Loading model...")
		if self.representation=='fasttext':
			model_path = "../crawl-300d-2M-subword.bin"
			self.word_model = FastText.load_fasttext_format(model_path,encoding='utf-8')

		if self.representation=='GloVe':
			from gensim.models import KeyedVectors
			model_path = "../glove.27B.100d.word2vec.txt"
			self.word_model = KeyedVectors.load_word2vec_format(model_path)
		self.w2v = dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))
Example #19
 def get_wordvector(self):
     print("Downloading...")
     r = urlopen(self.zip_url)
     with BytesIO(r.read()) as b:
         print("Extracting...")
         with ZipFile(b) as z:
             with z.open(self.filename) as zf, open(self.path, "wb") as f:
                 shutil.copyfileobj(zf, f)
     print("Loading...")
     model = ft.load_fasttext_format(self.path)
     os.remove(self.path)
     return model
Example #20
 def load(self, path):
     print("loading resources, this may take some minutes ... ")
     x = invertedIndex()
     self.idf_table = x.init(path)
     self.lexicon = x.lex
     self.n_docs = x.n_docs
     self.max_len = x.max_len
     y = Sentiment('../vader.txt')
     self.sent_table = y.sentiments
     self.senti_lex = y.sent_lex
     self.embeds = FastText.load_fasttext_format("../cc.en.300.bin")
     self.stopwords = set(stopwords.words('english'))
Example #21
def fasttext_model_train(data, from_scratch):
  # Preprocessing like stopword removal @TODO
  ge_sentences = [ list(tokenize(s)) for s in data['text'].to_list()]
  if from_scratch:
    model = FastText(bucket= 1000000, window=3, min_count=1, size=300)
    model.build_vocab(sentences=ge_sentences)
    model.train(sentences=ge_sentences, total_examples=len(ge_sentences), epochs=10)
  else:
    model = FastText.load_fasttext_format('content/cc.en.300')
    model.build_vocab(ge_sentences, update=True)
    model.train(sentences=ge_sentences, total_examples=len(ge_sentences), epochs=5)
  return model
Example #22
def create_preloaded_fasttext_embeddings_and_vocabulary(fasttext_model_path = "D:/PycharmProjects/TextCategorization/wiki.tr",
                                                        vocab_file_output_path = "fasttext_tr_vocab_cache.dat",
                                                        embedding_cache_output_path = "fasttext_tr_embedding_cache.npy"):
    model = FastText.load_fasttext_format(fasttext_model_path)
    vocabulary = model.wv.vocab
    embeddings = np.array([model.wv.word_vec(word) for word in vocabulary.keys()])

    with open(vocab_file_output_path, 'wb') as fw:
        pickle.dump(vocabulary, fw, protocol=pickle.HIGHEST_PROTOCOL)

    np.save(embedding_cache_output_path, embeddings)
    print("Preloading ends!")
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fasttext-model',
                        metavar='<model>',
                        required=True,
                        type=str)
    parser.add_argument('--output-file',
                        metavar='<output>',
                        required=True,
                        type=argparse.FileType('wb'))
    parser.add_argument('--bert-tokens', action='store_true')
    parser.add_argument('files', metavar='<textfile>', nargs='+')

    args = parser.parse_args()

    if args.bert_tokens:
        print('Using bert tokens...')

    print('Loading model...')
    ft = FastText.load_fasttext_format(args.fasttext_model)

    print('Building vocab...', end=' ')
    tokenizer = Tokenizer(bert_tokenization=args.bert_tokens)
    vocab = []
    for fname in args.files:
        print(f'Reading {fname}...')
        with open(fname, 'r', encoding='utf-8') as f:
            for line in f:
                for word in line.split():
                    vocab.extend(tokenizer.tokenize(word))
    vocab = list(set(vocab))  # remove duplicates
    vocab = sorted(vocab)
    word2idx = {word: i for i, word in enumerate(vocab)}
    print(f'Vocab size is {len(vocab)}...')

    print('Precalculating embeddings...')
    vecs = []
    for word in vocab:
        try:
            vecs.append(ft.wv[word])
        except KeyError:
            print('Unknown word:', word)
            vecs.append(ft.wv['_'])

    print('Saving embeddings...')
    pickle.dump({
        'word2idx': word2idx,
        'idx2word': vocab,
        'idx2vec': vecs,
    }, args.output_file)

    print('Done.')
Example #24
 def __init__(self, modelVersion):
     super().__init__(ModelType.FASTTEXT, modelVersion)
     try:
         self.model = FastText.load(self.modelFullPath)
     except FileNotFoundError:
         warnings.warn("Havent found pretreined model in {}".format(
             self.modelFullPath))
         if "pretrained" == modelVersion:
             cap_path = os.path.join(modelStoragePath,
                                     "pretrained/cc.lv.300")
             print("Loading original pretrained vectors")
             self.model = FastText.load_fasttext_format(cap_path,
                                                        full_model=False)
Example #25
def get_fasttext_embedding_matrix(word_index, max_nb_words):
    model = fText.load_fasttext_format(FASTTEXT_FILE)
    nb_words = max_nb_words
    word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > max_nb_words:
            continue
        try:
            word_embedding_matrix[i] = model.wv[word]
        except KeyError:
            # no vector (and no usable n-grams) for this word; keep the zero row
            pass

    print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
    return word_embedding_matrix, nb_words
Example #26
 def __init__(self, type="zeros", range=(-0.25, 0.25), fasttext_model_path=None):
     assert type in ("zeros", "ones", "random", "uniform", "fasttext_oov")
     self.type = type
     self.range = range
     self.fasttext_model_path = fasttext_model_path
     self.random_emb = None
     self.uniform_emb = None
     logger.info("> OOV Embedding mode: %s", self.type)
     if self.type == "fasttext_oov":
         assert self.fasttext_model_path is not None
         logger.info(">> Fasttext model will be loaded and embeddings for OOV words will be calculated by using it!")
         logger.info(">> Beware that the process may take a while due to this process!")
         self.model = FastText.load_fasttext_format(self.fasttext_model_path)
Example #27
def get_fasttext_embedding_matrix(word_index, max_nb_words):
    model = fText.load_fasttext_format(FASTTEXT_FILE)
    nb_words = max_nb_words
    word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > max_nb_words:
            continue
        try:
            word_embedding_matrix[i] = model.wv[word]
        except KeyError:
            # no vector (and no usable n-grams) for this word; keep the zero row
            pass

    print('Null word embeddings: %d' %
          np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
    return word_embedding_matrix, nb_words
Example #28
def feature_model(X_train, X_test):

    global ft_model

    fastText_pretrained = 'crawl-300d-2M-subword.bin'
    ft_model = FastText.load_fasttext_format(fastText_pretrained,
                                             encoding='latin1')

    X_train_transformed = np.asarray(X_train.map(sen_to_vec).values.tolist())
    X_test_transformed = np.asarray(X_test.map(sen_to_vec).values.tolist())

    X_train_transformed = X_train_transformed.astype(np.float64)
    X_test_transformed = X_test_transformed.astype(np.float64)

    return X_train_transformed, X_test_transformed
Example #29
def load_fasttext(model_path):
    """
    加载 FastText 模型(加载时间较长)

    Args:
        model_path(str): /path/to/model.bin
            fasttext GitHub 提供了预训练的模型,可以直接导入 gensim
    """
    logger.info("loading the FastText model...")

    from gensim.models import FastText
    model = FastText.load_fasttext_format(model_path)

    logger.info("loading the FastText model finished.")
    return model
Example #30
def create_vectors():
    '''
    create word embedding without reducing the dimension
    '''
    print('Loading fastText bin file...')
    word_vectors = FastText.load_fasttext_format('./dataset/wordvector/wiki.en')
    vec_list = [word_vectors.wv[word] for word in word_vectors.wv.vocab.keys()]
    
    print('Writing vectors to file...')
    with open('./dataset/wordvector/vec_300.txt', 'w') as vec_file:
        for word, vec in zip(word_vectors.wv.vocab.keys(), vec_list):
            vec_str = ' '.join(str(x) for x in vec)
            vec_file.write(vec_str + '\n')
    print('Successfully saved vectors in ./dataset/wordvector directory')
Example #31
def build_fasttext(filename, context_path, target_path, word_dict, verb_dict, dim):
    scale = np.sqrt(3.0 / dim)
    context_emb = np.random.uniform(-scale, scale, [len(word_dict), dim])
    target_emb = np.random.uniform(-scale, scale, [len(verb_dict), dim])
    fasttext_model = FastText.load_fasttext_format(filename, encoding='utf8')

    for word in word_dict:
        idx = word_dict[word]
        try:
            context_emb[idx] = fasttext_model.wv[word]
        except KeyError:
            pass  # keep the random initialization when no vector or n-grams exist

    for word in verb_dict:
        idx = verb_dict[word]
        try:
            target_emb[idx] = fasttext_model.wv[word]
        except KeyError:
            pass  # keep the random initialization when no vector or n-grams exist

    np.savez_compressed(context_path, embeddings=context_emb)
    np.savez_compressed(target_path, embeddings=target_emb)