Example #1
def update_fasttext():

    model = FastText.load_fasttext_format('/data/dataset/wiki.zh')

    # Note: because this uses the official fastText model, model.corpus_count is 0,
    # so a corpus_count has to be specified manually.
    model.train([['今天', '天气', '很好', '。']], total_examples=model.corpus_count)
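A minimal continuation sketch (not part of the original snippet, assuming a gensim 3.x API): since the native .bin dump carries no corpus statistics, the counts can be passed explicitly instead of relying on model.corpus_count.

# Hypothetical sketch: supply the counts explicitly because corpus_count is 0 here.
sentences = [['今天', '天气', '很好', '。']]
model.train(sentences, total_examples=len(sentences), epochs=5)  # epochs is an arbitrary choice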
Example #2
 def test_load_model_with_non_ascii_vocab(self):
     model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
     self.assertTrue(u'který' in model)
     try:
         model[u'který']
     except UnicodeDecodeError:
         self.fail('Unable to access vector for utf8 encoded non-ascii word')
Example #3
    def _load_gensim_format_embeddings(self):
        if not os.path.exists(self.word_embedding_file):
            raise Exception("{} is not found!".format(
                self.word_embedding_file))

        if self.word_embedding_mode.lower() == "fasttext":

            if self.word_embedding_file.endswith(".model"):
                model = FastText.load(self.word_embedding_file)
            else:
                model = FastText.load_fasttext_format(self.word_embedding_file)

        elif self.word_embedding_file.endswith(".bin"):
            model = KeyedVectors.load_word2vec_format(self.word_embedding_file,
                                                      binary=True)
        else:
            model = Word2Vec.load(self.word_embedding_file)

        embedding_size = model["and"].size
        unknown_vec = np.random.uniform(-0.25, 0.25, embedding_size)

        embeddings = [unknown_vec] * (self.n_words())
        embeddings[0] = np.zeros(embedding_size)
        for word in self.word2index:
            try:
                embeddings[self.word2index[word]] = model[word]
            except:
                # self.word2index[word] = self.word2index[self.UNK_TOKEN]
                pass

        self.word_embedding_size = len(embeddings[0])
        embeddings = np.array(embeddings, dtype=np.float32)

        return embeddings
Example #4
def create_and_train_nn_prediction_from_file(
        fasttext: str,
        data: str,
        dump: str = None,
        num_neurons: int = DEFAULT_NUM_NEURONS,
        batch_size: int = DEFAULT_BATCH_SIZE,
        lr: float = DEFAULT_LR,
        decay: float = DEFAULT_DECAY,
        num_epochs: int = DEFAULT_NUM_EPOCHS) -> keras.models.Sequential:
    """
    Train NN model for correction embedding prediction from files.

    :param fasttext: Path to the binary dump of a FastText model.
    :param data: Path to a CSV dump of pandas.DataFrame containing columns \
                 [Columns.CorrectToken, Columns.Token].
    :param dump: Path to the file where to dump the trained NN model.
    :param num_neurons: Number of neurons in each hidden layer.
    :param batch_size: Batch size for training.
    :param lr: Learning rate.
    :param decay: Learning rate exponential decay per epoch.
    :param num_epochs: Number of training passes over the dataset.
    :return: Trained Keras model.
    """
    fasttext_model = FastText.load_fasttext_format(fasttext)
    df = pandas.read_csv(data, index_col=0).infer_objects()
    return create_and_train_nn_prediction(fasttext_model, df, dump,
                                          num_neurons, batch_size, lr, decay,
                                          num_epochs)
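A hypothetical call for illustration; every path and hyperparameter below is a placeholder rather than a value taken from the original project.

nn_model = create_and_train_nn_prediction_from_file(
    fasttext="wiki.en.bin",          # placeholder path to a fastText binary dump
    data="token_corrections.csv",    # placeholder CSV with Columns.Token / Columns.CorrectToken
    dump="correction_nn.h5",         # placeholder path for the trained Keras model
    num_neurons=256, batch_size=64, lr=1e-3, decay=0.9, num_epochs=10)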
Example #5
 def test_load_model_non_utf8_encoding(self):
     model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
     self.assertTrue(u'který' in model)
     try:
         model[u'který']
     except KeyError:
         self.fail('Unable to access vector for cp-852 word')
Example #6
 def test_load_model_with_non_ascii_vocab(self):
     model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
     self.assertTrue(u'který' in model)
     try:
         model[u'který']
     except UnicodeDecodeError:
         self.fail('Unable to access vector for utf8 encoded non-ascii word')
Example #7
def load_fasttext(path):

    model = FastText.load_fasttext_format(path)

    model.init_sims(replace=True)

    return model
Example #8
def _embedding_load_pre_fasttext(pretrained_path):
	try:
		model_embedding = FastText.load_fasttext_format(pretrained_path)
		print("[+] FastText Embedding model successfully loaded from {}".format(pretrained_path))
		return model_embedding
	except:
		raise FileNotFoundError("[!] FastText Embedding model couldn't be loaded from {}".format(pretrained_path))
Example #9
 def test_load_model_non_utf8_encoding(self):
     model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
     self.assertTrue(u'který' in model)
     try:
         model[u'který']
     except KeyError:
         self.fail('Unable to access vector for cp-852 word')
Example #10
 def __init__(self, project, embeddingSize):
     self.ft = FT.load_fasttext_format('./EmbeddingModel/fastText.bin')
     self.embeddingSize = embeddingSize
     try:
         self.embedder = torch.load('./EmbeddingModel/{}-EmbeddingLayer.pt'.format(project))
     except FileNotFoundError:
         self.embedder = nn.Embedding(num_embeddings=len(super().wordSet), embedding_dim=embeddingSize)
         torch.save(self.embedder, './EmbeddingModel/{}-EmbeddingLayer.pt'.format(project))
Example #11
def load_embeddings(embedding_file):
    '''
    Load embeddings from file.
    input: path to an embeddings file (.txt/.vec text format or a gensim/fastText binary)
    output: embeddings in a dict-like structure available for look-up, plus the vocabulary covered by the embeddings
    '''

    print('Using embeddings: ', embedding_file)
    if embedding_file.endswith('.txt') or embedding_file.endswith('.vec'):
        w2v = {}
        vocab = []
        try:
            f = open(embedding_file, 'r')
            for line in f:
                values = line.split()
                word = values[0]
                try:
                    float(values[1])
                except ValueError:
                    continue
                coefs = np.asarray(values[1:], dtype='float')
                w2v[word] = coefs
                vocab.append(word)
        except UnicodeDecodeError:
            f = open(embedding_file, 'rb')
            for line in f:
                values = line.split()
                word = values[0]
                try:
                    float(values[1])
                except ValueError:
                    continue
                coefs = np.asarray(values[1:], dtype='float')
                w2v[word] = coefs
                vocab.append(word)
        f.close()

    else:

        try:
            w2v = FT_gensim.load(embedding_file)
            vocab = w2v.wv.vocab.keys()
            print('using FastText gensim...')
        except:
            try:
                w2v = FT_gensim.load_fasttext_format(embedding_file)
                vocab = w2v.wv.vocab.keys()
                print('using gensim Facebook FastText...')
            except:
                w2v, vocab = load_vectors(embedding_file)
                print('using Facebook fastText')

    try:
        print("Done.", len(w2v), " words loaded!")
    except:
        pass

    return w2v, vocab
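A small usage sketch (the file name is hypothetical): the function returns a dict-like structure plus the vocabulary it covers, so look-ups can be guarded with a membership test.

w2v, vocab = load_embeddings('wiki.en.vec')  # placeholder path
if 'king' in vocab:
    king_vector = w2v['king']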
Example #12
    def __init__(self, train_X, train_Y, test_X, test_Y, model):
        self.model_path = model
        # embedding_index = {}
        # for i, line in enumerate(open('glove.6B/glove.6B.100d.txt')):
        #     values = line.split()
        #     embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
        embedding_index = FastText.load_fasttext_format('cc.id.300.bin')

        # Create tokenizer object
        tokenizer = text.Tokenizer()
        tokenizer.fit_on_texts(train_X)
        word_index = tokenizer.word_index

        # Convert text to padded sequences of tokens; load a previous model if available and skip training
        self.test_seq_X = sequence.pad_sequences(tokenizer.texts_to_sequences(test_X), maxlen=70)
        if os.path.isfile(self.model_path):
            self.classifier = load_model(self.model_path)
            return

        # Save if no previous model loaded
        self.train_seq_X = sequence.pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=70)
        self.train_Y = train_Y
        self.test_Y = test_Y


        # Create word embeddings mapping
        embedding_matrix = np.zeros((len(word_index) + 1, 300))
        for word, i in word_index.items():
            embedding_vector = embedding_index.wv[word] if word in embedding_index.wv else None
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        # Creating layer
        # Add input layer
        input_layer = layers.Input((70, ))

        # Add the word embedding layer
        embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
        embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

        # Add LSTM layer
        lstm_layer = layers.LSTM(self.hidden_state)(embedding_layer)

        # Output layers
        output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
        output_layer1 = layers.Dropout(0.25)(output_layer1)
        output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

        # Compile model
        model = models.Model(inputs=input_layer, outputs=output_layer2)
        model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')

        self.classifier = model
              
        logging.info("LSTM model created")
Example #13
def load_word_embeddings_bin(filename, algorithm='fasttext'):
    print('loading model...')
    global wv_model
    if (algorithm == 'fasttext'):
        wv_model = FastText.load_fasttext_format(filename)
    elif (algorithm == 'word2vec'):
        wv_model = KeyedVectors.load_word2vec_format(filename, encoding='utf8', binary=True)
    print('Done!')
    return wv_model
Example #14
    def __init__(self, f_emb: str, f_db):
        super().__init__()

        log.info('loading embeddings')
        self._emb = FastText.load_fasttext_format(f_emb)

        log.info('loading ungol index')
        with open(f_db, 'rb') as fd:
            self.db = pickle.load(fd)
Example #15
def load_fasttext_model(path):
    try:
        model = FastText.load(path).wv
    except Exception as e:
        try:
            model = FastText.load_fasttext_format(path).wv
        except Exception as e:
            model = gensim.models.KeyedVectors.load(path)

    return model
Example #16
def embedding_yukle_fasttext(dizin):
    try:
        model_embedding = FastText.load_fasttext_format(dizin)
        print("[+] Embedding modeli (pretrained) {} dosyasindan yuklendi.".
              format(dizin))
        return model_embedding
    except:
        raise FileNotFoundError(
            "[!] Pretrained embedding model could not be loaded from {}.".
            format(dizin))
Example #17
def import_pretrained_fasttext(in_file, out_file, mmap='r'):
    try:
        return GensimKeyedVectors.load(out_file, mmap=mmap)
    except IOError:
        pass
    ft = FastText.load_fasttext_format(in_file)
    ft.init_sims(replace=True)
    model = GensimKeyedVectors(ft.wv)
    model.save(out_file)
    return GensimKeyedVectors.load(out_file, mmap=mmap)
Example #18
def load_embedding(embedding_path,
                   embedding_dim,
                   format,
                   file_type,
                   with_head=False,
                   word_set=None):
    """
    Args:
        format: 'glove', 'word2vec', 'fasttext'
        file_type: 'text' or 'binary'
    """
    embedding_dict = dict()

    if format == 'word2vec' or format == 'fasttext':
        if file_type == 'text':
            vector_total = KeyedVectors.load_word2vec_format(
                embedding_path, binary=False, unicode_errors='ignore')
        else:
            if format == 'word2vec':
                vector_total = KeyedVectors.load_word2vec_format(
                    embedding_path, binary=True, unicode_errors='ignore')
            elif format == 'fasttext':
                vector_total = FastText.load_fasttext_format(embedding_path,
                                                             encoding='utf8')

        assert vector_total.vector_size == embedding_dim
        if word_set is None:
            embedding_dict = vector_total
        else:
            if not (format == 'fasttext' and file_type == 'binary'):
                word_total = vector_total.index2word  # actually, vector_total.index2word is the word list
            else:
                word_total = vector_total.wv.index2word
            for word in word_total:
                if word in word_set:
                    embedding_dict[word] = vector_total[word]
    elif format == 'glove':
        with codecs.open(embedding_path, 'r', encoding='utf-8') as fin:
            if with_head == True:
                _ = fin.readline()
            for idx, line in enumerate(fin):
                line = line.rstrip()
                if idx == 0 and len(line.split()) == 2:
                    continue
                if len(line) > 0:
                    word, vec = line.split(" ", 1)
                    if (word_set and word in word_set) or (word_set is None):
                        vector = [float(num) for num in vec.split(" ")]
                        assert len(vector) == embedding_dim
                        embedding_dict[word] = vector
    else:
        raise Exception(
            'Supported formats are glove, word2vec and fasttext; %s is not supported.'
            % format)
    return embedding_dict
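A hypothetical call for the binary fastText branch of load_embedding; the path, dimensionality and word set below are placeholders.

emb = load_embedding('cc.en.300.bin', 300, format='fasttext', file_type='binary',
                     word_set={'apple', 'orange'})  # placeholder inputs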
Example #19
def create_fasttext_embeddings(words, embeddings_path):
    print("Loading fasttext model...")
    ff_model = FastText.load_fasttext_format(embeddings_path)
    fasttext_word_dict = dict()
    for w in words:
        try:
            fasttext_word_dict[w] = ff_model[w]
        except:
            pass
    print "total words not found in Fasttext:", len(words) - len(fasttext_word_dict)
    return fasttext_word_dict
Example #20
def make_pretrained_embeddings(word_index):
    model = FastText.load_fasttext_format('/models/wiki.ru')

    # create embedding_matrix: "index of word - vector"
    embedding_matrix = np.zeros((len(word_index), EMBEDDING_DIM))
    for lem, i in word_index.items():
        if lem in model:
            embedding_matrix[i] = model[lem]

    # write embedding_matrix to file in the output directory of Floyd
    np.save('/output/embedding_matrix_lemma.npy', embedding_matrix)

    return embedding_matrix
Example #21
    def test_load_fasttext_new_format(self):
        try:
            new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
        except Exception as exc:
            self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
        vocab_size, model_size = 1763, 10
        self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size))
        self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size)
        self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size))

        expected_vec = [
            -0.025627,
            -0.11448,
            0.18116,
            -0.96779,
            0.2532,
            -0.93224,
            0.3929,
            0.12679,
            -0.19685,
            -0.13179
        ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
        self.assertTrue(np.allclose(new_model["hundred"], expected_vec, atol=1e-4))

        # vector for oov words are slightly different from original FastText due to discarding unused ngrams
        # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
        expected_vec_oov = [
            -0.53378,
            -0.19,
            0.013482,
            -0.86767,
            -0.21684,
            -0.89928,
            0.45124,
            0.18025,
            -0.14128,
            0.22508
        ]
        self.assertTrue(np.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4))

        self.assertEqual(new_model.min_count, 5)
        self.assertEqual(new_model.window, 5)
        self.assertEqual(new_model.iter, 5)
        self.assertEqual(new_model.negative, 5)
        self.assertEqual(new_model.sample, 0.0001)
        self.assertEqual(new_model.bucket, 1000)
        self.assertEqual(new_model.wv.max_n, 6)
        self.assertEqual(new_model.wv.min_n, 3)
        self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size))
        self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, new_model.vector_size))
Example #22
    def test_load_fasttext_format(self):
        try:
            model = FT_gensim.load_fasttext_format(self.test_model_file)
        except Exception as exc:
            self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
        vocab_size, model_size = 1762, 10
        self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size))
        self.assertEqual(len(model.wv.vocab), vocab_size, model_size)
        self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size))

        expected_vec = [
            -0.57144,
            -0.0085561,
            0.15748,
            -0.67855,
            -0.25459,
            -0.58077,
            -0.09913,
            1.1447,
            0.23418,
            0.060007
        ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
        self.assertTrue(np.allclose(model["hundred"], expected_vec, atol=1e-4))

        # vector for oov words are slightly different from original FastText due to discarding unused ngrams
        # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
        expected_vec_oov = [
            -0.23825,
            -0.58482,
            -0.22276,
            -0.41215,
            0.91015,
            -1.6786,
            -0.26724,
            0.58818,
            0.57828,
            0.75801
        ]
        self.assertTrue(np.allclose(model["rejection"], expected_vec_oov, atol=1e-4))

        self.assertEqual(model.min_count, 5)
        self.assertEqual(model.window, 5)
        self.assertEqual(model.iter, 5)
        self.assertEqual(model.negative, 5)
        self.assertEqual(model.sample, 0.0001)
        self.assertEqual(model.bucket, 1000)
        self.assertEqual(model.wv.max_n, 6)
        self.assertEqual(model.wv.min_n, 3)
        self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size))
        self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size))
Example #23
    def test_load_fasttext_new_format(self):
        try:
            new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
        except Exception as exc:
            self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
        vocab_size, model_size = 1763, 10
        self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size))
        self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size)
        self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size))

        expected_vec = [
            -0.025627,
            -0.11448,
            0.18116,
            -0.96779,
            0.2532,
            -0.93224,
            0.3929,
            0.12679,
            -0.19685,
            -0.13179
        ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
        self.assertTrue(np.allclose(new_model["hundred"], expected_vec, atol=1e-4))

        # vector for oov words are slightly different from original FastText due to discarding unused ngrams
        # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
        expected_vec_oov = [
            -0.49111,
            -0.13122,
            -0.02109,
            -0.88769,
            -0.20105,
            -0.91732,
            0.47243,
            0.19708,
            -0.17856,
            0.19815
        ]
        self.assertTrue(np.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4))

        self.assertEqual(new_model.min_count, 5)
        self.assertEqual(new_model.window, 5)
        self.assertEqual(new_model.iter, 5)
        self.assertEqual(new_model.negative, 5)
        self.assertEqual(new_model.sample, 0.0001)
        self.assertEqual(new_model.bucket, 1000)
        self.assertEqual(new_model.wv.max_n, 6)
        self.assertEqual(new_model.wv.min_n, 3)
        self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size))
        self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, new_model.vector_size))
Example #24
    def test_load_fasttext_format(self):
        try:
            model = FT_gensim.load_fasttext_format(self.test_model_file)
        except Exception as exc:
            self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
        vocab_size, model_size = 1762, 10
        self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size))
        self.assertEqual(len(model.wv.vocab), vocab_size, model_size)
        self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size))

        expected_vec = [
            -0.57144,
            -0.0085561,
            0.15748,
            -0.67855,
            -0.25459,
            -0.58077,
            -0.09913,
            1.1447,
            0.23418,
            0.060007
        ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
        self.assertTrue(np.allclose(model["hundred"], expected_vec, atol=1e-4))

        # vector for oov words are slightly different from original FastText due to discarding unused ngrams
        # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
        expected_vec_oov = [
            -0.21929,
            -0.53778,
            -0.22463,
            -0.41735,
            0.71737,
            -1.59758,
            -0.24833,
            0.62028,
            0.53203,
            0.77568
        ]
        self.assertTrue(np.allclose(model["rejection"], expected_vec_oov, atol=1e-4))

        self.assertEqual(model.min_count, 5)
        self.assertEqual(model.window, 5)
        self.assertEqual(model.iter, 5)
        self.assertEqual(model.negative, 5)
        self.assertEqual(model.sample, 0.0001)
        self.assertEqual(model.bucket, 1000)
        self.assertEqual(model.wv.max_n, 6)
        self.assertEqual(model.wv.min_n, 3)
        self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size))
        self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size))
Example #25
def get_pretrained_embeddings(filename, vocab):
    def get_vectors(filename):
        objects = dict()
        with (open(filename, "rb")) as openfile:
            while True:
                try:
                    objects = pickle.load(openfile)
                except EOFError:
                    break
        return objects

    # vectors = embeddings.load_from_dir(filename)
    model = FastText.load_fasttext_format(filename)
    size = model.vector_size
    embs_matrix = np.random.rand(len(vocab), size)

    for i, token in enumerate(vocab.token2id):
        if token in model:
            embs_matrix[i] = model[token]

    return torch.FloatTensor(embs_matrix)
Example #26
def train():
    print('Loading fasttext...')
    fasttext = FastText.load_fasttext_format(path_fasttext)
    fasttext_dict = {}
    for word in tqdm(fasttext.wv.vocab):
        fasttext_dict[word] = fasttext[word]

    del fasttext

    print('Counting input...')
    count_lines = 0
    with open(path_news_shuffled, 'r') as in_news:
        for _ in tqdm(in_news):
            count_lines += 1

    train_size = int(count_lines * .8)
    test_size = int(count_lines * .1)
    val_size = count_lines - (int(count_lines * 0.8) + int(count_lines * 0.1))

    print('Train size:', train_size, '; test size:', test_size, '; val size:',
          val_size)

    print('Training...')
    with tf.device('/gpu:0'):
        cnn_model = bilstm_model(units=[128, 64, 32], hidden_dims=18)
        checkpoint = ModelCheckpoint(
            path_data + 'bilstm_all_1_weights.{epoch:03d}-{val_acc:.4f}.hdf5',
            monitor='val_acc',
            verbose=1,
            mode='auto')
        cnn_model.fit_generator(embedded_news_generator_all(
            path_news_train_all, batch_size, fasttext_dict, max_words),
                                steps_per_epoch=train_size // batch_size,
                                epochs=epochs,
                                verbose=1,
                                validation_data=embedded_news_generator_all(
                                    path_news_val_all, batch_size,
                                    fasttext_dict, max_words),
                                validation_steps=val_size // batch_size,
                                callbacks=[checkpoint])
Example #27
    def _load_subword_embeddings(self):
        if not os.path.exists(self.word_embedding_file):
            raise Exception("{} is not found!".format(
                self.word_embedding_file))

        model = FastText.load_fasttext_format(self.word_embedding_file)
        embedding_size = model["and"].size
        unknown_vec = np.random.uniform(-0.25, 0.25, embedding_size)

        embeddings = [unknown_vec] * (self.n_words())
        embeddings[0] = np.zeros(embedding_size)
        for word in self.word2index:
            try:
                embeddings[self.word2index[word]] = model[word]
            except:
                # self.word2index[word] = self.word2index[self.UNK_TOKEN]
                pass

        self.word_embedding_size = len(embeddings[0])
        embeddings = np.array(embeddings, dtype=np.float32)

        return embeddings
Example #28
    def load_model(self):
        files, dirs = get_dir(self.sourceDict["smallPath"])
        fileToLoad = [file for file in files if ".bin" in file]
        if len(fileToLoad) > 0:
            try:
                print("Loading with FastText.load")
                model = FastText.load(fileToLoad[0])
                print(fileToLoad[0] + " was loaded.")
                return model
            except:
                try:
                    print("Loading with FastText.load_fasttext_format")
                    model = FastText.load_fasttext_format(fileToLoad[0])
                    print(fileToLoad[0] + " was loaded.")
                    return model
                except:
                    print("Unable to load " + fileToLoad[0] + " file. Please retrain or redl weights.")


        else:
            print("No model found. Please dl pretrained weights or train custom ones.")
            return
Example #29
def load_pretrained_fasttext():
    return FastText.load_fasttext_format("../../../corpus/analyzed/saved_models/wiki.si.bin")
Example #30
def load_homemade_fasttext():
    # return FastText.load_fasttext_format("../../../corpus/analyzed/saved_models/fasttext_model_skipgram_300.bin")
    # return FastText.load_fasttext_format("../../../corpus/analyzed/saved_models/fasttext_model_skipgram_remove300_5.bin")
    return FastText.load_fasttext_format("../../../corpus/analyzed/saved_models/xxx.bin")
Example #31
from gensim.models.fasttext import FastText as ft

MODEL = "../model/cc.fr.300.bin"
MODEL2 = "../model/fr.bin"

WORDS = ["Ah non, c'est de la merde", 'Trop biiiieeenn', 'Excellent putain !']
tokens = ['enedis', 'energie', 'heureux', 'machine', 'propre', 'merde']

#classifier = load_model(MODEL)

model = ft.load_fasttext_format(MODEL2)

oov_vector = model[tokens]
print(oov_vector)
Example #32
 def __init__(self, language='en'):
     if language == 'en':
         super().__init__(FastText.load_fasttext_format(config.pretrained_fasttext_path))
     elif language == 'de':
         super().__init__(FastText.load_fasttext_format(config.pretrained_fasttext_de_path))
     print("the fasttext model loaded")
Example #33
def write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    wg_path = '/home/adam/Documents/Magisteruppsats_VT18/ddata/word_embeddings/corpora/w2v_newsa_min1'
    wsm = gs.models.Word2Vec.load(wg_path)
    cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/cc.sv.300.bin'
    csm = FastText.load_fasttext_format(cg_path)
    #cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/saldo_embeddings_window5_skipgram_negsampling_fasttext'
    #csm = FastText.load(cg_path)
    epit = epitran.Epitran('swe-Latn')

    col_names = [
        'sw1_charemb_score', 'sw2_charemb_score', 'blend_charemb_score',
        'sw1_sw2_charemb_sim', 'sw1_blend_charemb_sim',
        'sw2_blend_charemb_sim', 'sw1_wordemb_score', 'sw2_wordemb_score',
        'blend_wordemb_score', 'sw1_blend_wordemb_sim',
        'sw2_blend_wordemb_sim', 'sw1_sw2_wordemb_sim', 'splits',
        'sw1_sw2_char_bigramsim', 'sw2_sw1_char_bigramsim',
        'sw1_sw2_char_trigramsim', 'sw2_sw1_char_trigramsim', 'lcs_sw1_sw2',
        'sw1_blend_IPA_lev_dist', 'sw2_blend_IPA_lev_dist',
        'sw1_sw2_IPA_lev_dist', 'sw1_blend_lev_dist', 'sw2_blend_lev_dist',
        'sw1_sw2_lev_dist', 'sw1_graphemes', 'sw2_graphemes', 'sw1_syllables',
        'sw2_syllables', 'sw1_len', 'sw2_len', 'sw1_contrib', 'sw2_contrib',
        'sw1_sw2_removal', 'sw1_aff_c', 'sw1_N_c', 'sw2_aff_c', 'sw2_N_c',
        'sp1', 'sp2', 'sp3', 'LABEL', 'BLEND', 'CW1', 'CW2', 'CW1_split',
        'CW2_split'
    ]

    csvf = open('overlap_splitp_040918.csv'.format(lexicon), '+w', newline='')
    csvw = csv.DictWriter(csvf, delimiter=',', fieldnames=col_names)

    T, F = 0, 0

    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'

    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)

    # overlap
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    # noverlap
    #candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blends_candidates_noverlap_1/'

    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        print('#', i, 'reading', blend)
        with open(candidate_folder + filename) as f:

            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                sw1, sw2 = gold_blends[blend]

                #print('### blend:', blend, 'gold:', (sw1, sw2), 'sample:', (cw1, cw2))
                feature_set = extract_sample_features(blend, cw1, cw2, lexicon,
                                                      corpus, sw1, sw2, freqd,
                                                      wsm, csm, epit)
                for features, label in feature_set:
                    #entry = list(map(lambda x: str(x), features.values()))
                    csvw.writerow(features)

    csvf.close()
Example #34
 def setUp(self):
     ft_home = os.environ.get('FT_HOME', None)
     self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
     self.test_model_file = datapath('lee_fasttext')
     self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
     self.test_new_model_file = datapath('lee_fasttext_new')
Example #35
 def test_load_model_supervised(self):
     with self.assertRaises(NotImplementedError):
         FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext'))
Example #36
from tqdm import tqdm
from gensim.models.fasttext import FastText
from torch.autograd import Variable
from torch import nn
from pytorch_pretrained_bert import BertAdam
from pytorch_pretrained_bert.optimization import WarmupHalfLinearSchedule, WarmupCosineWithWarmupRestartsSchedule
from loss import FocalLoss

random.seed(0)
vis = visdom.Visdom()
EPOCH = 2000
jieba.load_userdict('bert-model/dict-traditional.txt')
jieba.suggest_freq('<newline>', True)

# Load vocabularies
word2vec = FastText.load_fasttext_format('bert-model/wordvec-large.dim1024')
vocab = {}
idx2vocab = {}
vec = []
with open('bert-model/TF.csv') as TF:
    print('Building word vectors...')
    for idx, line in enumerate(tqdm(TF)):
        term = line.split(',')[0]
        vocab[term] = idx
        idx2vocab[idx] = term
        vec.append(word2vec[term])

del word2vec

# BERT Model
model = modeling.BertNoEmbed(vocab=vocab, hidden_size=1024, enc_num_layer=3)