Beispiel #1
0
def train_embedding_fasttext():
    """Generate character-level and word-level embeddings with fasttext.

    Four models are trained one after another: Skipgram and CBOW on
    ``CHAR_LEVEL_CORPUS``, then Skipgram and CBOW on ``WORD_LEVEL_CORPUS``.
    Every run uses the same hyper-parameters and an output name under
    ``MODEL_DIR``.

    :return: None
    """
    hyper_params = {'word_ngrams': 2, 'ws': 5, 'min_count': 10, 'dim': 256}

    def _fit(message, algorithm, trainer, corpus, output_name):
        # Log the run, train, and let the returned model object go out of
        # scope immediately (the original deleted it explicitly).
        logging.info(message, output_name, algorithm)
        trainer(corpus, os.path.join(MODEL_DIR, output_name), **hyper_params)

    # Character-level corpus
    _fit('generating CHAR embedding %s with fasttext using %s algorithm',
         'Skipgram', fasttext.skipgram, CHAR_LEVEL_CORPUS,
         'char2vec_fastskip256')
    _fit('generating CHAR embedding %s with fasttext using %s algorithm',
         'CBOW', fasttext.cbow, CHAR_LEVEL_CORPUS,
         'char2vec_fastcbow256')

    # Word-level corpus
    _fit('generating WORD embedding %s with fasttext using %s algorithm',
         'Skipgram', fasttext.skipgram, WORD_LEVEL_CORPUS,
         'word2vec_fastskip256')
    _fit('generating WORD embedding %s with fasttext using %s algorithm',
         'CBOW', fasttext.cbow, WORD_LEVEL_CORPUS,
         'word2vec_fastcbow256')
def execute():
    """Train a fasttext model as described by the module-level ``args`` list.

    Mandatory flags are ``-i`` (training file) and ``-t`` (model type, one
    of ``supervised``, ``skipgram`` or ``cbow``). ``--epoch``, ``--ngrams``
    and ``--label`` are read through ``get_optional_param`` with defaults
    5, 1 and ``'__label__'``.

    :return: the temporary file name handed to fasttext as the output
        name, or a string starting with ``"ERROR:"`` when the arguments
        cannot be used.
    """
    import os  # function-scope import: only needed to close the mkstemp handle

    # Verify that mandatory arguments are present
    if "-i" not in args:
        return "ERROR: No input file was given"

    if "-t" not in args:
        return "ERROR: No model type was given"

    # Extract arguments
    train_file = args[args.index("-i") + 1]
    model_type = args[args.index("-t") + 1]

    # Reject unknown model types up front instead of returning the name of
    # an empty temporary file that was never trained into.
    if model_type not in ("supervised", "skipgram", "cbow"):
        return "ERROR: Unknown model type was given"

    # Extract optional arguments
    epoch = get_optional_param('--epoch', 5)
    ngrams = get_optional_param('--ngrams', 1)
    label_prefix = get_optional_param('--label', '__label__')

    # Create temporary file. mkstemp() returns an already-open OS-level
    # descriptor; only the name is needed, so close it to avoid leaking it.
    tmp, modelname = tempfile.mkstemp()
    os.close(tmp)

    # Use the specified trainer and output the model to the temporary name
    trainer = getattr(fasttext, model_type)
    trainer(train_file, modelname, epoch=epoch, word_ngrams=ngrams,
            label_prefix=label_prefix)

    # Return the temporary file name
    return modelname
Beispiel #3
0
 def train(self, txt_path, config=DEF_CONFIG):
     """Train the fasttext model selected by ``self.mode``.

     ``txt_path`` is the training text, ``self.model_path`` the output
     name, and ``config`` is expanded into keyword arguments. The result
     is stored on ``self.model``. Any mode other than ``skipgram``,
     ``cbow`` or ``supervised`` leaves ``self.model`` untouched.
     """
     if self.mode in ("skipgram", "cbow", "supervised"):
         # The mode name matches the fasttext training function name.
         trainer = getattr(fasttext, self.mode)
         self.model = trainer(txt_path, self.model_path, **config)
Beispiel #4
0
def train_embedding_fasttext():
    """Train Skipgram and CBOW embeddings for the char and word corpora.

    Inputs and outputs are all addressed by prefixing ``model_dir``; the
    four runs share the same hyper-parameters.
    """
    hyper_params = dict(word_ngrams=2, ws=5, min_count=10, dim=256)

    # (training function name, corpus file, output name), in run order.
    runs = (
        ('skipgram', 'train_char.txt', 'char2vec_fastskip256'),
        ('cbow', 'train_char.txt', 'char2vec_fastcbow256'),
        ('skipgram', 'train_word.txt', 'word2vec_fastskip256'),
        ('cbow', 'train_word.txt', 'word2vec_fastcbow256'),
    )
    for trainer_name, corpus_file, output_name in runs:
        trainer = getattr(fasttext, trainer_name)
        # The returned model object is not kept.
        trainer(model_dir + corpus_file, model_dir + output_name,
                **hyper_params)
Beispiel #5
0
def trainCBOW(inFile,mdlfile):
    """Train a CBOW model on ``inFile`` with ``mdlfile`` as the output name.

    The hyper-parameters are fixed; the trained model object is not
    returned.
    """
    settings = {
        'lr': 0.1,
        'dim': 200,
        'epoch': 10,
        'word_ngrams': 3,
        'bucket': 5000000,
    }
    fasttext.cbow(input_file=inFile, output=mdlfile, **settings)
Beispiel #6
0
def fasttext_model_from_file(file_path):
    """Load the cached fasttext model for ``file_path``, training it if needed.

    The cache location is ``const.GENERATED_DATA_DIR`` joined with
    ``const.FASTTEXT_PREFIX`` plus the last ``/``-separated component of
    ``file_path``. When loading ``<cache>.bin`` raises ``ValueError`` a
    CBOW model is trained on ``file_path`` instead.

    :param file_path: path of the training text.
    :return: the loaded or freshly trained model object.
    """
    save_file_name = os.path.join(
        const.GENERATED_DATA_DIR,
        const.FASTTEXT_PREFIX + file_path.split('/')[-1])
    try:
        model = fasttext.load_model(save_file_name + '.bin', encoding='utf-8')
        logging.info('model loaded:' + save_file_name)
    except ValueError:
        # Train to the same name the loader looks for. The original trained
        # to a bare file name without GENERATED_DATA_DIR, so the next call
        # could never find the model and retrained every time.
        model = fasttext.cbow(file_path, save_file_name,
                              encoding='utf-8', min_count=1, lr=0.1)
    return model
def text2vec_fast(data_file, method='cbow', modelname='model', **kargs):
    """Train fasttext vectors on a cleaned copy of ``data_file``.

    :param data_file: raw input passed to ``clean_data``.
    :param method: ``'skipgram'`` selects skipgram; anything else CBOW.
    :param modelname: fasttext output name; vectors are read back from
        ``modelname + '.vec'``.
    :param kargs: extra keyword arguments forwarded to the trainer.
    :return: the ``(vector_dict, index_dict)`` pair produced by
        ``extract_word_vectors``.
    """
    clean_data(data_file, 'cleaned.txt')
    try:
        trainer = fasttext.skipgram if method == 'skipgram' else fasttext.cbow
        trainer('cleaned.txt', modelname, **kargs)
        vector_dict, index_dict = extract_word_vectors(modelname + '.vec')
    finally:
        # Always drop the intermediate file, including when training or
        # vector extraction fails (the original left it behind).
        if os.path.exists('cleaned.txt'):
            os.remove('cleaned.txt')
    return vector_dict, index_dict
def run_model(db_path, model_path, model_type, model_params):
    """Train a fasttext model on the texts stored in a SQLite database.

    Every non-NULL ``preprocessed`` value of ``rss_data`` (ordered by
    ``cachedate`` descending) is written, followed by ``'.'`` and a
    newline, to ``__ft_temp.txt`` inside ``model_path``. fasttext is then
    trained on that file with output name ``<model_path>/fasttext_model``.

    :param db_path: path of the SQLite database.
    :param model_path: output directory; created when missing.
    :param model_type: ``'cbow'`` selects CBOW, anything else skipgram.
    :param model_params: dict of optional overrides with keys ``ngram``,
        ``dim``, ``ws``, ``epoch``, ``loss``, ``min_count`` and ``silent``.
    :return: the model object returned by fasttext.
    """
    print('Training FastText model...')
    start_time = clock()

    temp_fpath = os.path.join(model_path, '__ft_temp.txt')
    model_fpath = os.path.join(model_path, 'fasttext_model')
    get_text_command = 'SELECT preprocessed FROM rss_data ORDER BY cachedate DESC'

    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()

            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            results = cursor.execute(get_text_command)
            with open(temp_fpath, 'w', newline='', encoding='utf-8') as tmpfile:
                for r in results:
                    if r[0] is not None:
                        tmpfile.write(r[0] + '.\n')
        finally:
            # Release the connection even when the dump fails.
            conn.close()

        # Both trainers take the same arguments, so pick one and call once.
        trainer = fasttext.cbow if model_type == 'cbow' else fasttext.skipgram
        model = trainer(temp_fpath,
                        model_fpath,
                        word_ngrams=model_params.get('ngram', 1),
                        dim=model_params.get('dim', 100),
                        ws=model_params.get('ws', 5),
                        epoch=model_params.get('epoch', 5),
                        loss=model_params.get('loss', 'ns'),
                        silent=model_params.get('silent', 1),
                        min_count=model_params.get('min_count', 5))
    finally:
        # The temporary corpus must not outlive the call, success or not.
        if os.path.exists(temp_fpath):
            os.remove(temp_fpath)

    end_time = clock()
    print('Model trained in {} seconds.'.format(int(end_time - start_time)))

    return model
Beispiel #9
0
def load_pretrained_fasttext():
    """Train a CBOW model on the sample corpus and query similar words.

    ``fasttext.cbow`` is run on the hard-coded ``test3.txt`` corpus with
    output name ``'model'``; a model is then loaded through
    ``gft.FastText.load_fasttext_format`` from the hard-coded
    ``test3.model`` path and queried.

    NOTE(review): the training output name (``'model'``) and the path that
    is loaded afterwards differ - confirm this is intended.

    :return: the result of ``most_similar(positive=['마법'])``.
    """
    # The original also printed help(fasttext) (a debugging leftover that
    # pages the whole module documentation) and wrapped every failure in a
    # bare ``Exception``, which discarded the original exception type.
    # Errors now propagate unchanged; ``except Exception`` callers still
    # catch them.
    fasttext.cbow(
        '/home/dev/tensormsa_jupyter/chap05_nlp/wordembedding/data/test3.txt',
        'model')
    model = gft.FastText.load_fasttext_format(
        '/home/dev/tensormsa_jupyter/chap05_nlp/wordembedding/data/test3.model'
    )
    return model.most_similar(positive=['마법'])
def fasttext_vectoriser_cbow(data):
    """Train a 200-dimensional CBOW model on ``data`` and return its vectors.

    One vector is collected per entry of ``model.words``; the resulting
    array is reshaped to the ``(n_sample_size, n_features)`` pair reported
    by ``size_of_vector_and_n_features_finder``.
    """
    model = fasttext.cbow(data, 'model', dim=200)
    # NOTE(review): ``df`` is not a parameter of this function; it is read
    # from the enclosing module - confirm it should not be ``data``.
    n_sample_size, n_features = size_of_vector_and_n_features_finder(df, model)
    stacked = np.asarray([model[word] for word in model.words])
    logger.info("previous array shape %s", stacked.shape)
    reshaped = stacked.reshape(n_sample_size, n_features)
    logger.info("reshaped array %s", reshaped.shape)
    return reshaped
Beispiel #11
0
def train(inp = "wiki2.he.text",out_model = "wiki.he.fasttext.model", alg = "CBOW"):
    """Train a fasttext model on ``inp`` and report the elapsed time.

    ``alg == "skipgram"`` selects skipgram; any other value trains CBOW.
    The vocabulary and the elapsed seconds are printed, then
    ``model.save(out_model)`` is called.
    """
    started_at = time.time()
    trainer = fasttext.skipgram if alg == "skipgram" else fasttext.cbow
    model = trainer(inp, out_model)
    print(model.words) # list of words in dictionary
    print(time.time()-started_at)
    model.save(out_model)
Beispiel #12
0
 def create_embeddings(self,
                       method='skipgram',
                       data=None,
                       overwrite=False,
                       dim=128,
                       context_size=5,
                       epochs=10,
                       min_count=5):
     """Train word embeddings with fasttext, then load them.

     Training only happens when ``overwrite`` is true or
     ``self.ft_model_file`` does not exist yet; ``load_fasttext_model`` is
     called in either case.

     Args:
         method: Either skipgram or cbow
         data: Examples to use to get embeddings (``self.data`` if None)
         overwrite: Whether or not to overwrite existing embeddings file
         dim: Size of word vectors
         context_size: Size of the context window
         epochs: Number of epochs
         min_count: Minimal number of word occurrences to have a vector

     Raises:
         ValueError: if ``method`` is neither skipgram nor cbow
     """
     examples = self.data if data is None else data
     if method not in ('skipgram', 'cbow'):
         raise ValueError('Method must be skipgram or cbow')

     output_name = os.path.join(SAVE_LOCS['embeddings'], self.name)
     needs_training = overwrite or (not os.path.isfile(self.ft_model_file))
     if needs_training:
         self._fasttext_preprocess(examples)
         trainer = fasttext.skipgram if method == 'skipgram' else fasttext.cbow
         trainer(self.ft_input_file,
                 output_name,
                 dim=dim,
                 ws=context_size,
                 epoch=epochs,
                 min_count=min_count)
     self.load_fasttext_model()
Beispiel #13
0
def fasttext_model(tweets,
                   model="skipgram",
                   tweets_col="clean_tweet",
                   label_col="label"):
    """Write the labelled training file and train the requested model.

    ``create_labeled_csv`` is called first; the model is then trained on
    ``data/labels_train.txt``. When ``model`` is neither ``"skipgram"``
    nor ``"cbow"`` the argument itself is returned unchanged.
    """
    create_labeled_csv(tweets, tweets_col, label_col)
    training_file = "data/labels_train.txt"
    if model == "skipgram":
        return fasttext.skipgram(training_file, 'model_skipgram')
    if model == "cbow":
        return fasttext.cbow(training_file, 'model_cbow')
    return model
Beispiel #14
0
def load_fast_text_model(sentences):
    """Load ``fast_text_model.bin``, training a CBOW model if that fails.

    On a load failure every item of ``sentences`` is written to
    ``temp_file.txt``, a CBOW model is trained on it with output name
    ``fast_text_model``, and the temporary file is removed.

    NOTE(review): the sentences are written back to back without any
    separator - confirm each one already ends with a newline.

    :param sentences: iterable of strings used only when training.
    :return: the loaded or freshly trained model object.
    """
    try:
        m = fasttext.load_model('fast_text_model.bin')
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not mistaken for "no cached model".
        print("traning new model")
        try:
            with open('temp_file.txt','w') as temp_file:
                for sentence in sentences:
                    temp_file.write(sentence)
            m = fasttext.cbow('temp_file.txt','fast_text_model')
        finally:
            # Do not leave the temporary corpus behind when training fails.
            try:
                remove('temp_file.txt')
            except OSError:
                pass
        print('model trained')
        return m
    print("trained model loaded")
    return m
    def makeModel(self, method):
        """Train the skipgram or CBOW model unless its ``.vec`` file exists.

        ``method == 'skip'`` trains into ``self.fileModelS`` and stores the
        model on ``self.modelSkip``; ``method == 'cbow'`` trains into
        ``self.fileModelC`` and stores it on ``self.modelCbow``. Training
        is skipped when the matching ``.vec`` file is already present.
        Afterwards the ``.bin`` files of both models are deleted if they
        exist, and the elapsed time is printed.
        """
        print('Creating ' + method + ' model...')
        time_begin = datetime.datetime.now()

        # Both algorithms are trained with the same hyper-parameters.
        hyper_params = dict(lr=0.2,
                            lr_update_rate=100,
                            dim=600,
                            ws=7,
                            epoch=5,
                            min_count=3,
                            neg=5,
                            word_ngrams=3,
                            bucket=2000000,
                            minn=3,
                            maxn=6,
                            thread=12,
                            t=0.0001)

        if method == 'skip' and not os.path.isfile(self.fileModelS + '.vec'):
            self.modelSkip = fasttext.skipgram(self.fileData,
                                               self.fileModelS,
                                               **hyper_params)
        elif method == 'cbow' and not os.path.isfile(self.fileModelC + '.vec'):
            self.modelCbow = fasttext.cbow(self.fileData,
                                           self.fileModelC,
                                           **hyper_params)
        time_end = datetime.datetime.now()

        # Delete the BIN files (not useful for this project)
        for model_prefix in (self.fileModelC, self.fileModelS):
            bin_file = model_prefix + '.bin'
            if os.path.exists(bin_file):
                os.remove(bin_file)

        elapsed = str(time_end - time_begin)
        print("Finished Training %s in %s" % (method, elapsed))
Beispiel #16
0
    def createClassifier(self):
        """Build and train the fasttext classifier.

        CBOW word vectors are trained only when ``saveVectorFile + '.vec'``
        is missing, and the classifier training set is generated only when
        ``trainClassifierFile`` is missing. The supervised model is then
        trained with those vectors as ``pretrained_vectors`` and stored on
        ``self.classifier``. (The original note says the model is saved to
        '/classifier.model.bin'; the output name actually passed is
        ``self.classifierModel``.)

        Return:
        """
        dataSetCreator = DataSetCreator()

        vectors_missing = not os.path.exists(self.saveVectorFile + '.vec')
        if vectors_missing:
            fasttext.cbow(self.combinedDataSaveFile,
                          self.saveVectorFile,
                          dim=self.w2vdim,
                          epoch=self.trainw2vepoch)

        training_set_missing = not os.path.exists(self.trainClassifierFile)
        if training_set_missing:
            dataSetCreator.dataSetforClassify(self.trainClassifierFile)

        trained = fasttext.supervised(
            self.trainClassifierFile,
            self.classifierModel,
            dim=self.w2vdim,
            epoch=self.trainClassifierEpoch,
            label_prefix='__label__',
            pretrained_vectors=self.saveVectorFile + '.vec')
        self.classifier = trained
Beispiel #17
0
    def test_train_cbow_model(self):
        """Train CBOW with explicit hyper-parameters and check the model."""
        # Hyper-parameters, in the positional order ft.cbow is called with.
        lr, dim, ws, epoch, min_count = 0.005, 10, 5, 5, 1
        neg, word_ngrams, loss, bucket = 5, 1, 'ns', 2000000
        minn, maxn, thread = 3, 6, 4
        lr_update_rate, t, silent = 10000, 1e-4, 1

        positional = [lr, dim, ws, epoch, min_count, neg, word_ngrams, loss,
                      bucket, minn, maxn, thread, lr_update_rate, t, silent]
        model = ft.cbow(input_file, output, *positional)

        # The model must report the hyper-parameters it was trained with.
        expected_attributes = (
            ('dim', dim),
            ('ws', ws),
            ('epoch', epoch),
            ('min_count', min_count),
            ('neg', neg),
            ('loss_name', loss),
            ('bucket', bucket),
            ('minn', minn),
            ('maxn', maxn),
            ('lr_update_rate', lr_update_rate),
            ('t', t),
        )
        for attribute, value in expected_attributes:
            self.assertEqual(getattr(model, attribute), value)

        # Both output files must exist.
        for extension in ('.bin', '.vec'):
            self.assertTrue(path.isfile(output + extension))

        # Vectors must have the requested dimension.
        self.assertEqual(len(model['the']), dim)

        # Unicode words must be supported.
        unicode_str = 'Καλημέρα'
        self.assertTrue(unicode_str in model.words)
        self.assertTrue(unicode_str in model)
        self.assertEqual(len(model[unicode_str]), model.dim)
Beispiel #18
0
def train_fasttext_model():
    """Train a skipgram and a CBOW model on the same corpus and compare them.

    Both vocabularies are printed, followed by the two set differences
    between them.
    """
    corpus = "../data/880w_news_title_content_seg_sort_uniq_head_2.txt"

    # Skipgram model
    # equals to: `./fasttext skipgram -input ../data/880w_news_title_content_seg_sort_uniq_head_2.txt -output lxw_model_sg`
    model_sg = fasttext.skipgram(corpus, "../data/lxw_model_sg")
    # Generates ../data/lxw_model_sg.bin and ../data/lxw_model_sg.vec
    print(model_sg.words)    # list of words in dictionary

    # CBOW model
    # equals to: `./fasttext cbow -input ../data/880w_news_title_content_seg_sort_uniq_head_2.txt -output lxw_model_cbow`
    model_cbow = fasttext.cbow(corpus, "../data/lxw_model_cbow")
    # Generates ../data/lxw_model_cbow.bin and ../data/lxw_model_cbow.vec
    print(model_cbow.words)    # list of words in dictionary
    print(type(model_cbow.words))    # <class 'set'>
    # NOTE: the two generated .vec files hold different vectors for the
    # same word.

    # ``a - b`` is a set difference, so the output is labelled as such (the
    # original labelled both lines "intersection"). Author's note: both
    # printed as empty sets.
    print("difference (sg - cbow):{}".format(model_sg.words - model_cbow.words))
    print("difference (cbow - sg):{}".format(model_cbow.words - model_sg.words))
def char_ft_main_more():
    """Train skipgram and CBOW char embeddings for window sizes 3, 5 and 8.

    For each window size the skipgram model is written with suffix ``1``
    and the CBOW model with suffix ``0`` in the output file name.
    """
    import fasttext
    embed_file = 'data/char_embed_corpus.txt'
    name_template = 'data/char_ft_embed_{}_{}.model'
    for windows in [3, 5, 8]:
        # (file-name flag, trainer): skipgram first, then CBOW.
        for flag, trainer in ((1, fasttext.skipgram), (0, fasttext.cbow)):
            out_file = name_template.format(windows, flag)
            trainer(embed_file,
                    out_file,
                    ws=windows,
                    lr=0.1,
                    dim=300,
                    silent=0)
    print('done.')
Beispiel #20
0
    def test_train_cbow_model_default(self):
        """CBOW trained without options must use the fasttext(1) defaults."""
        default_args = default_params.read_file(params_txt)
        model = ft.cbow(input_file, output)

        self.assertEqual(model.model_name, 'cbow')

        # (model attribute, key in the defaults file, conversion), in the
        # order they are checked; ``None`` compares the raw value.
        expectations = (
            ('dim', 'dim', int),
            ('ws', 'ws', int),
            ('epoch', 'epoch', int),
            ('min_count', 'minCount', int),
            ('neg', 'neg', int),
            ('word_ngrams', 'wordNgrams', int),
            ('loss_name', 'loss', None),
            ('bucket', 'bucket', int),
            ('minn', 'minn', int),
            ('maxn', 'maxn', int),
            ('lr_update_rate', 'lrUpdateRate', float),
            ('t', 't', float),
        )
        for attribute, key, convert in expectations:
            raw = default_args[key]
            expected = raw if convert is None else convert(raw)
            self.assertEqual(getattr(model, attribute), expected)
Beispiel #21
0
    def train(self, input_filename=SENTENCES_FILENAME,
              model_filename=None,
              model_type='skipgram'):
        """Train a fasttext model.

        Parameters
        ----------
        input_filename : str, optional
            Filename for input file with sentences.
        model_filename : str, optional
            Filename for model output. Defaults to the skipgram or cbow
            model filename constant matching `model_type`.
        model_type : skipgram or cbow, optional
            Model type.

        Raises
        ------
        ValueError
            If `model_type` is neither 'skipgram' nor 'cbow'.

        """
        # Validate first: previously an unknown model_type with no explicit
        # model_filename reached self.full_filename(None) before the
        # ValueError below could be raised.
        if model_type not in ('skipgram', 'cbow'):
            raise ValueError('Wrong argument to model_type')

        if model_filename is None:
            if model_type == 'skipgram':
                model_filename = FAST_TEXT_SKIPGRAM_MODEL_FILENAME
            else:
                model_filename = FAST_TEXT_CBOW_MODEL_FILENAME

        full_model_filename = self.full_filename(model_filename)
        full_input_filename = self.full_filename(input_filename)

        self.logger.info(
            'Training fasttext {} model on {} to {}'.format(
                model_type, full_input_filename, full_model_filename))
        trainer = fasttext.skipgram if model_type == 'skipgram' else fasttext.cbow
        self.model = trainer(full_input_filename, full_model_filename)

        # Invalidate computed normalized matrix
        self._normalized_matrix = None
Beispiel #22
0
 def train(self):
     """Train skipgram (``self.sg`` truthy) or CBOW and keep the model.

     Both trainers get ``self.src_file`` as input, ``self.model_file`` as
     output name and the same hyper-parameters taken from the instance;
     the result is stored on ``self.model``.
     """
     trainer = skipgram if self.sg else cbow
     self.model = trainer(self.src_file,
                          self.model_file,
                          dim=self.dim,
                          ws=self.ws,
                          min_count=self.min_count,
                          lr=self.lr,
                          lr_update_rate=self.lr_update_rate,
                          epoch=self.epoch,
                          word_ngrams=self.word_ngrams,
                          thread=self.thread)
    def train_model(self, file_input='train_text.txt'):
        """Train the embedding selected by ``self.embedding_type``.

        :param file_input: file name inside ``DATA_DIR`` to train on.
        :return: the trained model, which is also stored on ``self.model``.
        :raises NotImplementedError: when ``self.embedding_type`` is
            neither ``'skipgram'`` nor ``'cbow'``.
        """
        input_path = os.path.join(DATA_DIR, file_input)

        embedding_type = self.embedding_type
        if embedding_type == 'skipgram':
            trainer = fasttext.skipgram
        elif embedding_type == 'cbow':
            trainer = fasttext.cbow
        else:
            raise NotImplementedError()

        trained = trainer(input_path, self.model_path)
        print(trained.words)  # list of words in dictionary

        self.model = trained
        return trained
Beispiel #24
0
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 23 12:38:21 2017

@author: nabeel
     test classification with fast text
"""

import fasttext

# NOTE: this script uses Python 2 print statements.

# Skipgram model: train on data.txt with output name 'model'
model = fasttext.skipgram('data.txt', 'model')
print model.words  # list of words in dictionary

# CBOW model: reuses the same output name 'model' as the skipgram run above
model = fasttext.cbow('data.txt', 'model')
print model.words  # list of words in dictionary

print model['king']  # get the vector of the word 'king'
# Load a model back from model.bin
model = fasttext.load_model('model.bin')
print model.words  # list of words in dictionary
print model['king']  # get the vector of the word 'king'
# Supervised classifier, first with default options, then again with an
# explicit label prefix; only the second classifier is kept. Both calls
# again use the output name 'model'.
classifier = fasttext.supervised('data.train.txt', 'model')
classifier = fasttext.supervised('data.train.txt',
                                 'model',
                                 label_prefix='__label__')
# Evaluate on test.txt and report the metrics exposed by the result object
result = classifier.test('test.txt')
print 'P@1:', result.precision
print 'R@1:', result.recall
print 'Number of examples:', result.nexamples
# Sample inputs; not used by any statement visible in this snippet
texts = ['example very long text 1', 'example very longtext 2']
 def train(self):
     """Train a CBOW model on ``self.train_file`` and return the result.

     ``self.model_path`` is passed to fasttext as the output name.
     """
     trained = fasttext.cbow(self.train_file, self.model_path)
     return trained
    4: 'military',
    5: 'sports'
}

# One pre-tokenised (space-separated) sample text to classify.
# NOTE(review): `classifier` and `label_to_cate` are defined outside this
# snippet.
texts = [
    '中新网 日电 2018 预赛 亚洲区 强赛 中国队 韩国队 较量 比赛 上半场 分钟 主场 作战 中国队 率先 打破 场上 僵局 利用 角球 机会 大宝 前点 攻门 得手 中国队 领先'
]
labels = classifier.predict(texts)
print(labels)
# Map the first predicted label of the first text to its category name
print(label_to_cate[int(labels[0][0])])

# Top-K prediction results (k=3)
labels = classifier.predict(texts, k=3)
print(labels)

import fasttext

# Skipgram model
model = fasttext.skipgram('unsupervised_train_data.txt', 'model')
print(model.words)

# CBOW model (same output name 'model' as the skipgram run above)
model = fasttext.cbow('unsupervised_train_data.txt', 'model')
print(model.words)

# Comparison with gensim's word2vec
# NOTE(review): `Word2Vec` and `sentences` are not defined in this snippet.

model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
model.save("gensim_word2vec.model")
# Bare expressions: their values are only displayed in an interactive session
model.wv['赛季']
model.wv.most_similar('赛季')
Beispiel #27
0
			self.n_thread=th
			self.samplet=t
			self.silent=verbosec
			self.enc=encoding	

	def fit(self,X,modelname='model'):
		"""Train a CBOW model on ``X`` and store it on ``self.model``.

		Input: ``X`` is passed to ``ft.cbow`` as the input file and
		``modelname`` as the output name, together with the
		hyper-parameters held on the instance.
		Returns: None. Any failure is reported with a printed message
		instead of being raised.

		NOTE(review): ``csvflag`` is not defined in this method; it is
		looked up from the enclosing module - confirm where it is set.

		to do: add option to feed list of X and Y or file
		to do: check options for the api call
		to do: write unit test
		"""
		# Indentation is tabs throughout; the original mixed tabs and
		# spaces, which Python 3 rejects with a TabError.
		try:
			if not csvflag:
				self.model=ft.cbow(X, modelname, lr=self.lr, dim=self.dim,lr_update_rate=self.lr_update_rate,epoch=self.epoch,bucket=self.bucket,loss=self.loss,thread=self.n_thread)
		except Exception:
			# ``Exception`` rather than a bare ``except`` so Ctrl-C and
			# interpreter exit are not reported as a data-format problem.
			print("Error in input dataset format")
                    
	def getproperties(self):
		"""Print the hyper-parameters stored on this object.

		Input: nothing other than ``self``.
		Return: None.

		NOTE(review): ``self.min_count`` and ``self.neg`` are assumed to be
		set by ``__init__``; that assignment is outside this snippet.
		"""
		# Fixes over the original: the unbalanced parenthesis on the epochs
		# line, the bare ``lr`` (now ``self.lr``), ``self.epochs`` (``fit``
		# reads ``self.epoch``), and concatenating ``self.min_count`` to a
		# string without ``str()``.
		print("The model has following hyperparameters as part of its specification")
		print("Learning rate :"+ str(self.lr))
		print("Learning rate update after "+str(self.lr_update_rate)+"iterations")
		print("Embedding size: "+str(self.dim))
		print("Epochs :"+ str(self.epoch))
		print("minimal number of word occurences: "+str(self.min_count))
		print("number of negatives sampled :"+str(self.neg))
# Output name for the CBOW model.
# NOTE(review): INPUT_TXT and OUTPUT_PATH_SKIPGRAM are used below but are
# defined outside this snippet.
OUTPUT_PATH_CBOW = '/tmp/cbow'

# Learn the word representation using skipgram model
# (variant with explicit hyper-parameters, kept for reference:)
#skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM, lr=0.02, dim=300, ws=5, \
#        epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6, \
#        thread=4, t=1e-4, lr_update_rate=100)
skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM)

# Get the vector of some word
print(skipgram['word'])

# Learn the word representation using cbow model
# (variant with explicit hyper-parameters, kept for reference:)
#cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW, lr=0.02, dim=300, ws=5, \
#        epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6, \
#        thread=4, t=1e-4, lr_update_rate=100)
cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW)

# Get the vector of some word
print(cbow['word'])

# Load pre-trained skipgram model from <output name>.bin
SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin'
skipgram = fasttext.load_model(SKIPGRAM_BIN)
print(skipgram['word'])

# Load pre-trained cbow model from <output name>.bin
CBOW_BIN = OUTPUT_PATH_CBOW + '.bin'
cbow = fasttext.load_model(CBOW_BIN)
print(cbow['word'])

Beispiel #29
0
def buildEmbedding():
    """Dump all texts to a corpus file and train a 128-dim CBOW model on it.

    The texts come from ``getTextList`` applied to the four module-level
    data frames; each text is written on its own line (UTF-8) to
    ``./dic&corp/text.txt``. The trained model object is not returned.
    """
    corpus_path = './dic&corp/text.txt'
    texts = getTextList(preTrainDF, preTestDF, semiTrainDF, semiTestDF)
    with codecs.open(corpus_path, 'w', 'utf-8') as corpus:
        corpus.writelines(text + '\n' for text in texts)
    fasttext.cbow(corpus_path, 'model.cbow', dim=128)
Beispiel #30
0
# Train skipgram and CBOW word representations on INPUT_TXT, then load each
# model back from the '.bin' file next to its output name.
INPUT_TXT = '/path/to/file.txt'
OUTPUT_PATH_SKIPGRAM = '/tmp/skipgram'
OUTPUT_PATH_CBOW = '/tmp/cbow'

# Learn the word representation using skipgram model.
# The original passed an undefined OUTPUT_PATH here (NameError); the
# skipgram output name is the one reloaded as SKIPGRAM_BIN below.
skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM, lr=0.02, dim=300, ws=5,
        epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
        thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(skipgram['word'])

# Learn the word representation using cbow model (same OUTPUT_PATH fix).
cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW, lr=0.02, dim=300, ws=5,
        epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
        thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(cbow['word'])

# Load pre-trained skipgram model
SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin'
skipgram = fasttext.load_model(SKIPGRAM_BIN)
print(skipgram['word'])

# Load pre-trained cbow model
CBOW_BIN = OUTPUT_PATH_CBOW + '.bin'
cbow = fasttext.load_model(CBOW_BIN)
print(cbow['word'])
Beispiel #31
0
# Build the fasttext training corpus: one line per row of `merge`, made of
# the lower-cased category, description and name tokens with non-ASCII
# characters dropped.
# The file is managed by `with` so it is closed even if a row fails.
# Each field is decoded back to text after the ASCII filter, because
# '%s' % bytes on Python 3 writes the literal "b'...'" into the corpus.
# The values are already lower-cased by .str.lower(), so the original
# second .lower() on each field was a no-op and is dropped.
with open('ftext_name.txt', 'w') as fonm:
    for nm, ct, ds in zip(merge.name_token.str.lower(),
                          merge.category_name_split.str.lower(),
                          merge.description_token.str.lower()):
        fonm.write(
            '%s %s %s\n' %
            (ct.encode('ascii', 'ignore').decode('ascii'),
             ds.encode('ascii', 'ignore').decode('ascii'),
             nm.encode('ascii', 'ignore').decode('ascii')))

print('[{}] Start fasttext training'.format(time.time() - start_time))
# Train a small CBOW model on the corpus; the output name is 'model'.
model = fasttext.cbow('ftext_name.txt',
                      'model',
                      dim=24,
                      ws=4,
                      lr=.05,
                      min_count=1,
                      thread=8,
                      epoch=4,
                      silent=0)
# Reload the trained model from model.bin through the FastText class, which
# provides get_numpy_sentence_vector.
modelcb = FastText('model.bin')
print('[{}] Start fasttext mat creation'.format(time.time() - start_time))

# One 24-dimensional sentence vector per row (24 matches dim above), built
# from the category name (with '/' replaced by spaces) and the item name.
ftmat = np.zeros((merge.shape[0], 24))
for c, vals in tqdm(enumerate(merge[['category_name', 'name']].values)):
    ftmat[c] = modelcb.get_numpy_sentence_vector(
        '%s %s' % (vals[0].replace('/', ' '), vals[1]))
ftmat = pd.DataFrame(ftmat)
print('[{}] Finished fasttext mat creation'.format(time.time() - start_time))
ftmat.head()
'''