def train_save_word_embeddings(corpus_file_path, algo, vector_dim, word2vec_format=False, save_model=False,
                               architecture='cbow', train_algorithm='negative', workers=4):
    """architecture: 'skim-gram' or 'cbow'. train_algorithm: 'softmax' or 'negative'"""
    sentences = MySentences(corpus_file_path)
    arch = 1 if architecture == 'skip-gram' else 0
    train = 1 if train_algorithm == 'softmax' else 0
    print('Training %s with size %d' % (algo, vector_dim))
    if algo == 'word2vec':
        model = Word2Vec(sentences=sentences, size=vector_dim, workers=workers, sg=arch, hs=train)
    else:  # fasttext
        model = FastText(sentences=sentences, size=vector_dim, workers=workers, sg=arch, hs=train)
    print('Done!')
    s = algo + '_'
    for name in corpus_file_path:
        s += name.split('/')[-1].split('.')[0] + '_'
    filename = "{0}{1}_{2}_{3}".format(s, vector_dim, architecture, train_algorithm)
    if save_model:
        print('Saving model in {0}.model'.format(filename))
        model.save(os.path.join(word2vec_path, "{}.model".format(filename)))
    if word2vec_format:
        print('Saving word embeddings in original C word2vec (.txt) format in {}.txt'.format(filename))
        model.wv.save_word2vec_format(os.path.join(word2vec_path, "{}.txt".format(filename)))
    print('Saving word embeddings in {0}.kv'.format(filename))
    model.wv.save(os.path.join(save_path(algo), "{}.kv".format(filename)))
    print('Saved!')
    return model
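A minimal, hedged usage sketch for the helper above; MySentences, word2vec_path and save_path are assumed to be defined in the surrounding module, and the corpus path is a placeholder.

# Hypothetical call; the corpus path and the query word are placeholders.
if __name__ == '__main__':
    corpus_files = ['data/wiki_corpus.txt']
    model = train_save_word_embeddings(corpus_files, algo='fasttext', vector_dim=100,
                                       word2vec_format=True, save_model=True,
                                       architecture='skip-gram', train_algorithm='negative')
    print(model.wv.most_similar('computer', topn=5))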
def fastText(root, sentences, embedding_size=200, model_type=0, min_count=1, workers=4):
    """
    Trains and saves a fastText model on a given list of sentences

    Params:
        root: directory/filename prefix for the saved model files
        sentences: list of tokenized sentences
        embedding_size: size of the embeddings
        model_type: int, 0: CBOW, 1: skip-gram
        min_count: ignore words with a total frequency lower than this
        workers: number of workers for parallelization

    The saved file stores the model in a format compatible with the original word2vec implementation.

    https://groups.google.com/forum/#!topic/gensim/RhiwLU0vP1A

    """
    logger.info("Train the fasttext model")
    model = FastText(sentences, min_count=min_count, workers=workers, sg=model_type, size=embedding_size)

    logger.info("Save the trained model")
    model.save(root+'fasttext_model')

    logger.info("Save the model in word2vec format")
    fname = 'fasttext_word2vecFormat'
    model.wv.save_word2vec_format(root+fname)
Example #3
class ModelFastText:
    def __init__(self, path, existModel=False):
        if existModel:
            self.loadModel(path)
        else:
            self.createModel(path)

    def createModel(self, pathTrain, size=300, min_count=50, sg=1, workers=8, progress_per=50000):
        self.model = FT_gensim(size=size, min_count=min_count, sg=sg, workers=workers)
        sentences = datapath(pathTrain)
        self.model.build_vocab(corpus_file=sentences, progress_per=progress_per)

    def loadModel(self, path):

        self.model = FT_gensim.load(path)

    def trainModel(self, pathTrain, epochs=5, compute_loss=False):
        sentences = datapath(pathTrain)

        self.model.train(corpus_file=sentences, epochs=epochs,
                         total_examples=self.model.corpus_count,
                         total_words=self.model.corpus_total_words,
                         compute_loss=compute_loss)

    def saveModel(self, nameFile):
        self.model.save(nameFile+".model")

    def getSimilar(self, word):
        return self.model.wv.most_similar(word, topn=50)
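A hedged driver for ModelFastText; 'lee_background.cor' ships with gensim's test data, so datapath() can resolve it, and the query word is only an example.

# Hypothetical usage of the class above.
ft = ModelFastText('lee_background.cor')       # builds the vocabulary
ft.trainModel('lee_background.cor', epochs=5)  # trains on the same corpus file
ft.saveModel('my_fasttext')                    # writes my_fasttext.model
print(ft.getSimilar('government')[:5])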
Example #4
def build_fast_text_model(fasttext_entity_path):
    # build fastText

    fasttext_params = {
        "hs": 1,
        "window": 10,
        "min_count": 1,
        "workers": 7,
        "min_n": 1,
        "max_n": 10,
    }

    print("building corpus")

    entity_corpus = [entity for entity in entity_generator(entity_collection)]
    fasttext_entity = FastText(**fasttext_params)

    print("count corpus")
    fasttext_entity.build_vocab(sentences=entity_corpus)
    total_examples = fasttext_entity.corpus_count

    print("train fasttext")
    fasttext_entity.train(sentences=entity_corpus,
                          total_examples=total_examples,
                          epochs=5)

    print("saving fasttext")

    fasttext_entity.save(fasttext_entity_path)

    return fasttext_entity
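build_fast_text_model depends on a module-level entity_collection and entity_generator that are not shown; a minimal sketch of what they might look like, assuming each entity carries a pre-tokenized 'tokens' field (both names here are hypothetical stand-ins):

# Hypothetical stand-ins for the globals used above.
entity_collection = [
    {"tokens": ["acme", "corporation"]},
    {"tokens": ["acme", "labs"]},
]

def entity_generator(collection):
    # yield one token list per entity, the sentence format gensim expects
    for entity in collection:
        yield entity["tokens"]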
def generate_model():
    comments = []
    for comment in unlabeledData["comment"]:
        comments += to_separate_sentences(comment)

    print("# of comments taken for building the model: " + str(len(comments)))

    num_features = 1000  # Word vector dimensionality
    context = 10  # Context window size
    downsampling = 1e-3  # Downsample setting for frequent words
    min_word_count = 1  # Minimum word count - drop words that occur less often than this
    num_workers = 4  # Number of threads to run in parallel

    model = FastText(comments,
                     workers=num_workers,
                     size=num_features,
                     min_count=min_word_count,
                     window=context,
                     sample=downsampling,
                     sg=1,
                     iter=50)
    # model.init_sims(replace=True)  # If you don't plan to train the model any further
    model.save(fasttext_model)

    check_model_qulity(model, 'නැහැ')
    return
Example #6
def main():

    skills_train = []
    for each_skill in data["clean_skills"]:
        if each_skill != ['']:
            skills_train.append(each_skill)

    embedding_size = 60
    window_size = 40
    min_word = 5
    down_sampling = 1e-2

    model = FastText(skills_train,
                     size=embedding_size,
                     window=window_size,
                     min_count=min_word,
                     sample=down_sampling,
                     sg=1,
                     iter=100)

    model.init_sims(replace=True)
    print(model)

    fname = get_tmpfile("fasttext.model")
    model.save(fname)
Example #7
class FastTextTrainer(object):
    """
    Perform training and save gensim FastText
    """

    def __init__(self, min_count=2, size=200, workers=4, window=3, iter=10):
        self.min_count = min_count
        self.size = size
        self.workers = workers
        self.window = window
        self.iter = iter
        self.model = None

    def train(self, corpus, sg=0, callbacks=()):
        self.model = FastText(corpus, callbacks=callbacks, min_count=self.min_count, size=self.size,
                              workers=self.workers, window=self.window, iter=self.iter, sg=sg)

    def save(self, filename):
        self.model.save(filename)

    def get_model(self):
        return self.model

    def load_model(self, filename):
        return FastText.load(filename)

    def load_google_model(self, filename):
        return FastText.load_fasttext_format(filename)

    def retrain(self, model, corpus, sg=0, iter=10, callbacks=()):
        model.train(corpus, total_examples=model.corpus_count, epochs=iter, callbacks=callbacks)
        self.model = model
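A short, hedged example of driving FastTextTrainer end to end with an inline toy corpus; the filename is a placeholder.

# Toy corpus; real input would be an iterable of tokenized sentences.
corpus = [["hello", "world"], ["hello", "gensim"], ["fasttext", "embeddings"]]
trainer = FastTextTrainer(min_count=1, size=50, iter=5)
trainer.train(corpus, sg=1)
trainer.save("trainer_fasttext.model")
reloaded = trainer.load_model("trainer_fasttext.model")
print(reloaded.wv.most_similar("hello", topn=2))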
Example #8
def test_online_learning_after_save(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(tmpf)
    model_neg = FT_gensim.load(tmpf)
    self.assertEqual(len(model_neg.wv.vocab), 12)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
def build_model(data_path,
                sg=0,
                hs=0,
                negative=5,
                size=100,
                min_count=5,
                workers=8,
                window=5,
                model_type='word2vec'):
    if model_type == 'word2vec':
        model = Word2Vec(LineSentence(data_path),
                         sg=sg,
                         workers=workers,
                         min_count=min_count,
                         size=size,
                         hs=hs,
                         negative=negative,
                         window=window)
    elif model_type == 'fasttext':
        model = FastText(LineSentence(data_path),
                         sg=sg,
                         workers=workers,
                         min_count=min_count,
                         size=size,
                         hs=hs,
                         negative=negative,
                         window=window)
    else:
        raise Exception('Model type is not supported.')
    vectors = model.wv.vectors
    vocabulary = model.wv.index2word
    with open('index_to_vectors.txt', 'w+', encoding='utf-8') as vectors_file:
        content = '\n'.join([
            '{}\t{}'.format(i + 1, ','.join([str(item) for item in vector]))
            for i, vector in enumerate(vectors)
        ])
        content = '{}\t{}\n'.format(0, ','.join(['0'] * size)) + content
        vectors_file.write(content)
    with open('data/word_to_vectors.txt', 'w+',
              encoding='utf-8') as vectors_file:
        content = '\n'.join([
            '{}\t{}'.format(vocabulary[i],
                            ','.join([str(item) for item in vector]))
            for i, vector in enumerate(vectors)
        ])
        content = '{}\t{}\n'.format('<pad>', ','.join(['0'] * size)) + content
        vectors_file.write(content)
    with open('data/vocabulary.txt', 'w+', encoding='utf-8') as vocab_file:
        content = '\n'.join([
            '{}\t{}'.format(i + 1, str(vocab))
            for i, vocab in enumerate(vocabulary)
        ])
        content = '{}\t{}\n'.format(0, '<pad>') + content
        vocab_file.write(content)
    model_path = '{}.model'.format(model_type)
    model.save(model_path)
    return model_path
Example #11
def test_online_learning_after_save(self):
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(testfile())
    model_neg = FT_gensim.load(testfile())
    self.assertEqual(len(model_neg.wv.vocab), 12)
    self.assertEqual(len(model_neg.wv.ngrams), 202)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
    self.assertEqual(len(model_neg.wv.ngrams), 271)
Example #12
def fasttext_train_model():
    # read corpus
    with open(corpus_dir) as fp:
        corpus = [line.split() for line in fp]  # list of token lists, one per line

    model = FastText(size=embed_size, window=3, min_count=1)  # instantiate
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus),
                epochs=epoch)  # train
    model.save(model_dir)  # save model
Example #13
def create_FastText_model(skip_gram, tokenized_sentences, model_path):
    try:
        model = FastText.load(model_path)
    except Exception:
        # vector size belongs in the constructor; train() has no such parameter
        model = FastText(size=5, min_count=1, window=5, sg=skip_gram)
        model.build_vocab(sentences=tokenized_sentences)
        model.train(sentences=tokenized_sentences, total_examples=len(tokenized_sentences), epochs=10)

        model.save(model_path)

    return model
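A hedged invocation of create_FastText_model with a toy tokenized corpus; the model path is a placeholder.

# Hypothetical call: trains on the first run, loads the saved model afterwards.
sentences = [["deep", "learning"], ["machine", "learning"], ["deep", "networks"]]
model = create_FastText_model(skip_gram=1,
                              tokenized_sentences=sentences,
                              model_path="ft_demo.model")
print(model.wv.most_similar("learning", topn=2))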
Example #14
def train_word_vectors(modeltype, sentences, path, sg, size, mincount):
    print(path)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if modeltype == 'word2vec':
        model = Word2Vec(sentences, sg=sg, size=size, min_count=mincount)
        model.save(path)
    elif modeltype == 'fasttext':
        model = FastText(sentences, sg=sg, size=size, min_count=mincount)
        model.save(path)
    else:
        raise Exception("%s is not valid model type" % modeltype)
Example #15
def test_persistence(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.save(tmpf)
    self.models_equal(model, FT_gensim.load(tmpf))
    #  test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(tmpf)
    loaded_wv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
Example #16
def test_persistence(self):
    model = FT_gensim(sentences, min_count=1)
    model.save(testfile())
    self.models_equal(model, FT_gensim.load(testfile()))
    #  test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(testfile())
    loaded_wv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
    self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams))
Example #18
def test_norm_vectors_not_saved(self):
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(testfile())
    loaded_model = FT_gensim.load(testfile())
    self.assertTrue(loaded_model.wv.syn0norm is None)
    self.assertTrue(loaded_model.wv.syn0_ngrams_norm is None)

    wv = model.wv
    wv.save(testfile())
    loaded_kv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(loaded_kv.syn0norm is None)
    self.assertTrue(loaded_kv.syn0_ngrams_norm is None)
Example #19
def generate_model(lang):

    model_gensim = FT_gensim(size=300)

    # build the vocabulary
    model_gensim.build_vocab(corpus_file='embedding/corpus_' + lang)

    # train the model
    model_gensim.train(corpus_file='embedding/corpus_' + lang,
                       epochs=model_gensim.epochs,
                       total_examples=model_gensim.corpus_count,
                       total_words=model_gensim.corpus_total_words)

    # model.save() writes gensim's native format, not the word2vec .vec text format
    model_gensim.save('embedding/fasttext_' + lang + '.model')
Example #20
def main():
    print('Instantiating the model')
    model = FT_gensim(size=100, window=5, min_count=5)  # instantiate the model
    print('Building the vocabulary')
    model.build_vocab(sentences=MyIter())
    total_examples = model.corpus_count
    print('Training the model')
    model.train(sentences=MyIter(), total_examples=total_examples,
                epochs=5)  # train the model

    ## Save the model (can be loaded using gensim)
    print('Saving the model to specified filepath')
    save_file = sys.argv[2]
    model.save(save_file)
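The script above assumes a MyIter class that can be iterated once for build_vocab and again for each training epoch; a minimal sketch, assuming the corpus path arrives as sys.argv[1] with one whitespace-tokenized sentence per line:

# Hypothetical MyIter: a restartable iterator over a tokenized corpus file.
class MyIter:
    def __iter__(self):
        with open(sys.argv[1], encoding='utf-8') as fin:
            for line in fin:
                yield line.split()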
Example #21
def test_norm_vectors_not_saved(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(tmpf)
    loaded_model = FT_gensim.load(tmpf)
    self.assertTrue(loaded_model.wv.vectors_norm is None)
    self.assertTrue(loaded_model.wv.vectors_ngrams_norm is None)

    wv = model.wv
    wv.save(tmpf)
    loaded_kv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(loaded_kv.vectors_norm is None)
    self.assertTrue(loaded_kv.vectors_ngrams_norm is None)
Example #22
def word_embedding(tokens):
    """
    Train word vectors and save the model

    Args:
        tokens (list): word-segmented tokens
    """
    # input must be 2D even if there is only one sentence
    # tokens = [['xx', 'xxx']]
    model = FastText([tokens], min_count=1, size=3)
    get_vector = model.wv['半導體']
    model.save('./data/wb.model')
    print(get_vector)
    print(model.wv.vocab)
Example #23
def test_persistence_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(corpus_file=corpus_file, min_count=1)
        model.save(tmpf)
        self.models_equal(model, FT_gensim.load(tmpf))
        #  test persistence of the KeyedVectors of a model
        wv = model.wv
        wv.save(tmpf)
        loaded_wv = FastTextKeyedVectors.load(tmpf)
        self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
        self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def createFastTextModel(data, isSG, vectorSize, maxNgram, modelFilePath):
    # train model
    model_gensim = FT_gensim(sg=isSG,
                             size=vectorSize,
                             min_count=1,
                             min_n=1,
                             max_n=maxNgram)
    # build the vocabulary
    model_gensim.build_vocab(data)
    # train the model
    model_gensim.train(data,
                       total_examples=model_gensim.corpus_count,
                       epochs=model_gensim.iter)
    # save the model
    model_gensim.save(modelFilePath)
Example #25
def test_online_learning_after_save_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
        model_neg.save(tmpf)
        model_neg = FT_gensim.load(tmpf)
        self.assertEqual(len(model_neg.wv.vocab), 12)
        model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        model_neg.train(corpus_file=new_corpus_file, total_words=model_neg.corpus_total_words,
                        epochs=model_neg.iter)
        self.assertEqual(len(model_neg.wv.vocab), 14)
def embedding_creation(args, full_article):
    word_punctuation_tokenizer = nltk.WordPunctTokenizer()
    word_tokenized_corpus = [
        word_punctuation_tokenizer.tokenize(sent) for sent in full_article
    ]

    embedd_model = FastText(word_tokenized_corpus,
                            size=args.embedding_size,
                            window=args.window_size,
                            min_count=args.min_word,
                            sample=args.down_sampling,
                            sg=0,
                            iter=50)
    print('Finished training; saving model to', args.embeddingPath)
    embedd_model.save(args.embeddingPath + args.embedding_Model)
    return
def fast_text_trainer(df, model_path):
    list_of_tokens = list(df["description"])
    if isinstance(list_of_tokens[0], str):
        # list_of_tokens = [literal_eval(x) for x in list_of_tokens]
        tokenized_data = df[['description']].applymap(lambda s: word_tokenize(s))
        list_of_tokens = list(tokenized_data["description"])
    start_time = time.time()
    model = FastText(sentences=list_of_tokens,
                     sg=1,
                     size=60,
                     window=5,
                     min_count=1,
                     workers=3,
                     iter=30)
    print("Time taken to train the fast_text model: " + str(int((time.time() - start_time) / 60)) + ' minutes\n')
    model.save(model_path)
Example #28
def train_fasttext_model(output_model_path,
                         iter_docs,
                         size=300,
                         window=8,
                         min_count=5,
                         sg=1,
                         epoch=5):
    """
    Parameters
    ----------
    output_model_path : string
        path of fastText model
    iter_docs : callable
        zero-argument callable returning a fresh iterator of tokenized
        documents (it is called once for build_vocab and once for train)
    size : int
        size of word vectors
    window : int
        context window size
    min_count : int
        minimum word count
    sg : int
        training algorithm (1: skip-gram, otherwise CBOW)
    epoch : int
        number of epochs
    """
    logging.info("build vocabularies")

    model = FastText(size=size,
                     window=window,
                     min_count=min_count,
                     sg=sg,
                     workers=multiprocessing.cpu_count())
    model.build_vocab(iter_docs())

    logging.info("train fasttext")

    model.train(iter_docs(), total_examples=model.corpus_count, epochs=epoch)
    model.init_sims(replace=True)

    logging.info("save model")

    p = Path(output_model_path)
    if not p.parent.exists():
        p.parent.mkdir(parents=True)
    model.save(output_model_path)

    logging.info("done.")
Example #29
def instantiateEmbenddingMatrix(corpus, tokenizer,
                                vocabulary_size,sequence_length, 
                                feature_size , window_context = 3, min_word_count = 10,
                                sample = 1e-3, sg=0, overwrite=False, load=True):

    if load == True:
        try:
            embedding_matrix = None
            print("Loading embedding matrix...")
            embedding_matrix = np.genfromtxt('embedding.csv', delimiter=',')
            ft_model = FastText.load("ft_model.model")
            
        except Exception:
            embedding_matrix = None
    else:
        embedding_matrix = None
    if embedding_matrix is None or overwrite or load == False:
        # feature_size: word vector dimensionality; window_context: context window size
        # min_word_count: minimum word count; sample: downsampling of frequent words
        # sg decides whether to use the skip-gram model (1) or CBOW (0)
        ft_model = FastText(corpus, min_n=0, max_n=3,
                            size=feature_size, window=window_context,
                            min_count=min_word_count, sample=sample, sg=sg, iter=1000)

        ft_model.save("ft_model.model")
        print('Preparing embedding matrix...')
        words_not_found = []
        nb_words = vocabulary_size
        word_index = tokenizer.word_index
        embedding_matrix = np.zeros((nb_words, feature_size))
        for word, i in word_index.items():
            if i >= nb_words:
                continue
            try:
                # words whose vector cannot be computed stay all-zeros
                embedding_matrix[i] = ft_model.wv.get_vector(word)
            except KeyError:
                words_not_found.append(word)
        print(embedding_matrix.shape)
        print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
        np.savetxt('embedding.csv', embedding_matrix, delimiter=',')
            
    return embedding_matrix,ft_model
Example #30
def BuildFastText(reviews, label):
    # input: sentences already tokenized into lists
    model = FastText(size=200, window=3, min_count=4)
    model.build_vocab(sentences=reviews)
    model.train(sentences=reviews, total_examples=len(reviews), epochs=50)
    # fname = get_tmpfile("Model/fasttext.model")
    model.save("Model/fasttext.bin")
    print("Save model done!")
    dataf = list()
    for a, review in enumerate(reviews):
        try:
            # average the word vectors of the review
            vectors = model.wv[review]
            sumvec = vectors.mean(axis=0)
            dataf.append(sumvec)
        except Exception:
            # empty or unusable review: fall back to a sentiment anchor word
            sumvec = 0
            if label[a] == 0:
                sumvec = model.wv["positive"]
            if label[a] == 1:
                sumvec = model.wv["negative"]
            dataf.append(sumvec)

    dataf = np.array(dataf)
    return dataf
Example #32
def fast_text_training(merger_seg_path,
                       model_path,
                       min_count=5,
                       embedding_size=200):
    '''
    Train word vectors with fastText
    :param merger_seg_path: path of the segmented corpus file
    :param model_path: where to save the trained model
    :param min_count: minimum word count
    :param embedding_size: dimensionality of the word vectors
    :return:
    '''
    model = FastText(sentences=LineSentence(merger_seg_path),
                     workers=8,
                     min_count=min_count,
                     size=embedding_size)
    model.save(model_path)
    print("Words most similar to 奇瑞:\n", model.wv.most_similar(['奇瑞'], topn=10))
def create_FastText_model(model_path, skip_gram, epochs_count,
                          tokenized_sentences):
    try:
        model = FastText.load(model_path)
    except Exception:
        model = FastText(min_n=1,
                         hs=1,
                         alpha=0.1,
                         min_alpha=0.1,
                         sg=skip_gram)
        model.build_vocab(sentences=tokenized_sentences)
        model.train(sentences=tokenized_sentences,
                    total_examples=len(tokenized_sentences),
                    epochs=epochs_count)

        model.save(model_path)

    return model
def main(args):
    sentences = gensim.models.word2vec.Text8Corpus(args.corpus)

    my_model = FastText(sentences=sentences,
                        size=int(args.size),
                        window=int(args.window),
                        min_count=int(args.min_count),
                        workers=int(args.workers),
                        alpha=float(args.alpha),
                        sample=float(args.subsample),
                        negative=int(args.negative),
                        sorted_vocab=True,
                        iter=int(args.epochs))

    my_model.save(
        os.path.join(file_dir, '../allnews_am/models/', args.model_name))
    analogy_file = os.path.join(file_dir,
                                '../allnews_am/data/yerevann_analogies.txt')
    my_model.wv.evaluate_word_analogies(analogy_file)
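A hedged sketch of the command-line wiring the last main(args) implies; the flag names mirror the attributes it reads, and every default here is a guess.

# Hypothetical CLI entry point; main() casts the values itself, so strings are fine.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Train FastText word vectors.')
    parser.add_argument('--corpus', required=True)
    parser.add_argument('--size', default='100')
    parser.add_argument('--window', default='5')
    parser.add_argument('--min_count', default='5')
    parser.add_argument('--workers', default='4')
    parser.add_argument('--alpha', default='0.025')
    parser.add_argument('--subsample', default='0.001')
    parser.add_argument('--negative', default='5')
    parser.add_argument('--epochs', default='5')
    parser.add_argument('--model_name', default='fasttext.model')
    main(parser.parse_args())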