Example no. 1
0
# Build, train, save, and sanity-check a FastText model (gensim wrapper).
model = FT_gensim(size=32)

# Scan the corpus once to populate the vocabulary.
model.build_vocab(corpus_file=corpus_file)

# Train over the same corpus file for 15 epochs.
model.epochs = 15
model.train(
    corpus_file=corpus_file,
    epochs=model.epochs,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
)
print(model)

# Persist the trained model via gensim's native fastText serialization.
model.save(save_file, separately=[])

# Basic checks: confirm a few expected tokens landed in the vocabulary.
for token in ("job", "salary", "learn"):
    print(token in model.wv.vocab)

# Print the vector representation of one in-vocabulary word.
print(model["job"])

# Pairwise cosine-similarity probes against a few reference words.
for other in ("salary", "learn", "the"):
    print(model.similarity("job", other))
Example no. 2
0
###############################################################################
#
# Similarity operations work the same way as in word2vec. **Out-of-vocabulary
# words can also be queried, provided at least one of their character n-grams
# appeared in the training data.**
#

# Check whether each probe word (plural, then singular) is in-vocabulary.
for probe in ("nights", "night"):
    print(probe in model.wv.vocab)

###############################################################################
#
print(model.similarity("night", "nights"))

###############################################################################
#
# Syntactically similar words generally score high in fastText models, since a
# large share of their component character n-grams coincide. As a result,
# fastText tends to do better than Word2Vec on syntactic tasks. A detailed
# comparison is provided `here <Word2Vec_FastText_Comparison.ipynb>`_.
#


###############################################################################
#
# Other similarity operations
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# The example training corpus is a toy corpus; results are proof-of-concept
# only and are not expected to be good.
print(model.most_similar("nights"))
# In[ ]:

from gensim.models.fasttext import FastText

# Train a skip-gram FastText model on the prepared corpus.
ft_model = FastText(
    train_data,
    size=embedding_size,
    window=window_size,
    min_count=min_word,
    sample=down_sampling,
    sg=1,     # 1 = skip-gram (0 would be CBOW)
    iter=10,  # number of training epochs (gensim 3.x parameter name)
)

# In[ ]:

# For each probe word, keep the five nearest neighbours by cosine similarity
# (most_similar returns (word, score) pairs; the scores are discarded).
semantically_similar_words = {
    query: [neighbour for neighbour, _score in ft_model.wv.most_similar([query], topn=5)]
    for query in
    ['kitchen', 'death', 'king', 'queen', 'strong', 'weak', 'woman', 'man']
}

for word, neighbours in semantically_similar_words.items():
    print(word + ":" + str(neighbours))

# In[ ]:

# Cosine similarity between two character names.
ft_model.similarity("annabeth", "percy")

# In[ ]:

# Export the learned word vectors in word2vec text format.
ft_model.wv.save_word2vec_format('FTvectors')