def test_word2vec():
    """word2vec embeddings should place topically similar sentences closer together."""
    corpus = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding, encoding, or vectorizing.",
    ]

    splitter = Splitter()
    # 'distilbert' and 'vectorizing' are treated as stop words so rare tokens
    # don't dominate the averaged sentence vectors.
    splitter.sent2words(corpus, add_stop_words=['distilbert', 'vectorizing'])

    vectorizer = Vectorizer()
    vectorizer.word2vec(splitter.words, pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    vecs = vectorizer.vectors

    similar_dist = spatial.distance.cosine(vecs[0], vecs[1])
    dissimilar_dist = spatial.distance.cosine(vecs[0], vecs[2])
    # The two NLP-themed sentences must be closer than the unrelated pair.
    assert similar_dist < dissimilar_dist
def test_complete():
    """A sentence and its negation: word2vec (with 'not' kept) should separate
    them more than BERT does, i.e. dist_w2v > dist_bert.
    """
    corpus = [
        "Alice is in the Wonderland.",
        "Alice is not in the Wonderland.",
    ]

    vectorizer = Vectorizer()
    vectorizer.bert(corpus)
    bert_vectors = vectorizer.vectors
    dist_bert = spatial.distance.cosine(bert_vectors[0], bert_vectors[1])

    splitter = Splitter()
    # presumably remove_stop_words=['not'] keeps 'not' out of the stop-word
    # list so the negation stays visible to word2vec — verify against Splitter.
    splitter.sent2words(sentences=corpus, remove_stop_words=['not'], add_stop_words=[])
    vectorizer.word2vec(splitter.words, pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    w2v_vectors = vectorizer.vectors
    dist_w2v = spatial.distance.cosine(w2v_vectors[0], w2v_vectors[1])

    print('dist_bert: {0}, dist_w2v: {1}'.format(dist_bert, dist_w2v))
    assert dist_w2v > dist_bert
def test_models():
    """Embeddings of the same sentence from two different pretrained models
    (wiki vs. fasttext) should differ.

    Bug fix: the second vectorization previously reused
    PRETRAINED_VECTORS_PATH_WIKI, so both vector sets were computed from the
    same model and the distance was trivially 0 — the test verified nothing.
    The duplicated sentence/splitter setup (identical in both halves) is also
    collapsed, and an assertion is added (the original only printed).
    """
    sentences = [
        "'Artificial Intelligence: Unorthodox Lessons' is an amazing book to gain insights about AI."
    ]
    splitter = Splitter()
    splitter.sent2words(sentences=sentences)

    vectorizer = Vectorizer()
    vectorizer.word2vec(splitter.words, pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    vectors_wiki = vectorizer.vectors

    # NOTE(review): assumes a PRETRAINED_VECTORS_PATH_FASTTEXT constant exists
    # alongside PRETRAINED_VECTORS_PATH_WIKI — confirm against this module's
    # imports/config before merging.
    vectorizer.word2vec(splitter.words, pretrained_vectors_path=PRETRAINED_VECTORS_PATH_FASTTEXT)
    vectors_fasttext = vectorizer.vectors

    dist = spatial.distance.cosine(vectors_wiki, vectors_fasttext)
    print(dist)
    # Different pretrained models must yield measurably different embeddings.
    assert dist > 0