Example #1
# Assumed imports: these tests appear to use the sent2vec package.
from scipy import spatial
from sent2vec.vectorizer import Vectorizer


def test_bert_04():
    # Two copies of the same sentence must map to the same vector.
    sentences = ["401k retirement accounts", "401k retirement accounts"]
    vectorizer = Vectorizer()
    vectorizer.run(sentences)
    dist = spatial.distance.cosine(vectorizer.vectors[0],
                                   vectorizer.vectors[1])
    assert dist == 0
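A duplicated input is the simplest sanity check: the vectorizer returns identical vectors for identical sentences, so their cosine distance comes out as zero.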
Example #2
from sent2vec.vectorizer import Vectorizer  # assumed import, as in Example #1


def test_bert_01():
    sentences = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding, encoding, or vectorizing.",
    ]
    vectorizer = Vectorizer()
    vectorizer.run(sentences)
    # Each sentence vector should have the model's hidden size: 768.
    assert len(vectorizer.vectors[0]) == 768
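The 768 in the assertion is the hidden size of BERT-base-family models, DistilBERT included, so every sentence embedding produced with the default weights has 768 dimensions.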
Example #3
from scipy import spatial
from sent2vec.vectorizer import Vectorizer  # assumed imports, as in Example #1


def test_bert_03():
    sentences = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding, encoding, or vectorizing.",
    ]
    vectorizer = Vectorizer(pretrained_weights="bert-base-multilingual-cased")
    vectorizer.run(sentences)
    dist_1 = spatial.distance.cosine(vectorizer.vectors[0],
                                     vectorizer.vectors[1])
    dist_2 = spatial.distance.cosine(vectorizer.vectors[0],
                                     vectorizer.vectors[2])
    print('dist_1: {0}, dist_2: {1}'.format(dist_1, dist_2))
    # The book and model sentences should be closer than book vs. terminology.
    assert dist_1 < dist_2
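The pretrained_weights argument selects which checkpoint to load; "bert-base-multilingual-cased" is a standard multilingual BERT model. The assertion encodes the expected semantics: the book and model sentences sit closer to each other than the book sentence does to the one about terminology.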
Example #4
from scipy import spatial
from sent2vec.vectorizer import Vectorizer  # assumed imports, as in Example #1

# PRETRAINED_VECTORS_PATH_WIKI is defined elsewhere in the test module; it
# points to a pretrained word2vec model file on disk.


def test_word2vec_02():
    sentences = [
        "Alice is in the Wonderland.",
        "Alice is not in the Wonderland.",
    ]

    vectorizer = Vectorizer(pretrained_weights=PRETRAINED_VECTORS_PATH_WIKI)
    vectorizer.run(sentences)
    vectorizer.run(sentences, remove_stop_words=[])

    dist_1 = spatial.distance.cosine(vectorizer.vectors[0],
                                     vectorizer.vectors[1])
    dist_2 = spatial.distance.cosine(vectorizer.vectors[2],
                                     vectorizer.vectors[3])
    assert (dist_1 != dist_2) and (dist_2 == 0)
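Each run() call appends to vectorizer.vectors, so indices 0-1 come from the first run and 2-3 from the second. The assertion implies that remove_stop_words names words to exempt from the stop-word list, and that "not" is exempted by default: the default run keeps "not" and the two vectors differ, while remove_stop_words=[] lets the full stop-word list apply, stripping "not" and collapsing the two sentences into identical vectors, hence dist_2 == 0.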
Example #5
from scipy import spatial
from sent2vec.vectorizer import Vectorizer  # assumed imports, as in Example #1


def test_word2vec_01():
    sentences = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding, encoding, or vectorizing.",
    ]

    vectorizer = Vectorizer(pretrained_weights=PRETRAINED_VECTORS_PATH_WIKI)
    # Treat 'distilbert' and 'vectorizing' as extra stop words.
    vectorizer.run(sentences, add_stop_words=['distilbert', 'vectorizing'])

    dist_1 = spatial.distance.cosine(vectorizer.vectors[0],
                                     vectorizer.vectors[1])
    dist_2 = spatial.distance.cosine(vectorizer.vectors[0],
                                     vectorizer.vectors[2])
    assert dist_1 < dist_2
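add_stop_words extends the stop-word list, so "distilbert" and "vectorizing" are dropped before vectorization, presumably because such tokens are unlikely to appear in a wiki word2vec vocabulary. With them gone, the first two sentences end up closer to each other than to the third.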
Example #6
from sent2vec.vectorizer import Vectorizer  # assumed import, as in Example #1


def test_bert_05():
    sentences = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding, encoding, or vectorizing.",
    ]
    # Chinese translations of the three sentences above.
    new_sentences = [
        "这是一本学习 NLP 的好书",
        "DistilBERT 是一个了不起的 NLP 模型",
        "我们可以交替使用嵌入、编码或矢量化。",
    ]
    vectorizer = Vectorizer(pretrained_weights="bert-base-multilingual-cased")
    vectorizer.run(sentences)
    vectorizer.run(new_sentences)
    vectors = vectorizer.vectors
    # run() appends, so both batches are kept: 3 + 3 = 6 vectors.
    assert len(vectors) == 6
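The multilingual checkpoint lets the English sentences and their Chinese translations be embedded by the same model, and because consecutive run() calls accumulate vectors rather than overwrite them, six vectors are available at the end.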
Example #7
from scipy import spatial
from sent2vec.vectorizer import Vectorizer  # assumed imports, as in Example #1


def test_complete():
    sentences = [
        "Alice is in the Wonderland.",
        "Alice is not in the Wonderland.",
    ]

    # Default (BERT-family) weights.
    vectorizer = Vectorizer()
    vectorizer.run(sentences)
    vectors_bert = vectorizer.vectors
    dist_bert = spatial.distance.cosine(vectors_bert[0], vectors_bert[1])

    # Word2vec weights loaded from a pretrained model file.
    vectorizer = Vectorizer(pretrained_weights=PRETRAINED_VECTORS_PATH_WIKI)
    vectorizer.run(sentences)
    vectors_w2v = vectorizer.vectors
    dist_w2v = spatial.distance.cosine(vectors_w2v[0], vectors_w2v[1])

    print('dist_bert: {0}, dist_w2v: {1}'.format(dist_bert, dist_w2v))
    assert dist_w2v > dist_bert
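The closing test puts the two back ends side by side on a negation pair and encodes the expectation that word2vec-based sentence vectors move further apart on a one-word change than contextual BERT embeddings do, i.e. dist_w2v > dist_bert under the default settings.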