Example #1
def test_text2words_02():
    """Splitting a whole text file yields 571 words, the first being 'write'.

    Reads the fixture file, flattens newlines, and checks both the word
    count and the first token produced by ``Splitter.text2words``.
    """
    # Explicit encoding: the sibling variant of this test reads the same
    # fixture with encoding="utf8"; relying on the platform default codec
    # could change the decoded text and break the pinned word count.
    with open('dataset/ensemble_method.txt', 'r', encoding="utf8") as file:
        texts = file.read().replace('\n', '')

    splitter = Splitter()
    splitter.text2words(texts=texts)
    # Expected values come from the known content of ensemble_method.txt.
    assert len(splitter.words) == 571
    assert splitter.words[0] == 'write'
Example #2
def test_sent2words():
    """Taking 'not' off the stop-word list keeps it in the tokenized output."""
    splitter = Splitter()
    splitter.sent2words(
        sentences=[
            "Alice is in the Wonderland.",
            "Alice is not in the Wonderland.",
        ],
        remove_stop_words=['not'],
    )
    # 'not' survives in the second sentence because it was whitelisted.
    expected = [['alice', 'wonderland'], ['alice', 'not', 'wonderland']]
    assert splitter.words == expected
Example #3
def test_text2words_02():
    """A file-based corpus splits into at most 571 words, starting with 'write'."""
    path = os.path.join(DATA_DIR, 'ensemble_method.txt')
    with open(path, 'r', encoding="utf8") as src:
        corpus = src.read().replace('\n', '')

    splitter = Splitter()
    splitter.text2words(texts=corpus)
    # Bounds/values derived from the known fixture content.
    assert len(splitter.words) <= 571
    assert splitter.words[0] == 'write'
Example #4
def test_text2sents():
    """text2sents splits a paragraph; the first sentence comes back intact."""
    paragraph = (
        "This is an awesome book to learn NLP. DistilBERT is an amazing NLP model. We can interchangeably use "
        "embedding, encoding, or vectorizing."
    )
    splitter = Splitter()
    splitter.text2sents(texts=[paragraph])
    first_sentence = str(splitter.sentences[0])
    assert first_sentence == 'This is an awesome book to learn NLP.'
Example #5
def test_word2vec():
    """Topically related sentences should have a smaller cosine distance."""
    corpus = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding, encoding, or vectorizing.",
    ]
    splitter = Splitter()
    # Treat the model-specific tokens as stop words so they don't dominate.
    splitter.sent2words(corpus, add_stop_words=['distilbert', 'vectorizing'])

    vectorizer = Vectorizer()
    vectorizer.word2vec(splitter.words,
                        pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    vecs = vectorizer.vectors

    close = spatial.distance.cosine(vecs[0], vecs[1])
    far = spatial.distance.cosine(vecs[0], vecs[2])
    # Sentences 0 and 1 are both about NLP learning/models; 2 is off-topic.
    assert close < far
Example #6
def test_complete():
    """BERT should separate these near-identical sentences less than word2vec."""
    sentences = [
        "Alice is in the Wonderland.",
        "Alice is not in the Wonderland.",
    ]

    # Sentence-level distance via BERT embeddings.
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    bert_vecs = vectorizer.vectors
    dist_bert = spatial.distance.cosine(bert_vecs[0], bert_vecs[1])

    # Word-level distance via word2vec; keep the negation word 'not'.
    splitter = Splitter()
    splitter.sent2words(sentences=sentences,
                        remove_stop_words=['not'],
                        add_stop_words=[])
    vectorizer.word2vec(splitter.words,
                        pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    w2v_vecs = vectorizer.vectors
    dist_w2v = spatial.distance.cosine(w2v_vecs[0], w2v_vecs[1])

    print('dist_bert: {0}, dist_w2v: {1}'.format(dist_bert, dist_w2v))
    assert dist_w2v > dist_bert
Example #7
def test_text2words_03():
    """Eyeball word splitting with and without pre-cleaning (no asserts yet)."""
    # TODO
    with open('dataset/negotiation_tips.txt', 'r') as src:
        raw = src.read().replace('\n', '')

    def cleanerizer(text):
        # Drop (...) / [...] spans, then strip hyphens.
        without_brackets = re.sub(r"[(\[].*?[)\]]", "", text)
        return re.sub(r'-', r'', without_brackets)

    splitter = Splitter()
    splitter.text2words(texts=cleanerizer(raw))
    print(splitter.words)
    splitter.text2words(texts=raw)
    print(splitter.words)
Example #8
def test_text2words_03():
    """Print word splits with and without pre-cleaning the raw text.

    Currently only prints the two splits for manual inspection.
    """
    # TODO: turn the printed output into real assertions.
    file_name = os.path.join(DATA_DIR, 'negotiation_tips.txt')
    # Explicit encoding for portability — consistent with test_text2words_02,
    # which already reads its fixture with encoding="utf8".
    with open(file_name, 'r', encoding="utf8") as file:
        texts = file.read().replace('\n', '')

    def cleanerizer(texts):
        """Strip (...)/[...] spans and hyphens from *texts*."""
        text_1 = re.sub(r"[(\[].*?[)\]]", "", texts)
        text_2 = re.sub(r'-', r'', text_1)
        return text_2

    splitter = Splitter()
    splitter.text2words(texts=cleanerizer(texts))
    print(splitter.words)
    splitter.text2words(texts=texts)
    print(splitter.words)
Example #9
def test_models():
    """Compare sentence vectors from two pretrained models and print their
    cosine distance.

    NOTE(review): both halves below load PRETRAINED_VECTORS_PATH_WIKI, so
    `vectors_wiki` and `vectors_fasttext` are computed identically and the
    printed distance is trivially ~0. The second half looks like a copy-paste
    that should use a fasttext vectors path — confirm and fix upstream.
    """
    sentences = [
        "'Artificial Intelligence: Unorthodox Lessons' is an amazing book to gain insights about AI."
    ]
    splitter = Splitter()
    splitter.sent2words(sentences=sentences)
    vectorizer = Vectorizer()
    vectorizer.word2vec(splitter.words,
                        pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    vectors_wiki = vectorizer.vectors

    sentences = [
        "'Artificial Intelligence: Unorthodox Lessons' is an amazing book to gain insights about AI."
    ]
    splitter = Splitter()
    splitter.sent2words(sentences=sentences)
    vectorizer = Vectorizer()
    # Same path as above — see the NOTE in the docstring.
    vectorizer.word2vec(splitter.words,
                        pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    vectors_fasttext = vectorizer.vectors

    # No assertion: this test only prints the distance for manual inspection.
    dist = spatial.distance.cosine(vectors_wiki, vectors_fasttext)
    print(dist)
Example #10
def test_text2words_01():
    """After splitting, the first non-stop word is 'awesome'."""
    sample = ("This is an awesome book to learn NLP. DistilBERT is an amazing NLP model. We can interchangeably use "
              "embedding, encoding, or vectorizing.")
    splitter = Splitter()
    splitter.text2words(texts=sample)
    assert splitter.words[0] == 'awesome'