Example #1
def test_build_inverted_one_doc_have_doc_with_dont_new_index_30(
        create_not_corect_data_with_two_doc_have_one_ndex):
    test_file = create_not_corect_data_with_two_doc_have_one_ndex
    indexs, words = inverted_index.load_documents(test_file)
    stop_words = {'i', 'a', 'am', 'is', 'by', 'and', 'the'}

    test_inverted_idex = inverted_index.build_inverted_index(
        stop_words=stop_words, indexs=indexs, words=words)
    etalan = {
        'test': {4, 5, 8, 6},
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        'third': {5},
        'about': {5},
        'something': {5},
        'number': {8},
        'tree': {8},
        'now': {6},
        'south': {6},
        'watching': {6},
        'park': {6}
    }
    assert etalan == test_inverted_idex.word_to_docs_mapping
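
For context, the variant of build_inverted_index exercised by this suite takes stop_words, indexs and words keyword arguments and exposes the result through a word_to_docs_mapping attribute. Below is a minimal sketch under those assumptions; the class layout and the skipping of empty documents are inferred from the tests, not taken from the project source.

class InvertedIndex:
    """Sketch only: just enough to carry the word_to_docs_mapping attribute."""

    def __init__(self, word_to_docs_mapping):
        self.word_to_docs_mapping = word_to_docs_mapping


def build_inverted_index(stop_words, indexs, words):
    # map every non-stop word to the set of document ids it occurs in;
    # documents whose word set is empty or None are skipped
    mapping = {}
    for doc_id, word_set in zip(indexs, words):
        if not word_set:
            continue
        for word in word_set - set(stop_words):
            mapping.setdefault(word, set()).add(doc_id)
    return InvertedIndex(mapping)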
Example #2
def test_all_22(tmp_path, tmpdir):
    """
        Общий тест функционала: обратока документов, создание инверт индеса, запись его и считывание. Поиск слова
        :param tmp_path: временная диретория
        :param tmpdir: добавление временого файла
    """
    test_doc = tmpdir.join(
        'datatest.txt')  # create a temporary test document
    test_doc.write(
        '0\tTest text! test number one...\n1\tTest text... number two!\n'
        '2\tKent!!!! red gay\n3\tBoys len lan, two\n12\tTrest!!! best wreit!')
    test_doc2 = tmpdir.join('stop_words.txt')
    test_doc2.write('Test\ntest\nnumber\nTe\ntext\nnumber2')

    test_doc3 = tmpdir.join('inverted.index')

    indexs, words = inverted_index.load_documents(test_doc)
    stop_words = inverted_index.load_stop_words(test_doc2)

    test_inverted_index2 = inverted_index.build_inverted_index(
        indexs=indexs, words=words, stop_words=stop_words)
    test_inverted_index2.dump(test_doc3)  # the JSON index is written to disk

    test_inverted_index_load = inverted_index.InvertedIndex.load(test_doc3)
    document_ids = test_inverted_index_load.query(["two"])
    etalon = {1, 3}
    assert etalon == document_ids
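
The dump/load round trip above (together with the inline comment about JSON) suggests the index is serialized to disk as JSON. Here is a hedged sketch of what dump, load and the set-returning query used in this suite might look like; serializing posting sets as sorted lists and the exact file handling are assumptions, not the project's actual implementation.

import json


class InvertedIndex:
    def __init__(self, word_to_docs_mapping):
        self.word_to_docs_mapping = word_to_docs_mapping

    def dump(self, filepath):
        # sets are not JSON-serializable, so store each posting set as a sorted list
        serializable = {word: sorted(ids)
                        for word, ids in self.word_to_docs_mapping.items()}
        with open(filepath, 'w') as fout:
            json.dump(serializable, fout)

    @classmethod
    def load(cls, filepath):
        with open(filepath) as fin:
            raw = json.load(fin)
        return cls({word: set(ids) for word, ids in raw.items()})

    def query(self, words):
        # documents that contain every query word (this suite compares against a set)
        doc_sets = [self.word_to_docs_mapping.get(word, set()) for word in words]
        return set.intersection(*doc_sets) if doc_sets else set()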
def test_can_load_documents():
    documents = load_documents(TINY_DATASET_FPATH)
    etalon_documents = {
        "12": "another sentense four two one one three.\n",
        "25": "one two three four words.\n"
    }
    assert etalon_documents == documents, "load_documents incorrectly loaded the dataset"
def test_read_docs_sample_v1():
    loaded_docs = load_documents(SMALL_SAMPLE_FILEPATH)
    res = {
        1: 'Article 1   Some text to test inverted index',
        2: 'Article 2   Another paragraph with no common words with first one',
        17: 'Article 3   Sample text similar to first article for test',
        5: "АФЫФЫё фывфапфва фывтлавы фывтлфы ΔG ‡"
    }
    assert res == loaded_docs
Example #5
def creat_doc_have_not_words(tmpdir):
    not_corect_data = '4\tI sit by the window and make a program test\n' \
                      '8\t\n' \
                      '5\tThe third test is about something\n' \
                      '6\tI am watching south park now, test'

    test_file = tmpdir.join('wiki_file')
    test_file.write(not_corect_data)

    return inverted_index.load_documents(test_file)
def test_read_docs_sample_v2(tmpdir):
    dataset_str = dedent("""\
    14	BOW Bag of words
    1000	CBOW Continius bag of words
    """)
    dataset_fio = tmpdir.join("light.dataset")
    dataset_fio.write(dataset_str)
    docs = load_documents(dataset_fio)
    etalon_docs = {14: "BOW Bag of words", 1000: "CBOW Continius bag of words"}
    assert docs == etalon_docs
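
Several of the suites collected here (this one, test_can_load_documents and test_can_query further down) expect load_documents to return a plain dict of document id to text. The suites disagree on details such as integer versus string ids and whether the trailing newline is stripped; the sketch below follows the integer-id, stripped-newline behaviour asserted in test_read_docs_sample_v2 and is an assumption, not the project source.

def load_documents(filepath):
    # one document per line: "<doc_id>\t<text>"
    documents = {}
    with open(filepath, encoding='utf-8') as fin:
        for line in fin:
            if not line.strip():
                continue
            doc_id, _, text = line.partition('\t')
            documents[int(doc_id)] = text.rstrip('\n')
    return documents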
Example #7
def test_can_open_file_in_load_documents(self,
                                         creat_data_file_wiki_sample):
    index, words = inverted_index.load_documents(
        filepath=creat_data_file_wiki_sample)
    assert index == [0, 1, 2]
    assert words == [{'believe', 'in', 'tears'},
                     {
                         'the', 'wind', 'is', 'making', 'noise', 'in',
                         'my', 'head'
                     }, {'walking', 'with', 'spring'}]
def test_can_load_documents(tiny_dataset_fio):
    documents = load_documents(tiny_dataset_fio)
    expected_documents = {
        123: "same words A_word and nothing",
        2: "same words B_word in this dataset",
        5: "famous_phrases to be or not to be",
        37: "all words such as A_word and B_word are here",
    }
    assert expected_documents == documents, (
        "load_documents work incorrectly"
    )
Example #9
def test_number_and_no_words_in_document_29(
        create_not_corect_data_with_two_doc_have_one_ndex):
    test_file = create_not_corect_data_with_two_doc_have_one_ndex
    indexs, words = inverted_index.load_documents(test_file)
    etalon_indexs = [4, 8, 5, 6, 8]
    etalon_words = [{
        'i', 'sit', 'by', 'the', 'window', 'and', 'make', 'a', 'program',
        'test'
    }, None, {'the', 'third', 'test', 'is', 'about', 'something'},
                    {'i', 'am', 'watching', 'south', 'park', 'now', 'test'},
                    {'test', 'number', 'tree'}]
    assert etalon_words == words
    assert etalon_indexs == indexs
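
The other family of suites here (Examples #1, #9, #10, #12 and #13) expects load_documents to return two parallel lists instead: document ids and lowercase word sets, with None wherever the id or the text is missing. The sketch below is consistent with those assertions; the exact tokenization (lowercase, punctuation stripped, split on \w+) is an assumption.

import re


def load_documents(filepath):
    indexs, words = [], []
    with open(filepath, encoding='utf-8') as fin:
        for line in fin:
            doc_id, _, text = line.rstrip('\n').partition('\t')
            # a missing id or an empty document is represented by None
            indexs.append(int(doc_id) if doc_id else None)
            tokens = set(re.findall(r'\w+', text.lower()))
            words.append(tokens or None)
    return indexs, words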
Example #10
def test_work_function_load_documents_with_lot_doc_5(tmpdir):
    """
        Как отрабатывает функция при работе с двумя нормальным документом в файле,  для записи тестируемого документа
        :param tmpdir: создаю файл во временной директории, для записи тестируемого документа
    """
    test_doc = tmpdir.join('datatest.txt')
    test_doc.write(
        '0\tTest text! test number one...\n1\tTest text... number two!')
    test_index, test_words = inverted_index.load_documents(test_doc)
    etalon_index = [0, 1]
    etalon_words = [{'test', 'text', 'number', 'one'},
                    {'test', 'number', 'two', 'text'}]
    assert test_words == etalon_words
    assert test_index == etalon_index
Example #11
def creat_not_corect_data(tmpdir):
    not_corect_data = '4\tI sit by the window and make a program test\n' \
                      '\tSasha ate porridge, a little, but it was delicious test\n' \
                      '5\tThe third test is about something\n' \
                      '\tI am watching south park now, test'
    test_doc = tmpdir.join('wiki_file')
    test_doc.write(not_corect_data)
    stop_words = 'the\ni\na\nate\nby\nand\nbut\nit\nis\nam\n'

    test_stop_words = tmpdir.join('stop_words')
    test_stop_words.write(stop_words)

    result_stop_words = inverted_index.load_stop_words(test_stop_words)
    result_index, result_words = inverted_index.load_documents(test_doc)
    return result_index, result_words, result_stop_words
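
load_stop_words shows up in several fixtures with a plain newline-separated stop-word file. A minimal sketch under that assumption follows; lowercasing each entry is a guess (the file in test_all_22 mixes 'Test' and 'test'), and the real loader may keep the original case.

def load_stop_words(filepath):
    # one stop word per line; blank lines are ignored
    with open(filepath, encoding='utf-8') as fin:
        return {line.strip().lower() for line in fin if line.strip()}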
Example #12
def test_doc_do_not_contaon_index_25(tmpdir):
    """
        Тест для отработки ситуации если документ не содержит индеса
        :param tmpdir: временная директория для тестового файла
    """
    test_doc = tmpdir.join('test_wiki_doc.txt')
    test_doc.write(
        '\tName Doc this test doc. I will use this text for the test.')
    test_indexs, test_words = inverted_index.load_documents(test_doc)
    etalon_index = [None]  # None is expected in place of the missing index
    # a set of words is built from the document text
    etalon_words = [{
        'name', 'doc', 'this', 'test', 'i', 'will', 'use', 'text', 'for', 'the'
    }]
    assert test_indexs == etalon_index
    assert etalon_words == test_words
Example #13
def test_work_function_load_documents_with_an_extra_newline_character_6(
        tmpdir):
    """
        Как отработает функция если в файле будет один документ, который содержит в конце символ "\n"
        :param tmpdir: создаю файл во временной директории, для записи тестируемого документа
    """
    test_doc = tmpdir.join(
        'datatest.txt')  # create a temporary test document
    test_doc.write(
        '0\tTest text! test number one...\n1\tTest text... number two!\n')
    test_index, test_words = inverted_index.load_documents(test_doc)
    etalon_index = [0, 1]
    etalon_words = [{'test', 'text', 'number', 'one'},
                    {'test', 'number', 'two', 'text'}]
    assert test_index == etalon_index
    assert test_words == etalon_words
def test_query_inverted_index_with_query_file_utf_8():
    documents = load_documents(TINY_DATASET_FPATH)
    tiny_inverted_index = build_inverted_index(documents)
    tiny_inverted_index.dump_binary(TINY_INVERTED_INDEX_STORE_PATH)
    count = 1
    with open(QUERY_FILE_UTF8_FPATH) as q_file:
        for line in q_file:
            line = line.split()
            answer = tiny_inverted_index.query(line)
            if count == 1:
                etalon_answer = [12, 25]
            else:
                etalon_answer = [25]
            assert sorted(answer) == sorted(etalon_answer), (
                f"Expected answer is {etalon_answer},but you got {answer}")
            count += 1
Example #15
def test_can_load_documents(tmpdir):
    dataset_fio = tmpdir.join("dataset.txt")
    dataset_fio.write(
        dedent("""\
        1\thappy cat
        2\thappy cat good
        3\tgood cat
    """))
    documents = load_documents(dataset_fio)
    etalon_documents = {
        1: "happy cat",
        2: "happy cat good",
        3: "good cat",
    }
    assert etalon_documents == documents, (
        "load_documents incorrectly loaded dataset")
Example #16
def test_work_function_load_documents_with_lot_doc_7(tmpdir):
    """
        Проверка работы функции load_documents при большом числе документов записанных в файл
        :param tmpdir: создаю файл во временной директории, для записи тестируемого документа
    """
    test_doc = tmpdir.join(
        'datatest.txt')  # create a temporary test document
    test_doc.write(
        '0\tTest text! test number one...\n1\tTest text... number two!\n'
        '2\tKent!!!! red gay\n3\tBoys len lan\n12\tTrest!!! best wreit!')
    test_index, test_words = inverted_index.load_documents(test_doc)
    etalon_index = [0, 1, 2, 3, 12]
    etalon_words = [{'test', 'text', 'number', 'one'},
                    {'test', 'number', 'two', 'text'}, {'kent', 'red', 'gay'},
                    {'boys', 'len', 'lan'}, {'trest', 'best', 'wreit'}]
    assert test_index == etalon_index
    assert test_words == etalon_words
Example #17
def build_inverted_index_for_creat_data_not_corect(tmpdir):
    test_doc = tmpdir.join('wiki_doc')
    test_doc.write(
        '\tName Shasha train this program and work with data like!\n'
        '4\tTest name number two and test, i like programming!')

    test_doc_stop_words = tmpdir.join('stop_words.txt')
    test_doc_stop_words.write('and\ni\n')

    result_load_doc = inverted_index.load_documents(filepath=test_doc)
    result_load_stop_words = inverted_index.load_stop_words(
        filepath=test_doc_stop_words)

    result_inverted_index_build_inverted_index = inverted_index.build_inverted_index(
        stop_words=result_load_stop_words,
        words=result_load_doc[1],
        indexs=result_load_doc[0])
    return result_inverted_index_build_inverted_index
Example #18
def test_work_function_load_documents_with_one_doc_4(tmpdir):
    """
        Как отрабатывает функция при работе с одним нормальным документом в файле,  для записи тестируемого документа
        :param tmpdir: создаю файл во временной директории
    """
    test_doc = tmpdir.join(
        'datatest.txt')  # create a file in a temporary directory
    # temporary test data
    test_doc.write(
        '0\tTest text testing, my work now! Train testing this, working, bool?'
    )
    test_index, test_words = inverted_index.load_documents(test_doc)
    etalon_index = [0]  # a single document index is expected
    etalon_words = [{
        'test', 'text', 'testing', 'my', 'work', 'now', 'train', 'this',
        'working', 'bool'
    }]
    assert etalon_index == test_index
    assert etalon_words == test_words
Example #19
def test_can_query(tmpdir):
    dataset_fio = tmpdir.join("dataset.txt")
    dataset_fio.write(
        dedent("""\
        1\thappy cat wow
        2\thappy cat good
        3\tgood cat audi
        4\t audi and bmw
        """))
    documents = load_documents(dataset_fio)
    inverted_index = build_inverted_index(documents)
    document_ids = inverted_index.query(["happy", "good"])
    assert document_ids == [2]
    assert inverted_index.query(["happy", "good", "cat"]) == [2]
    assert inverted_index.query(["cat"]) == [1, 2, 3]
    assert inverted_index.query(["cat", "audi"]) == [3]
    assert inverted_index.query(["cat", "audi", 'cat']) == [3]
    assert inverted_index.query(["cat", "audi", 'audi']) == [3]
    assert inverted_index.query(["audi", 'bmw']) == [4]
    assert inverted_index.query(["audi", 'bmw', 'cat']) == list()
def test_index_creation():
    docs = load_documents(TINY_SAMPLE_FILEPATH)
    inv_idx = build_inverted_index(docs)
    assert TINY_SAMPLE_INV_TABLE == inv_idx
    assert repr(TINY_SAMPLE_WORD_DICT) == repr(inv_idx)
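
test_index_creation compares the index with == and with repr(), so this suite's InvertedIndex must define __eq__ and __repr__. A hedged sketch assuming both delegate to the underlying word-to-documents dict; the attribute name and the shape of TINY_SAMPLE_INV_TABLE / TINY_SAMPLE_WORD_DICT are assumptions.

class InvertedIndex:
    def __init__(self, word_to_docs_mapping):
        self.word_to_docs_mapping = word_to_docs_mapping

    def __eq__(self, other):
        # allow comparison both with another InvertedIndex and with a raw dict
        other_mapping = getattr(other, 'word_to_docs_mapping', other)
        return self.word_to_docs_mapping == other_mapping

    def __repr__(self):
        return repr(self.word_to_docs_mapping)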
Example #21
def test_load_documents():
    test_file_name = prepare_file()
    docs = inverted_index.load_documents(test_file_name)
    assert docs
    os.remove(test_file_name)
def test_not_existing_file():
    with pytest.raises(FileNotFoundError):
        check_filepath_existance(NOT_REAL_FILENAME)
    check_filepath_existance(SMALL_SAMPLE_FILEPATH)
    with pytest.raises(FileNotFoundError):
        load_documents(NOT_REAL_FILENAME)
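
check_filepath_existance is only pinned down by this test: it must raise FileNotFoundError for a missing path and pass silently for an existing one. A minimal sketch under that assumption:

import os


def check_filepath_existance(filepath):
    # raise early with a clear message if the dataset path does not exist
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"no such file: {filepath!r}")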
def test_can_load_wikipedia_sample():
    documents = load_documents(WIKIPEDIA_DATASET_FPATH)
    assert len(documents) == 4100, "you incorrectly loaded Wikipedia sample"
def tiny_sample_document():
    tiny_documents = load_documents(TINY_SAMPLE_FILEPATH)
    return tiny_documents
def test_can_load_right_len_document(filepath, document_len):
    documents = load_documents(filepath)
    assert len(documents) == document_len, (
        "load_documents work load incorrect len"
    )
Example #26
def wiki_docs():
    return load_documents('small_wiki_sample')
Example #27
def test_load_documents_exception():
    with pytest.raises(FileNotFoundError) as fnfe:
        articles = load_documents('wikipedia_sample/ikipedia_sample')
def get_inverted_index():
    documents = load_documents(DATASET_SMALL_FPATH)
    inverted_index = build_inverted_index(documents)
    return inverted_index
def small_dataset_documents():
    documents = load_documents(DATASET_SMALL_FPATH)
    return documents
def tiny_dataset_documents():
    documents = load_documents(DATASET_TINY_FPATH)
    return documents