def test_class_invertedindex_build_inverted_index_and_dump_load_16(tmp_path):
    """
    Check writing and then re-writing the file with the inverted index.

    :param tmp_path: temporary directory in which the file for the tested
        index is created
    """
    index_file = tmp_path / 'temp'
    stop_words = {'hi', 'cant', 'lol'}

    # First data set: build the index and write it to the file.
    first_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words=stop_words,
        words=[{'test', 'null', 'hi'},
               {'test', 'one', 'lol'},
               {'test', 'two', 'cant'}])
    first_index.rewriting(filepath=index_file)

    # Second data set: call `rewriting` again on the very same file.
    second_index = inverted_index.build_inverted_index(
        indexs=[15, 25],
        stop_words=stop_words,
        words=[{'one'}, {'two'}])
    second_index.rewriting(filepath=index_file)

    # The expected mapping holds document ids from both data sets.
    expected_mapping = {
        'test': {0, 1, 3},
        'null': {0},
        'one': {1, 15},
        'two': {3, 25}
    }
    loaded_index = inverted_index.InvertedIndex.load(filepath=index_file)
    assert loaded_index.word_to_docs_mapping == expected_mapping
def test_not_number_doc_where_lot_documents_26(creat_not_corect_data):
    """Compare the mapping built from ``creat_not_corect_data`` with a reference.

    Several reference postings contain ``None`` as the document id.
    """
    indexs, words, stop_words = creat_not_corect_data
    built_index = inverted_index.build_inverted_index(
        indexs=indexs, words=words, stop_words=stop_words)
    actual_mapping = built_index.word_to_docs_mapping
    expected_mapping = {
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        'sasha': {None},
        'porridge': {None},
        'little': {None},
        'was': {None},
        'delicious': {None},
        'third': {5},
        'test': {None, 5, 4},
        'about': {5},
        'something': {5},
        'watching': {None},
        'south': {None},
        'park': {None},
        'now': {None}
    }
    assert expected_mapping == actual_mapping
def test_build_inverted_index_working_right(tiny_dataset_documents):
    """Check ``term2doc`` of the index built from ``tiny_dataset_documents``."""
    expected_term2doc = {
        'same': [123, 2],
        'and': [123, 37],
        'nothing': [123],
        'words': [123, 2, 37],
        'in': [2],
        'this': [2],
        'to': [5],
        'be': [5],
        'or': [5],
        'not': [5],
        'all': [37],
        'such': [37],
        'as': [37],
        'are': [37],
        'here': [37],
        'A_word': [123, 37],
        'B_word': [2, 37],
        'dataset': [2],
        'famous_phrases': [5],
    }
    built_index = build_inverted_index(tiny_dataset_documents)
    assert built_index.term2doc == expected_term2doc, (
        "build_inverted_index build incorrect"
    )
def creat_inverted_index(creat_index_and_words_with_temp_file,
                         creat_stop_words_wher_load_stop_words_with_fiel):
    """Build and return an inverted index from the two supplied inputs.

    :param creat_index_and_words_with_temp_file: pair that is unpacked into
        the ``indexs`` and ``words`` arguments of ``build_inverted_index``
    :param creat_stop_words_wher_load_stop_words_with_fiel: value passed
        through as ``stop_words``
    :return: the result of ``inverted_index.build_inverted_index``
    """
    doc_ids, doc_words = creat_index_and_words_with_temp_file
    return inverted_index.build_inverted_index(
        indexs=doc_ids,
        words=doc_words,
        stop_words=creat_stop_words_wher_load_stop_words_with_fiel)
def test_all_22(tmp_path, tmpdir):
    """
    End-to-end functional test: process documents, build the inverted
    index, write it, read it back and look a word up.

    :param tmp_path: temporary directory
    :param tmpdir: used to add the temporary files
    """
    # Temporary test document with tab-separated "<id>\t<text>" lines.
    documents_file = tmpdir.join('datatest.txt')
    documents_file.write(
        '0\tTest text! test number one...\n1\tTest text... number two!\n'
        '2\tKent!!!! red gay\n3\tBoys len lan, two\n12\tTrest!!! best wreit!')

    stop_words_file = tmpdir.join('stop_words.txt')
    stop_words_file.write('Test\ntest\nnumber\nTe\ntext\nnumber2')

    index_file = tmpdir.join('inverted.index')

    indexs, words = inverted_index.load_documents(documents_file)
    stop_words = inverted_index.load_stop_words(stop_words_file)
    built_index = inverted_index.build_inverted_index(
        indexs=indexs, words=words, stop_words=stop_words)

    # Persist the index to disk (the original note says it is stored as
    # JSON) and read it back before querying.
    built_index.dump(index_file)
    loaded_index = inverted_index.InvertedIndex.load(index_file)

    found_ids = loaded_index.query(["two"])
    expected_ids = {1, 3}
    assert expected_ids == found_ids
def test_not_number_doc_where_lot_documents_with_two_doc_have_not_number_27(
        creat_not_corect_data, tmp_path):
    """Dump and reload the index built from ``creat_not_corect_data``.

    The reloaded mapping is compared with a reference in which several
    postings contain ``None`` as the document id.
    """
    index_file = tmp_path / 'tmp'
    indexs, words, stop_words = creat_not_corect_data

    built_index = inverted_index.build_inverted_index(
        indexs=indexs, words=words, stop_words=stop_words)
    built_index.dump(filepath=index_file)
    reloaded_index = inverted_index.InvertedIndex.load(index_file)

    expected_mapping = {
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        'sasha': {None},
        'porridge': {None},
        'little': {None},
        'was': {None},
        'delicious': {None},
        'third': {5},
        'test': {None, 5, 4},
        'about': {5},
        'something': {5},
        'watching': {None},
        'south': {None},
        'park': {None},
        'now': {None}
    }
    assert expected_mapping == reloaded_index.word_to_docs_mapping
def test_build_inverted_one_doc_have_doc_with_dont_new_index_30(
        create_not_corect_data_with_two_doc_have_one_ndex):
    """Load documents from the supplied file, build the index, check the mapping."""
    expected_mapping = {
        'test': {4, 5, 8, 6},
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        'third': {5},
        'about': {5},
        'something': {5},
        'number': {8},
        'tree': {8},
        'now': {6},
        'south': {6},
        'watching': {6},
        'park': {6}
    }
    documents_file = create_not_corect_data_with_two_doc_have_one_ndex
    indexs, words = inverted_index.load_documents(documents_file)
    built_index = inverted_index.build_inverted_index(
        stop_words={'i', 'a', 'am', 'is', 'by', 'and', 'the'},
        indexs=indexs,
        words=words)
    assert expected_mapping == built_index.word_to_docs_mapping
def creat_test_file_for_load_with_him():
    """Return an inverted index built from two small in-memory word sets.

    :return: the result of ``inverted_index.build_inverted_index`` for
        document ids ``0`` and ``1``
    """
    doc_words = [{'words', 'name', 'test'}, {'words', 'green', 'man'}]
    # NOTE: ``{}`` is an empty dict (not a set); kept exactly as the
    # original passes it.
    return inverted_index.build_inverted_index(
        indexs=[0, 1], words=doc_words, stop_words={})
def fixture_inverted_index(tmp_path):
    """Return an inverted index built from three small in-memory word sets.

    :param tmp_path: accepted but not used by the body
    :return: the result of ``inverted_index.build_inverted_index``
    """
    doc_words = [
        {'test', 'null', 'hi'},
        {'test', 'one', 'lol'},
        {'test', 'two', 'cant'},
    ]
    return inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words={'hi', 'cant', 'lol'},
        words=doc_words)
def test_dump_and_load_index(tmp_path, tiny_sample_document):
    """Dump the built index to a file and check the loaded copy equals it."""
    index_dir = tmp_path / "tiny_example_dir"
    index_dir.mkdir()
    index_path = index_dir / "tiny_example.index"

    built_table = build_inverted_index(tiny_sample_document)
    built_table.dump(index_path)
    assert built_table == TINY_SAMPLE_INV_TABLE

    reloaded_table = InvertedIndex.load(index_path)
    assert built_table == reloaded_table
def test_inv_index_query(load_inverted_index, wiki_docs):
    """Query a loaded index and a freshly built one for known/unknown words."""
    known_words = ['after', 'were']
    expected_subset = {25, 290}

    # Known words on the loaded index.
    found = InvertedIndex.query(load_inverted_index, known_words)
    assert expected_subset.issubset(found)

    # Words that are expected to yield no documents at all.
    found = InvertedIndex.query(load_inverted_index, ['neizvesnie', 'slova'])
    assert len(found) == 0

    # Same known words on an index built directly from ``wiki_docs``.
    fresh_index = InvertedIndex(build_inverted_index(wiki_docs))
    found = fresh_index.query(['after', 'were'])
    assert expected_subset.issubset(found)
def test_class_invertedindex_query_with_two_words_19():
    """
    Search for two identical words with InvertedIndex.query
    """
    doc_words = [
        {'test', 'null', 'hi'},
        {'test', 'one', 'lol'},
        {'test', 'two', 'cant'},
    ]
    built_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words={'hi', 'cant', 'lol'},
        words=doc_words)
    expected_ids = {3}
    assert built_index.query(['two', 'two']) == expected_ids
def test_class_invertedindex_query_18():
    """
    How InvertedIndex.query looks words up and what result it gives back
    """
    doc_words = [
        {'test', 'null', 'hi'},
        {'test', 'one', 'lol'},
        {'test', 'two', 'cant'},
    ]
    built_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words={'hi', 'cant', 'lol'},
        words=doc_words)
    expected_ids = {3}
    assert built_index.query(['two', 'test']) == expected_ids
def test_query_not_number_doc_where_lot_documents_with_two_doc_have_not_number_28(
        creat_not_corect_data, tmp_path):
    """Query a dumped-and-reloaded index for a word whose posting is ``{None}``."""
    index_file = tmp_path / 'tmp'
    indexs, words, stop_words = creat_not_corect_data

    built_index = inverted_index.build_inverted_index(
        indexs=indexs, words=words, stop_words=stop_words)
    built_index.dump(filepath=index_file)

    reloaded_index = inverted_index.InvertedIndex.load(index_file)
    assert reloaded_index.query(['south']) == {None}
def test_query_inverted_index_with_query_file_utf_8():
    """Run every query line of the UTF-8 query file against the tiny index.

    The index is built from ``TINY_DATASET_FPATH`` and dumped with
    ``dump_binary`` before querying. The first query line is expected to
    return documents ``[12, 25]``; every following line ``[25]``.
    """
    documents = load_documents(TINY_DATASET_FPATH)
    tiny_inverted_index = build_inverted_index(documents)
    tiny_inverted_index.dump_binary(TINY_INVERTED_INDEX_STORE_PATH)

    # Open with an explicit encoding: relying on the platform default would
    # make a UTF-8 query file decode differently (or fail) on some systems.
    with open(QUERY_FILE_UTF8_FPATH, encoding="utf-8") as q_file:
        for line_number, line in enumerate(q_file, start=1):
            answer = tiny_inverted_index.query(line.split())
            etalon_answer = [12, 25] if line_number == 1 else [25]
            assert sorted(answer) == sorted(etalon_answer), (
                f"Expected answer is {etalon_answer},but you got {answer}")
def test_build_inverted_index11():
    """
    Check how build_inverted_index works on correct data
    """
    doc_words = [
        {'test', 'null', 'hi'},
        {'test', 'one', 'lol'},
        {'test', 'two', 'cant'},
    ]
    built_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words={'hi', 'cant', 'lol'},
        words=doc_words)
    expected_mapping = {
        'test': {0, 1, 3},
        'null': {0},
        'one': {1},
        'two': {3}
    }
    assert built_index.word_to_docs_mapping == expected_mapping
def build_inverted_index_for_creat_data_not_corect(tmpdir):
    """Build an inverted index from a document file whose first line has no id.

    :param tmpdir: directory object in which the document file and the
        stop-words file are created
    :return: the result of ``inverted_index.build_inverted_index``
    """
    # The first document line starts with a tab, i.e. the id part is empty.
    documents_file = tmpdir.join('wiki_doc')
    documents_file.write(
        '\tName Shasha train this program and work with data like!\n'
        '4\tTest name number two and test, i like programming!')

    stop_words_file = tmpdir.join('stop_words.txt')
    stop_words_file.write('and\ni\n')

    loaded_documents = inverted_index.load_documents(filepath=documents_file)
    loaded_stop_words = inverted_index.load_stop_words(
        filepath=stop_words_file)

    # Element 0 is passed as ``indexs`` and element 1 as ``words``.
    return inverted_index.build_inverted_index(
        stop_words=loaded_stop_words,
        words=loaded_documents[1],
        indexs=loaded_documents[0])
def test_can_build_inverted_index(
        self, creat_index_and_words_with_temp_file,
        creat_stop_words_wher_load_stop_words_with_fiel):
    """Build an index from the two supplied inputs and check the full mapping."""
    expected_mapping = {
        'believe': {0},
        'tears': {0},
        'wind': {1},
        'making': {1},
        'noise': {1},
        'head': {1},
        'walking': {2},
        'with': {2},
        'spring': {2}
    }
    doc_ids, doc_words = creat_index_and_words_with_temp_file
    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids,
        words=doc_words,
        stop_words=creat_stop_words_wher_load_stop_words_with_fiel)
    assert built_index.word_to_docs_mapping == expected_mapping
def test_class_invertedindex_and_build_inverted_index_15(tmp_path):
    """
    How build_inverted_index, writing to a file with InvertedIndex.dump and
    reading back with InvertedIndex.load work together.

    :param tmp_path: temporary directory in which the file for the tested
        index is created
    """
    # File inside the temporary directory.
    index_file = tmp_path / 'temp'

    # Document numbers, the words of each document, and the stop words.
    doc_ids = [0, 1, 3]
    doc_words = [
        {'test', 'null', 'hi'},
        {'test', 'one', 'lol'},
        {'test', 'two', 'cant'},
    ]
    stop_words = {'hi', 'cant', 'lol'}

    # Build the inverted index.
    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids, stop_words=stop_words, words=doc_words)

    # Write the inverted index into the temporary directory ...
    built_index.dump(filepath=index_file)
    # ... and read it back.
    loaded_index = inverted_index.InvertedIndex.load(filepath=index_file)

    expected_mapping = {'test': {0, 1, 3}, 'null': {0}, 'one': {1}, 'two': {3}}
    assert loaded_index.word_to_docs_mapping == expected_mapping
def test_build_inverted_index_not_core_data_12():
    """
    How build_inverted_index handles data whose document index is missing
    (``None``).
    """
    built_index = inverted_index.build_inverted_index(
        indexs=[None],
        words=[{'name', 'begin', 'where', 'test', 'work', 'he', 'i'}],
        stop_words={'he', 'i'})
    # Every remaining word is expected to map to the single ``None`` id.
    expected_mapping = {
        word: {None}
        for word in ('name', 'begin', 'where', 'test', 'work')
    }
    assert built_index.word_to_docs_mapping == expected_mapping
def test_number_and_no_words_in_document_28(creat_doc_have_not_words):
    """Build an index from ``creat_doc_have_not_words`` and check the mapping."""
    expected_mapping = {
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        'test': {4, 5, 6},
        'third': {5},
        'about': {5},
        'something': {5},
        'watching': {6},
        'south': {6},
        'park': {6},
        'now': {6}
    }
    doc_ids, doc_words = creat_doc_have_not_words
    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids,
        words=doc_words,
        stop_words={'i', 'a', 'am', 'is', 'by', 'and', 'the'})
    assert expected_mapping == built_index.word_to_docs_mapping
def test_can_query(tmpdir):
    """Query a four-document index and compare each answer with a fixed list."""
    dataset_file = tmpdir.join("dataset.txt")
    dataset_text = dedent("""\
        1\thappy cat wow
        2\thappy cat good
        3\tgood cat audi
        4\t audi and bmw
        """)
    dataset_file.write(dataset_text)

    index = build_inverted_index(load_documents(dataset_file))

    # (query words, expected document ids), checked in the original order.
    cases = [
        (["happy", "good"], [2]),
        (["happy", "good", "cat"], [2]),
        (["cat"], [1, 2, 3]),
        (["cat", "audi"], [3]),
        (["cat", "audi", 'cat'], [3]),
        (["cat", "audi", 'audi'], [3]),
        (["audi", 'bmw'], [4]),
        (["audi", 'bmw', 'cat'], []),
    ]
    for query_words, expected_ids in cases:
        assert index.query(query_words) == expected_ids
def test_binary_dump_and_load_index(tmp_path, tiny_sample_document,
                                    words=None):
    """Dump/load the index with the binary storage policy, then query it.

    :param tmp_path: temporary directory in which the index file is created
    :param tiny_sample_document: documents passed to ``build_inverted_index``
    :param words: query words handed to ``query_callback``; defaults to
        ``['of', 'words']`` when omitted
    """
    # A fresh list per call instead of a shared mutable default argument.
    if words is None:
        words = ['of', 'words']

    dir_ = tmp_path / "tiny_example_dir"
    dir_.mkdir()
    index_file = dir_ / "tiny_example.bin.index"

    docs = tiny_sample_document
    inv_table = build_inverted_index(docs)
    inv_table.dump(index_file, storage_policy='binary')
    assert inv_table == TINY_SAMPLE_INV_TABLE

    loaded_inv_table = InvertedIndex.load(index_file, storage_policy='binary')
    assert inv_table == loaded_inv_table

    # Test the query callback using a lightweight stand-in for the args
    # object, carrying just ``index_path`` and ``words``.
    Args = namedtuple('Args', ['index_path', 'words'])
    args = Args(index_path=index_file, words=words)
    response = query_callback(args)
    ethalon_response = [
        {14, 1000},
    ]
    assert response == ethalon_response
def test_index_creation():
    """Check equality and ``repr`` of the index built from the tiny sample file."""
    built_index = build_inverted_index(load_documents(TINY_SAMPLE_FILEPATH))
    assert TINY_SAMPLE_INV_TABLE == built_index
    assert repr(TINY_SAMPLE_WORD_DICT) == repr(built_index)
def test_invert_table_eq(tiny_sample_document):
    """Two indexes built from the same documents must compare equal."""
    first_table = build_inverted_index(tiny_sample_document)
    second_table = build_inverted_index(tiny_sample_document)
    assert first_table == second_table
# -*- coding: latin-1 -*-
'''
Created on 7/02/2013

@author: 74187593

Small driver script: builds an inverted index, prints every term with its
frequency and postings, then runs two boolean queries against it.
'''
from inverted_index import build_inverted_index
from boolean_query import intersect_several, union

if __name__ == '__main__':
    my_path = 'C:\\temp\\benedetti'
    # Passed in empty and later indexed by posting id; presumably filled
    # by build_inverted_index -- TODO confirm against its implementation.
    document_list = {}
    dictionary = build_inverted_index(my_path, document_list)

    # Single-argument print(...) calls behave the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    for d in sorted(dictionary.keys()):
        print(d + " : " + str(dictionary[d].frequency) + " : " + str(dictionary[d].postings))
    print(len(dictionary))

    # NOTE(review): the intersection result is overwritten by the union on
    # the next line and never used; kept as-is to preserve behaviour.
    answer = intersect_several([dictionary[u"tu"], dictionary[u"por"], dictionary[u"te"]])
    answer = union(dictionary[u"tu"], dictionary[u"por"])
    for document_found in answer.postings:
        print(document_list[document_found])
def test_build_inverted_index_do_not_raise_exception():
    """Building an index from an empty document list must not raise."""
    no_documents = []
    build_inverted_index(no_documents)
def test_inverted_index_dump(wiki_docs):
    """Dump an index built from ``wiki_docs``; passes if nothing raises.

    The dump target ``'inv_index.dat'`` is a relative path, so the file
    lands in the current working directory.
    """
    built_index = InvertedIndex(build_inverted_index(wiki_docs))
    built_index.dump('inv_index.dat')
    # No value is checked: reaching this line is the whole assertion.
    assert True
def test_build_inverted_index(wiki_docs):
    """``build_inverted_index`` must return a ``dict`` for ``wiki_docs``."""
    built = build_inverted_index(wiki_docs)
    assert isinstance(built, dict)
this is done over just putting query results in a shelf simply because the stored results are deleted after the browser is closed and thus are less permanent """ parser = argparse.ArgumentParser(description="Boolean IR system") parser.add_argument("--build") parser.add_argument("--run", action="store_true") parser.add_argument("--test", action="store_true") args = parser.parse_args() print(args) if args.test: wapo_path = data_dir.joinpath("test_corpus.jl") # print("TEST") if args.build: build_inverted_index(wapo_path, str(data_dir.joinpath( args.build))) # shelve.open cannot recognize Path shelve_wapo(wapo_path, shelf_path) if args.run: # Use context managers for safe open and close with shelve.open(shelf_path, flag='r') as wapo: # Name full inverted index FULL with shelve.open(str(data_dir.joinpath("FULL")), flag='r') as index: with shelve.open(str(data_dir.joinpath("QUERY")), flag='n') as query: WAPO_SHELF = wapo INDEX_SHELF = index QUERY_SHELF = query app.run(debug=True, port=5000)
def get_inverted_index():
    """Load the documents at ``DATASET_SMALL_FPATH`` and return their inverted index."""
    return build_inverted_index(load_documents(DATASET_SMALL_FPATH))