def part3_3(orginal_trie: dictionary.Trie):
    """Round-trip test for gamma compression of the posting lists.

    Compresses the trie's posting lists with gamma coding, saves the three
    artifacts (a0.pkl, a1.txt, a2.txt), reloads and decompresses them, and
    reports whether the recovered trie equals the original.

    Args:
        orginal_trie: trie to test; if None a fresh dictionary is built.
    """
    if orginal_trie is None:
        orginal_trie, _ = dictionary.build_dictionary()
    ### compress and save part
    a0, a1, a2 = compress_gamma(orginal_trie)
    save_bitstring(a1, "a1.txt")
    save_bitstring(a2, "a2.txt")
    # context manager guarantees the handle is closed even if dump raises
    with open("a0.pkl", "wb") as f:
        pickle.dump(a0, f)
    print("Posting Lists Compressed and Saved in a0.pkl, a1.txt, a2.txt")
    ### load and decompress part
    # NOTE(review): helper name "load_bitsring" has a typo but is defined
    # elsewhere under that name — renaming must happen at its definition.
    a1 = load_bitsring("a1.txt")
    a2 = load_bitsring("a2.txt")
    with open("a0.pkl", "rb") as f:
        a0 = pickle.load(f)
    recovered_trie = decompress_gamma(a0, a1, a2)
    print("Posting Lists Compressed and Loaded and Decompressed")
    if check_equality_tri(orginal_trie, recovered_trie):
        print("Orginal Version and Recovered Version Are Identical")
    else:
        print("Orginal Version and Recovered Version Are Not Identical!!!")
def prepare_data_as_list():
    """Build the dictionary from Data.csv and return its entries as a list
    of (term, posting_list) tuples sorted alphabetically by term."""
    source_file = "Data.csv"
    trie, _ = dictionary.build_dictionary(
        english_or_persian=LANGUAGE, filename=source_file)
    nodes = sorted(trie.to_list(), key=lambda node: node.term)
    return [(node.term, node.posting_list) for node in nodes]
def prepare_data_as_list(train_not_test=True):
    """Build the dictionary from the phase-2 train or test CSV and return
    its entries as (term, posting_list) tuples sorted by term.

    NOTE(review): this redefines an earlier prepare_data_as_list in the
    same module — the later definition is the one that takes effect.

    Args:
        train_not_test: True reads phase2_train.csv, False phase2_test.csv.
    """
    source_file = "phase2_train.csv" if train_not_test else "phase2_test.csv"
    trie, _ = dictionary.build_dictionary(
        english_or_persian=LANGUAGE, filename=source_file)
    nodes = sorted(trie.to_list(), key=lambda node: node.term)
    return [(node.term, node.posting_list) for node in nodes]
def part3_1and2(trie_ds: dictionary.Trie):
    """Report posting-list storage cost: uncompressed vs VB vs gamma coding.

    Compresses the trie's posting lists with both variable-byte and gamma
    encoding and prints the byte sizes next to the uncompressed in-memory
    size of the trie.

    Args:
        trie_ds: trie to measure; if None a fresh dictionary is built.
    """
    if trie_ds is None:
        trie_ds, _ = dictionary.build_dictionary()
    a0, a1, a2 = compress_VB(trie_ds)
    # VB output is byte-oriented, so len() is already a byte count
    ByteSize_VB = len(a1) + len(a2)
    a0, a1, a2 = compress_gamma(trie_ds)
    # gamma output exposes a bit length (.len); convert bits -> bytes
    ByteSize_gamma = int((a1.len + a2.len) / 8)
    ByteSize_initial = asizeof.asizeof(trie_ds)
    # fixed typo in the report label: "Engilish" -> "English"
    print("English Corpus:",
          " Whole Initial ByteSize= ", ByteSize_initial,
          " VB ByteSize= ", ByteSize_VB,
          " gamma ByteSize= ", ByteSize_gamma,
          " term_to_ptr ByteSize = ", asizeof.asizeof(a0))
def main():
    """Interactive CLI driver.

    Loops forever: pick a language, then either preprocess ad-hoc text,
    list frequent words, or enter the dictionary sub-menu (posting lists,
    positions, doc removal, spell correction, compression tests, ranked
    and proximity search). The dictionary and vector space are built
    lazily on first use and cached in module-level globals.
    """
    # Results of the expensive build steps are shared via globals so other
    # module functions can reach them.
    global term_to_num, idf_query, doc_space, bigram_ds, trie_dict
    input("press any button!")
    dic_is_built = False  # lazy-build flag for the dictionary/vector space
    while True:
        lang = int(input("choose language:\n"
                         "1- english\n"
                         "2- persian"))
        # Select the preprocessing module and corpus loader per language.
        if lang == 1:
            PREPROCESSOR = english_preprocessing
            all_docs_and_titles = file_handler.load_english_file()
        else:
            PREPROCESSOR = persian_preproccessing
            all_docs_and_titles = file_handler.load_persian_file()
        command: int = int(
            input("choose:\n"
                  "1- pre process my text\n"
                  "2- show frequent words\n"
                  "3- go to dict\n"))
        if command == 1:
            # Show the preprocessing pipeline's output for arbitrary text.
            text = (input("enter your text:\n"))
            print("**pre processed:", PREPROCESSOR.pre_process_text(text))
        elif command == 2:
            # Print the top-`rank` most frequent tokens across all docs.
            rank = int(input("how many?"))
            tf_token_pairs = PREPROCESSOR.get_all_docs_tf_tokens()
            for pair in tf_token_pairs[:rank]:
                print(pair[0], ": ", pair[1])
        elif command == 3:
            # Build the dictionary and vector space once, then reuse them.
            if not dic_is_built:
                print("building dict...")
                trie_dict, bigram_ds = dictionary.build_dictionary(lang)
                doc_space, idf_query, term_to_num = part5.vector_space_preprocess(
                    trie_dict)
                dic_is_built = True
            command2 = int(
                input("1- show posting for my word\n"
                      "2- show positions of word in all docs\n"
                      "3- remove the doc\n"
                      "4- correct query\n"
                      "5- test compression \n"
                      "6- test compression recoverability\n"
                      "7- search query\n"
                      "8- search query (proximity)"))
            if command2 == 1:
                word = input("enter the word:\n")
                dictionary.show_posting_list(word, trie_dict)
            elif command2 == 2:
                word = input("enter the word:\n")
                dictionary.show_positions_in_all_docs(word, trie_dict)
            elif command2 == 3:
                doc_id = int(input("enter doc id:"))
                dictionary.remove_doc(all_docs_and_titles, doc_id, trie_dict)
            elif command2 == 4:
                # Spell-correct each query token via bigram + edit distance.
                query = input("enter [bad] query:")
                corrected = ""
                for term in PREPROCESSOR.simple_tokenize_and_remove_junk(
                        query):
                    corrected = corrected + " " + Part4.word_correction(
                        term, trie_dict, bigram_ds)
                print("corrected: ", corrected)
            elif command2 == 5:
                Part3.part3_1and2(trie_dict)
            elif command2 == 6:
                Part3.part3_3(trie_dict)
            elif command2 == 7:
                # Ranked retrieval: correct the query, then score docs in
                # the vector space.
                query = input("inout your query: ")
                corrected = ""
                for term in PREPROCESSOR.simple_tokenize_and_remove_junk(
                        query):
                    corrected = corrected + " " + Part4.word_correction(
                        term, trie_dict, bigram_ds)
                query_tokens = PREPROCESSOR.simple_tokenize_and_remove_junk(
                    corrected)
                relev_docs = part5.get_related_docId_list(
                    query_tokens, idf_query, term_to_num, doc_space)
                print("results: ", relev_docs)
            elif command2 == 8:
                # Same as 7 but with the proximity-aware ranking variant.
                query = input("inout your query: ")
                corrected = ""
                for term in PREPROCESSOR.simple_tokenize_and_remove_junk(
                        query):
                    corrected = corrected + " " + Part4.word_correction(
                        term, trie_dict, bigram_ds)
                query_tokens = PREPROCESSOR.simple_tokenize_and_remove_junk(
                    corrected)
                relev_docs = part5.get_related_docId_list_proximity_version(
                    query_tokens, idf_query, term_to_num, doc_space,
                    trie_dict)
                print("results: ", relev_docs)
if not (term in intersection_map): intersection_map[term] = set() intersection_map[term].add(bigram) union_map[term].add(bigram) for term in list(union_map.keys()): term_bi_list = list(set(dictionary.Bigram.get_bis_of_term(term))) for bigram in term_bi_list: union_map[term].add(bigram) jaccard_score_list = [] for term in list(union_map.keys()): jaccard_score_list.append( (term, 1.0 * len(intersection_map[term]) / len(union_map[term]))) jaccard_score_list.sort(key=lambda tup: tup[1], reverse=True) jaccard_score_list = jaccard_score_list[0:KK] editDistance_score_list = [] for i in range(KK): term = jaccard_score_list[i][0] editDistance_score_list.append((term, edit_distance(wrong_word, term))) editDistance_score_list.sort(key=lambda tup: tup[1], reverse=False) return editDistance_score_list[0][0] trie_d, bigram = dictionary.build_dictionary() print("dic built") print(word_correction("build", trie_d, bigram))