Code example #1
import pickle

import dictionary

def part3_3(original_trie: dictionary.Trie):
    # Build the dictionary from scratch if no trie was passed in.
    if original_trie is None:
        original_trie, _ = dictionary.build_dictionary()

    ### compress and save part
    a0, a1, a2 = compress_gamma(original_trie)
    save_bitstring(a1, "a1.txt")
    save_bitstring(a2, "a2.txt")
    with open("a0.pkl", "wb") as f:
        pickle.dump(a0, f)
    print("Posting lists compressed and saved in a0.pkl, a1.txt, a2.txt")

    ### load and decompress part
    a1 = load_bitstring("a1.txt")
    a2 = load_bitstring("a2.txt")
    with open("a0.pkl", "rb") as f:
        a0 = pickle.load(f)
    recovered_trie = decompress_gamma(a0, a1, a2)
    print("Posting lists loaded and decompressed")

    # Verify that the compression round trip was lossless.
    if check_equality_trie(original_trie, recovered_trie):
        print("Original and recovered versions are identical")
    else:
        print("Original and recovered versions are NOT identical!!!")
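A note on the helpers: save_bitstring and load_bitstring are project functions whose definitions are not shown here. A minimal sketch of what they might look like, assuming a1 and a2 are BitArray objects from the bitstring package (consistent with the a1.len usage in code example #4) and that the files store a plain '0'/'1' text string:

from bitstring import BitArray

def save_bitstring(bits: BitArray, path: str):
    # Persist the bit pattern as a text string of '0' and '1' characters.
    with open(path, "w") as f:
        f.write(bits.bin)

def load_bitstring(path: str) -> BitArray:
    # Rebuild the BitArray from the saved '0'/'1' text.
    with open(path) as f:
        return BitArray(bin=f.read())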
Code example #2
def prepare_data_as_list():
    filename = "Data.csv"
    trie_dict, _ = dictionary.build_dictionary(english_or_persian=LANGUAGE,
                                               filename=filename)
    # Flatten the trie into term nodes and sort them alphabetically.
    trie_list = trie_dict.to_list()
    trie_list.sort(key=lambda x: x.term)
    term_and_posting_list = [(a.term, a.posting_list) for a in trie_list]
    return term_and_posting_list
Code example #3
def prepare_data_as_list(train_not_test=True):
    # Choose the training or test split of the phase-2 dataset.
    if train_not_test:
        filename = "phase2_train.csv"
    else:
        filename = "phase2_test.csv"
    trie_dict, _ = dictionary.build_dictionary(english_or_persian=LANGUAGE, filename=filename)
    trie_list = trie_dict.to_list()
    trie_list.sort(key=lambda x: x.term)
    term_and_posting_list = [(a.term, a.posting_list) for a in trie_list]
    return term_and_posting_list
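A quick usage sketch: because the pairs come back sorted by term, each term can be given a stable row index, for example when building a term-document matrix later (term_to_row is a hypothetical name, not from the project):

pairs = prepare_data_as_list(train_not_test=True)
term_to_row = {term: i for i, (term, _) in enumerate(pairs)}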
Code example #4
from pympler import asizeof

import dictionary

def part3_1and2(trie_ds: dictionary.Trie):
    if trie_ds is None:
        trie_ds, _ = dictionary.build_dictionary()

    # Variable-byte compression: a1 and a2 are byte strings.
    a0, a1, a2 = compress_VB(trie_ds)
    ByteSize_VB = len(a1) + len(a2)

    # Gamma compression: a1 and a2 are bit strings, so convert bits to bytes.
    a0, a1, a2 = compress_gamma(trie_ds)
    ByteSize_gamma = int((a1.len + a2.len) / 8)
    ByteSize_initial = asizeof.asizeof(trie_ds)
    print("English Corpus:",
          "    whole initial ByteSize =", ByteSize_initial,
          "    VB ByteSize =", ByteSize_VB,
          "    gamma ByteSize =", ByteSize_gamma,
          "    term_to_ptr ByteSize =", asizeof.asizeof(a0))
Code example #5
def main():
    global term_to_num, idf_query, doc_space, bigram_ds, trie_dict
    input("press any button!")
    dic_is_built = False
    while True:
        lang = int(input("choose language:\n" "1- english\n" "2- persian\n"))
        if lang == 1:
            PREPROCESSOR = english_preprocessing
            all_docs_and_titles = file_handler.load_english_file()
        else:
            PREPROCESSOR = persian_preproccessing
            all_docs_and_titles = file_handler.load_persian_file()
        command: int = int(
            input("choose:\n"
                  "1- pre process my text\n"
                  "2- show frequent words\n"
                  "3- go to dict\n"))
        if command == 1:
            text = (input("enter your text:\n"))
            print("**pre processed:", PREPROCESSOR.pre_process_text(text))
        elif command == 2:
            rank = int(input("how many?"))
            tf_token_pairs = PREPROCESSOR.get_all_docs_tf_tokens()
            for pair in tf_token_pairs[:rank]:
                print(pair[0], ": ", pair[1])
        elif command == 3:
            if not dic_is_built:
                # Build the dictionary and vector-space index once, on first use.
                print("building dict...")
                trie_dict, bigram_ds = dictionary.build_dictionary(lang)
                doc_space, idf_query, term_to_num = part5.vector_space_preprocess(
                    trie_dict)
                dic_is_built = True
            command2 = int(
                input("1- show posting for my word\n"
                      "2- show positions of word in all docs\n"
                      "3- remove the doc\n"
                      "4- correct query\n"
                      "5- test compression\n"
                      "6- test compression recoverability\n"
                      "7- search query\n"
                      "8- search query (proximity)\n"))
            if command2 == 1:
                word = input("enter the word:\n")
                dictionary.show_posting_list(word, trie_dict)
            elif command2 == 2:
                word = input("enter the word:\n")
                dictionary.show_positions_in_all_docs(word, trie_dict)
            elif command2 == 3:
                doc_id = int(input("enter doc id:"))
                dictionary.remove_doc(all_docs_and_titles, doc_id, trie_dict)
            elif command2 == 4:
                query = input("enter [bad] query:")
                corrected = ""
                for term in PREPROCESSOR.simple_tokenize_and_remove_junk(
                        query):
                    corrected = corrected + " " + Part4.word_correction(
                        term, trie_dict, bigram_ds)
                print("corrected: ", corrected)
            elif command2 == 5:
                Part3.part3_1and2(trie_dict)
            elif command2 == 6:
                Part3.part3_3(trie_dict)
            elif command2 == 7:
                query = input("input your query: ")
                corrected = ""
                for term in PREPROCESSOR.simple_tokenize_and_remove_junk(
                        query):
                    corrected = corrected + " " + Part4.word_correction(
                        term, trie_dict, bigram_ds)
                query_tokens = PREPROCESSOR.simple_tokenize_and_remove_junk(
                    corrected)
                relev_docs = part5.get_related_docId_list(
                    query_tokens, idf_query, term_to_num, doc_space)
                print("results: ", relev_docs)
            elif command2 == 8:
                query = input("input your query: ")
                corrected = ""
                for term in PREPROCESSOR.simple_tokenize_and_remove_junk(
                        query):
                    corrected = corrected + " " + Part4.word_correction(
                        term, trie_dict, bigram_ds)
                query_tokens = PREPROCESSOR.simple_tokenize_and_remove_junk(
                    corrected)
                relev_docs = part5.get_related_docId_list_proximity_version(
                    query_tokens, idf_query, term_to_num, doc_space, trie_dict)
                print("results: ", relev_docs)
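Commands 4, 7, and 8 above repeat the same token-by-token correction loop. A hypothetical helper (not in the project) that factors it out, using only calls already present in main():

def correct_query(query, preprocessor, trie_dict, bigram_ds):
    # Tokenize, spell-correct each term, and rejoin into a single query string.
    terms = preprocessor.simple_tokenize_and_remove_junk(query)
    return " ".join(Part4.word_correction(t, trie_dict, bigram_ds) for t in terms)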
Code example #6
File: Part4.py    Project: hbaktash/MIR_class_project
            if term not in intersection_map:
                intersection_map[term] = set()
            intersection_map[term].add(bigram)
            union_map[term].add(bigram)

    # Add each candidate term's own bigrams to its union set.
    for term in union_map:
        union_map[term].update(dictionary.Bigram.get_bis_of_term(term))

    # Jaccard score for each candidate: |shared bigrams| / |all bigrams|.
    jaccard_score_list = []
    for term in union_map:
        shared = intersection_map.get(term, set())
        jaccard_score_list.append((term, len(shared) / len(union_map[term])))

    # Keep the KK candidates with the highest Jaccard scores.
    jaccard_score_list.sort(key=lambda tup: tup[1], reverse=True)
    jaccard_score_list = jaccard_score_list[:KK]
    # Re-rank the surviving candidates by edit distance to the misspelled word;
    # iterating over the truncated list avoids an IndexError when fewer than
    # KK candidates survive.
    editDistance_score_list = []
    for term, _ in jaccard_score_list:
        editDistance_score_list.append((term, edit_distance(wrong_word, term)))

    editDistance_score_list.sort(key=lambda tup: tup[1], reverse=False)
    return editDistance_score_list[0][0]


# Quick smoke test: build the dictionary, then correct a sample word.
trie_d, bigram = dictionary.build_dictionary()
print("dict built")
print(word_correction("build", trie_d, bigram))
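The fragment depends on two standard building blocks defined elsewhere in the project: character bigrams for the Jaccard filter and an edit-distance function for the final ranking. A minimal sketch of both, assuming '$'-padded bigrams and unit-cost Levenshtein operations (the project's actual get_bis_of_term and edit_distance may differ):

def bigrams_of(term: str) -> set:
    # Character bigrams with boundary markers, e.g. "cat" -> {"$c", "ca", "at", "t$"}.
    padded = "$" + term + "$"
    return {padded[i:i + 2] for i in range(len(padded) - 1)}

def edit_distance(a: str, b: str) -> int:
    # Levenshtein distance via the classic row-by-row dynamic program.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[-1]

For example, edit_distance("build", "built") is 1 (a single substitution).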