Ejemplo n.º 1
0
 def reset(self):
     """Clear the module-level dictionaries and reset the collaborators.

     Rebinds the globals ``cache_dictionary``, ``final_dictionary`` and
     ``documents_dictionary`` to ``None``, then calls ``reset()`` on
     ``Indexer``, ``Writer``, ``Stemmer`` and ``Reader``.
     """
     global cache_dictionary, final_dictionary, documents_dictionary
     cache_dictionary = final_dictionary = documents_dictionary = None
     # Reset order: Indexer, Writer, Stemmer, Reader.
     for component in (Indexer, Writer, Stemmer, Reader):
         component.reset()
Ejemplo n.º 2
0
def _absorb_pending(terms_to_updated, term, sum_tf, list_doc):
    """Fold the totals parked under *term* into the running totals.

    If ``terms_to_updated`` holds an entry for *term*, its term frequency is
    added to ``sum_tf`` and its documents are merged into ``list_doc`` in
    place (parked documents win on key clashes).  Returns the new ``sum_tf``.
    """
    if term in terms_to_updated:
        parked_tf, parked_docs = terms_to_updated[term]
        sum_tf = sum_tf + parked_tf
        list_doc.update(parked_docs)
    return sum_tf


def _park_for_lowercase(terms_to_updated, term, sum_tf, list_doc):
    """Store the totals under the lower-case *term* for a later merge.

    Anything already parked under *term* is folded in first, so repeated
    capitalised variants accumulate into a single ``(sum_tf, list_doc)`` entry.
    """
    sum_tf = _absorb_pending(terms_to_updated, term, sum_tf, list_doc)
    terms_to_updated[term] = (sum_tf, list_doc)


def merge_all_posting(stemming_mode, posting_id, number_doc_in_corpus,
                      the_final_terms_dictionary, cach_dictionary, all_city,
                      max_doc_city):
    """Merge the temporary posting files into the final postings.

    Opens ``TempPostings1.txt`` .. ``TempPostings<posting_id>.txt`` and
    repeatedly takes the smallest pending term across all of them, summing
    the term frequency and document frequency and merging the document maps
    of every file whose head line carries that term.

    A term whose first character is upper case is not written immediately
    when its lower-case form is known to the stemmer dictionary (or, with
    stemming on, is one of the stemmer's values): its totals are parked in a
    local table and folded into the lower-case term when that term is merged.
    This depends on ``min()`` yielding the capitalised term before its
    lower-case form.

    Args:
        stemming_mode: Passed to ``init_path``; the value it returns is then
            compared with ``'yes'`` / ``'no'`` to pick the stemmer dictionary.
        posting_id: Index of the last temporary posting file to read
            (files are numbered from 1).
        number_doc_in_corpus: Forwarded unchanged to
            ``calculations_and_income_to_final_dictionary``.
        the_final_terms_dictionary: Forwarded unchanged to
            ``calculations_and_income_to_final_dictionary``.
        cach_dictionary: Cleared, then filled with ``term -> merged sum_tf``
            for every term that is written out.
        all_city: Forwarded unchanged to
            ``calculations_and_income_to_final_dictionary``.
        max_doc_city: Forwarded unchanged to
            ``calculations_and_income_to_final_dictionary``.

    Returns:
        The value of ``sum_numbers``.
    """
    path_folder_posting, path_folder_abc_posting, stemming_mode, city_path = init_path(
        stemming_mode)
    print("merge_all_posting")
    number_of_line_in_abc_posting = {}
    all_final_posting_path = create_final_posting(
        path_folder_abc_posting, number_of_line_in_abc_posting, city_path)
    term_first_line_postings = {}
    freq_sum_doc_first_line_postings = {}
    the_open_posting_file = {}
    stemm_dictionary_values = []
    # NOTE(review): stemm_dictionary stays unbound for any other mode value;
    # presumably init_path only ever returns 'yes' or 'no' - confirm.
    if stemming_mode == 'yes':
        stemm_dictionary = Stemmer.get_dictionary()
        stemm_dictionary_values = Stemmer.get_dictionary_value()
    elif stemming_mode == 'no':
        stemm_dictionary = Stemmer.get_dictionary_without_stemming()
    cach_dictionary.clear()
    terms_to_updated = {}  # lower-case term -> (sum_tf, list_doc) parked totals
    close_file = {}

    def write_final(term, sum_tf, df, list_doc):
        # Record the merged frequency and hand the term to the final posting.
        cach_dictionary[term] = sum_tf
        calculations_and_income_to_final_dictionary(
            list_doc, sum_tf, df, number_doc_in_corpus, term,
            all_final_posting_path, number_of_line_in_abc_posting,
            the_final_terms_dictionary, all_city, max_doc_city)

    try:
        # Load the first line of each temp posting.
        for index_file_of_posting in range(1, posting_id + 1):
            # "\\T" spells the same backslash + "T" the path always contained,
            # without relying on an invalid escape sequence.
            file_path = (path_folder_posting + "\\TempPostings" +
                         str(index_file_of_posting) + '.txt')
            curr_posting_file = open(file_path, "r")
            the_open_posting_file[index_file_of_posting] = curr_posting_file
            close_file[index_file_of_posting] = False
            find_first_line(curr_posting_file, index_file_of_posting,
                            term_first_line_postings,
                            freq_sum_doc_first_line_postings, close_file)

        finish = False
        # Stop as well when no posting has a pending head line; min() on an
        # empty collection would raise ValueError.
        while not finish and term_first_line_postings:
            min_term = min(term_first_line_postings.values())
            all_posting_file_with_equal_term = []
            list_doc = {}
            sum_tf = 0
            df = 0
            for index, term in term_first_line_postings.items():
                if term != min_term:
                    continue
                all_posting_file_with_equal_term.append(index)
                head = freq_sum_doc_first_line_postings[index]
                sum_tf = sum_tf + int(head[0])
                df = df + int(head[1])
                list_doc.update(head[2])

            # NOTE(review): df is not adjusted when parked totals are folded
            # into a lower-case term below - confirm this is intended.
            if min_term[0].isupper():
                lowercase_term = min_term.lower()
                if lowercase_term in stemm_dictionary:
                    if stemming_mode == 'yes':
                        parked_under = stemm_dictionary[lowercase_term]
                    else:
                        parked_under = lowercase_term
                    _park_for_lowercase(terms_to_updated, parked_under,
                                        sum_tf, list_doc)
                elif (stemming_mode == 'yes'
                      and lowercase_term in stemm_dictionary_values):
                    _park_for_lowercase(terms_to_updated, lowercase_term,
                                        sum_tf, list_doc)
                else:
                    # No lower-case counterpart known: keep the capitalised form.
                    write_final(min_term, sum_tf, df, list_doc)
            else:
                sum_tf = _absorb_pending(terms_to_updated, min_term, sum_tf,
                                         list_doc)
                write_final(min_term, sum_tf, df, list_doc)

            # Advance every file whose head line was just consumed.
            for i in all_posting_file_with_equal_term:
                find_first_line(the_open_posting_file[i], i,
                                term_first_line_postings,
                                freq_sum_doc_first_line_postings, close_file)

            finish = check_if_finish(close_file)
    finally:
        # close() is a no-op on an already closed file, so this is safe even
        # if find_first_line closed a handle itself.
        for posting_file in the_open_posting_file.values():
            posting_file.close()

    close_all_files(all_final_posting_path)
    Stemmer.reset()
    reset_temp_posting()
    # NOTE(review): sum_numbers is not assigned in this function; presumably a
    # module-level name - confirm.
    return sum_numbers