def _merge_counts(target, source):
    """Fold the entries of *source* into *target*, in place.

    For every ``key -> entry`` pair in *source*:

    * key absent from *target*: a deep copy of the entry is inserted;
    * key present, entry has no ``"count"``: *target* is left untouched;
    * key present, both sides have ``"count"``: the counts are added;
    * key present, only the incoming entry has ``"count"``: the existing
      entry is replaced by a deep copy of the incoming one.

    NOTE(review): the branch pairing above follows the most natural reading
    of the original nested ``if``/``else`` chain -- confirm it matches the
    intended merge rules.
    """
    for key, entry in source.items():
        if key not in target:
            # Deep copy so later edits to the merged table never leak back
            # into the reader's own dictionaries.
            target[key] = copy.deepcopy(entry)
            continue
        if "count" not in entry:
            continue
        existing = target[key]
        if "count" in existing:
            existing["count"] = existing["count"] + entry["count"]
        else:
            target[key] = copy.deepcopy(entry)


def _merger_dictionary(name):
    """Read the dictionary file *name* and merge it into the global tables.

    The file is loaded through ``readFile`` (its previous state is cleared
    first); its unigram entries are then merged into ``UNIGRAM_DICT`` and its
    n-gram entries into ``NGRAM_DICT``. Both module-level dictionaries are
    mutated in place; nothing is returned.
    """
    readFile.clear()
    readFile._read_file(name)

    # Unigram
    print("Add uni")
    _merge_counts(UNIGRAM_DICT, readFile.unigramDict)

    # Ngram
    print("Add nrg ")
    _merge_counts(NGRAM_DICT, readFile.ngramDict)
def _main_v3(s):
    """Read every file in directory *s* and merge it via the v2 CSV merger.

    For each directory entry a progress line ``"<file> <index>/<total>"`` is
    printed, ``readFile`` is cleared and reloaded from that file, and its
    unigram and n-gram dictionaries are passed to ``_merge_dic_to_csv_v2``
    (second argument ``True`` for the unigram table, ``False`` for the
    n-gram table).

    Args:
        s: Path of the directory to scan. A trailing separator is accepted
            but no longer required.
    """
    # List the directory once: the total is loop-invariant, and re-listing on
    # every iteration costs a full directory scan per file.
    entries = os.listdir(s)
    total = len(entries)
    for i, file in enumerate(entries):
        print(file + " {}/{}".format(i, total))
        readFile.clear()
        # os.path.join gives the same path as ``s + file`` when *s* already
        # ends with a separator, and a correct one when it does not.
        readFile._read_file(os.path.join(s, file))
        _merge_dic_to_csv_v2(readFile.unigramDict, True)
        _merge_dic_to_csv_v2(readFile.ngramDict, False)
def _tool():
    """Rebuild ``DicTest.txt`` from its current content plus one source folder.

    Steps, in order:

    1. Load ``DicTest.txt`` through ``readFile`` and deep-copy its unigram
       and n-gram dictionaries into the module globals ``UNIGRAM_DICT`` and
       ``NGRAM_DICT``.
    2. Call ``_main_v2`` on the directory ``path + folder + "/"``.
    3. Call the ``readFile`` probability helpers on the global tables.
    4. Write both tables back to ``DicTest.txt`` through ``writeFile``.

    ``path`` and ``folder`` are read from module scope.
    """
    global UNIGRAM_DICT
    global NGRAM_DICT

    dictionary_file = "DicTest.txt"

    readFile.clear()
    readFile._read_file(dictionary_file)
    UNIGRAM_DICT = copy.deepcopy(readFile.unigramDict)
    NGRAM_DICT = copy.deepcopy(readFile.ngramDict)

    print("start")
    # The range covers a single value and the index is not used to build the
    # directory name, so exactly one folder is processed.
    for _ in range(20, 21):
        source_dir = path + folder + "/"
        print(source_dir)
        _main_v2(source_dir)

    print("get pro uni")
    readFile._get_probability_uni(UNIGRAM_DICT)

    print("get pro ngr")
    readFile._get_probability_ngram(UNIGRAM_DICT, NGRAM_DICT)

    writeFile._write_dictionary(UNIGRAM_DICT, NGRAM_DICT, dictionary_file)
def _first_run_v2():
    """Load ``text.txt`` and pass its dictionaries to the v2 CSV merger.

    ``readFile`` is cleared and reloaded from ``text.txt``; its unigram table
    is then handed to ``_merge_dic_to_csv_v2`` with ``True`` and its n-gram
    table with ``False``.
    """
    readFile.clear()
    readFile._read_file('text.txt')

    unigram_table = readFile.unigramDict
    _merge_dic_to_csv_v2(unigram_table, True)

    # Fetched only after the unigram merge, matching the original call order.
    ngram_table = readFile.ngramDict
    _merge_dic_to_csv_v2(ngram_table, False)
def _first_run():
    """Load the main Vietnamese dictionary file and pass it to the CSV merger.

    ``readFile`` is cleared and reloaded from
    ``main_vi.dict_chinh_quy_bo_ky_tu.txt``; its unigram table is then handed
    to ``_merge_dic_to_csv`` with ``True`` and its n-gram table with
    ``False``.
    """
    readFile.clear()
    readFile._read_file('main_vi.dict_chinh_quy_bo_ky_tu.txt')

    unigram_table = readFile.unigramDict
    _merge_dic_to_csv(unigram_table, True)

    # Fetched only after the unigram merge, matching the original call order.
    ngram_table = readFile.ngramDict
    _merge_dic_to_csv(ngram_table, False)