# Imports used by the functions in this section (csv_wrapper and gram_data are
# project-local modules; gen_cost_dict() and flatten() are defined elsewhere in
# the project).
import itertools
from itertools import combinations, permutations
from operator import itemgetter

import numpy as np
from tabulate import tabulate
from tqdm import tqdm

import csv_wrapper
import gram_data


def gen_2gram_dict_separated():
    """Load the raw 2-gram counts and return them as {word: count}."""
    data = csv_wrapper.load_csv('2gram_original_no_edit.csv')
    separated_2gram_dict = {
        word: int(count)
        for count, word, gram_type in data
    }
    return separated_2gram_dict


def show_good_data_rows():
    """Pretty-print the top candidate layouts saved by gen_good_data_rows()."""
    home_keys = 'うたんとかの。し'
    good_data_rows = csv_wrapper.load_csv('good_data_rows.csv')
    prev_cost1each = 0
    prev_cost2each = 0
    for row in good_data_rows[:100]:
        print_list = []
        cost2fin, layer1keys, layer2keys = row
        print_list.append(home_keys)
        print_list.append(layer1keys)
        print_list.append(layer2keys)
        # per-finger 1-gram and 2-gram costs of each (home, layer1, layer2) triple
        cost1each = [
            gram_data.get_1gram_cost(keys)
            for keys in zip(home_keys, layer1keys, layer2keys)
        ]
        print_list.append(cost1each)
        cost2each = [
            gram_data.get_2gram_cost(keys)
            for keys in zip(home_keys, layer1keys, layer2keys)
        ]
        print_list.append(cost2each)
        cost1all = sum(cost1each)
        # skip rows whose per-finger costs are identical to the previous row
        if prev_cost1each == cost1each and prev_cost2each == cost2each:
            print('same')
            continue
        prev_cost1each = cost1each
        prev_cost2each = cost2each
        print(tabulate(print_list))
        print(cost2fin, cost1all)
        break


def gen_good_data_rows():
    """For each candidate set of second-layer keys, evaluate every assignment of
    the remaining keys to fingers and keep the 10000 cheapest combinations."""
    home_keys = 'うたんとかの。し'
    not_home_keys = 'てくなにきはこるがでっょすま:;'
    cost_dict = gen_cost_dict(home_keys, not_home_keys)
    csv_ = csv_wrapper.load_csv('final_output.csv')
    # one flat 0/1 selector per permutation of the 8 rest keys:
    # entry (finger * 8 + rest_index) is 1 iff that permutation puts that rest
    # key on that finger
    petterns_ = [[1 if i == e else 0 for i in ints for e in range(8)]
                 for ints in permutations(range(8))]
    # petterns_ = petterns_[:100]
    petterns = list(zip(*petterns_))
    np_petterns = np.array(petterns)
    good_data_rows = []
    for row in tqdm(csv_):
        tuple_str, score = row
        second_keys = eval(tuple_str)
        rest_keys = list(set(not_home_keys) - set(second_keys))
        # flat cost table: cost of putting rest key rk on the finger holding (hk, ok)
        cost_list = [
            cost_dict[(hk, ok, rk)]
            for hk, ok in zip(home_keys, second_keys)
            for rk in rest_keys
        ]
        # one matrix product evaluates every permutation at once
        costs = np.dot(cost_list, np_petterns)
        costs_with_permutation = [
            (cost, second_keys, rest_key_pattern)
            for cost, rest_key_pattern in zip(costs, permutations(rest_keys))
        ]
        good_data_rows.extend(costs_with_permutation)
    good_data_rows.sort(key=itemgetter(0), reverse=False)
    good_data_rows = good_data_rows[:10000]
    good_data_rows = [
        (cost, ''.join(second_keys), ''.join(rest_key_pattern))
        for cost, second_keys, rest_key_pattern in good_data_rows
    ]
    csv_wrapper.save_csv('good_data_rows.csv', good_data_rows)


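# The selector matrix built in gen_good_data_rows() can be hard to read, so here
# is a small self-contained sketch of the same trick with made-up costs: each
# permutation of the "rest" keys becomes a flat 0/1 column over the
# (finger, rest_key) cost table, and a single matrix product yields the cost of
# every permutation at once. The helper name and the numbers are illustrative only.
def _demo_permutation_matrix_costs():
    import numpy as np
    from itertools import permutations

    n = 3                                    # three fingers, three rest keys
    rest_keys = ['a', 'b', 'c']
    # cost[f][r] = hypothetical cost of putting rest key r on finger f
    cost = np.array([[1, 2, 3],
                     [4, 5, 6],
                     [7, 8, 9]])
    cost_flat = cost.reshape(-1)             # same flat layout as cost_list above

    # entry (f * n + r) of each column is 1 iff the permutation sends finger f
    # to rest key index r
    perms = list(permutations(range(n)))
    selectors = np.array([[1 if perm[f] == r else 0
                           for f in range(n) for r in range(n)]
                          for perm in perms]).T

    costs = cost_flat @ selectors            # cost of every permutation at once

    # cross-check against the naive per-permutation sum
    for perm, c in zip(perms, costs):
        assert c == sum(cost[f][perm[f]] for f in range(n))
        print([rest_keys[i] for i in perm], c)

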
def _save_1gram():
    """Clean the raw 1-gram counts and save them as 1gram.csv."""
    data = csv_wrapper.load_csv('1gram_original_no_edit.csv')
    dict_ = dict()
    for count, word, gram_type in data:
        count = int(count)
        dict_[word] = count
    # drop the placeholder character and fold 、 into 。
    del dict_['〓']
    dict_['。'] += dict_['、']
    del dict_['、']
    print(dict_)
    csv_wrapper.save_csv('1gram.csv', dict_.items())


def calc_hand_of_first_layer():
    """Split the 8 finger assignments (plus い) into left/right hands and rank
    the splits by the total of the per-hand 2-gram costs."""
    home_keys = 'うたんとかの。し'
    good_data_rows = csv_wrapper.load_csv('good_data_rows.csv')
    """example of the best row, as printed by show_good_data_rows():
    ----- ----- ------ ----- ----- ----- ----- -----
    う    た    ん     と    か    の    。    し
    く    な    る     て    き    に    :     は
    ;     で    っ     ま    こ    が    ょ    す
    86814 83163 104495 85031 89982 88830 73629 93979
    339   1361  549    1191  1470  2372  82    2257
    ----- ----- ------ ----- ----- ----- ----- -----
    9621 705923
    """
    # take only the best (first) row
    for row in good_data_rows[:100]:
        print_list = []
        cost2fin, layer1keys, layer2keys = row
        break
    # one (home, layer1, layer2) tuple per finger, plus い as its own entry
    fingers = [keys for keys in zip(home_keys, layer1keys, layer2keys)]
    fingers.extend([
        'い',
    ])
    data_rows = []
    for r_fins in combinations(fingers, 5):
        l_fins = set(fingers) - set(r_fins)
        # print(r_fins)
        r_letters = flatten(r_fins)
        l_letters = flatten(l_fins)
        if 'い' not in r_letters:
            continue
        r_hand_1cost = gram_data.get_1gram_cost(r_letters)
        l_hand_1cost = gram_data.get_1gram_cost(l_letters)
        r_hand_2cost = gram_data.get_2gram_cost(r_letters)
        l_hand_2cost = gram_data.get_2gram_cost(l_letters)
        cross_hand_cost = r_hand_2cost + l_hand_2cost
        data_rows.append((cross_hand_cost, l_hand_2cost, r_hand_2cost,
                          l_hand_1cost, r_hand_1cost, l_fins, r_fins))
    data_rows.sort(key=itemgetter(0), reverse=False)
    for (cross_hand_cost, l_hand_2cost, r_hand_2cost,
         l_hand_1cost, r_hand_1cost, l_fins, r_fins) in data_rows:
        fin_print = list(l_fins)
        fin_print.extend(r_fins)
        fin_print = [''.join(x) for x in fin_print]
        l_fins_1costs = [gram_data.get_1gram_cost(l_fin) for l_fin in l_fins]
        r_fins_1costs = [gram_data.get_1gram_cost(r_fin) for r_fin in r_fins]
        cost_print = list(l_fins_1costs)
        cost_print.extend(r_fins_1costs)
        all_print = list(zip(fin_print, cost_print))
        all_print = list(zip(*all_print))
        print(tabulate(all_print))
        print(cross_hand_cost, l_hand_2cost, r_hand_2cost, l_hand_1cost, r_hand_1cost)


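# calc_hand_of_first_layer() relies on a flatten() helper that is defined
# elsewhere in the project. Judging only from how it is called above (on an
# iterable of per-finger key tuples and the bare string 'い'), it presumably
# joins everything into one flat sequence of letters; a minimal stand-in under
# that assumption could look like this. This is a sketch, not the project's
# actual helper.
def _flatten_sketch(groups):
    # join tuples of characters and plain strings into a single string of letters
    return ''.join(''.join(group) for group in groups)

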
def _convert_2gram_both_direction():
    """Build a direction-independent 2-gram table: each pair's count is added to
    both orderings, missing pairs get 0, and 、 is folded into 。."""
    data = csv_wrapper.load_csv('2gram_original_no_edit.csv')
    two_gram_dict = dict()
    both_direction_two_gram_dict = dict()
    for count, word, gram_type in data:
        count = int(count)
        two_gram_dict[word] = count
        if word in both_direction_two_gram_dict:
            both_direction_two_gram_dict[word] += count
        else:
            both_direction_two_gram_dict[word] = count
        reversed_word = word[1] + word[0]
        if reversed_word in both_direction_two_gram_dict:
            both_direction_two_gram_dict[reversed_word] += count
        else:
            both_direction_two_gram_dict[reversed_word] = count
    """sanity check on the sums"""
    print(two_gram_dict['あい'])
    print(two_gram_dict['いあ'])
    print(both_direction_two_gram_dict['あい'])
    print(both_direction_two_gram_dict['いあ'])
    """put 0 for pairs missing because they are too infrequent"""
    one_gram_dict = load_one_gram()
    keys = one_gram_dict.keys()
    for k0, k1 in itertools.product(keys, keys):
        key = ''.join([k0, k1])
        if key not in both_direction_two_gram_dict:
            both_direction_two_gram_dict[key] = 0
    """merge 、 into 。: add its counts to 。, then delete 、"""
    print(both_direction_two_gram_dict['す。'])
    print(both_direction_two_gram_dict['す、'])
    for word, count in both_direction_two_gram_dict.items():
        if '、' in word:
            replaced_word = word.replace('、', '。')
            both_direction_two_gram_dict[replaced_word] += count
    del_keys = [
        word for word, count in both_direction_two_gram_dict.items()
        if '、' in word
    ]
    for del_key in del_keys:
        del both_direction_two_gram_dict[del_key]
    print(both_direction_two_gram_dict['す。'])
    print('す、' in both_direction_two_gram_dict)
    csv_wrapper.save_csv('2gram_no_order.csv', both_direction_two_gram_dict.items())


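# A toy illustration of the direction merge performed in
# _convert_2gram_both_direction(): the counts of 'あい' and 'いあ' end up on both
# orderings, so later lookups do not depend on key order. The counts here are
# invented for the example.
def _demo_both_direction_merge():
    raw = {'あい': 3, 'いあ': 1}
    merged = {}
    for word, count in raw.items():
        for key in (word, word[1] + word[0]):
            merged[key] = merged.get(key, 0) + count
    print(merged)  # {'あい': 4, 'いあ': 4}

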
def load_one_gram():
    one_gram_list = csv_wrapper.load_csv('1gram.csv')
    one_gram_dict = dict(one_gram_list)
    one_gram_dict = {k: int(v) for k, v in one_gram_dict.items()}
    return one_gram_dict


def load_ngram_dict():
    one_gram_dict = load_one_gram()
    two_gram_list = csv_wrapper.load_csv('2gram_no_order.csv')
    two_gram_dict = dict(two_gram_list)
    two_gram_dict = {k: int(v) for k, v in two_gram_dict.items()}
    return one_gram_dict, two_gram_dict
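

# One way to exercise the loaders above: look up the corpus count of a single
# kana and of an unordered kana pair. The kana chosen here are arbitrary, and
# the printed values depend on the generated CSV files.
def _demo_ngram_lookup():
    one_gram_dict, two_gram_dict = load_ngram_dict()
    print(one_gram_dict.get('か', 0))    # 1-gram count of か
    print(two_gram_dict.get('かん', 0))  # direction-merged count of the pair か/ん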