def evaluate_predictions(**kwargs):
    """Score ``find_closest_pinyin`` against ``search_utils.chinese_names``.

    For every row the English name and the target pinyin are normalized,
    the predicted pinyin is compared with the target, and the pinyin edit
    distance of each mismatch is accumulated. Progress and a final summary
    are printed; nothing is returned.

    Args:
        **kwargs: Forwarded unchanged to ``find_closest_pinyin``.
    """
    all_names = search_utils.chinese_names
    total = all_names.shape[0]
    distance = 0
    diff_count = 0

    for row_i, row in all_names.iterrows():
        if row_i % 50 == 0:
            print("{}% complete".format(100 * row_i / total))

        english, _, _, target_pinyin, _, _, _ = row
        english = search_utils.normalize(english)
        # Spaces are stripped from the target so it can be compared with the
        # space-free joined prediction below.
        target_pinyin = ''.join(
            ch for ch in search_utils.normalize(target_pinyin) if ch != ' ')

        # Names that start/end with a vowel are padded with 'S' / 'E' before
        # being handed to the search.
        if search_utils.is_vowel(english[0]):
            english = 'S' + english
        if search_utils.is_vowel(english[-1]):
            english = english + 'E'

        output_pinyin = ''.join(find_closest_pinyin(english, **kwargs)[1])
        if output_pinyin != target_pinyin:
            diff_count += 1
            distance += edit_distance.edit_distance_pinyin(
                target_pinyin, output_pinyin)

    # Guard both averages: an empty table or a perfect score must not raise
    # ZeroDivisionError while printing the summary.
    avg_all = distance / total if total else 0.0
    avg_diff = distance / diff_count if diff_count else 0.0
    print(
        "Out of {} names, {} were different, with an average edit distance of {} ({} for just the different pairs)"
        .format(total, diff_count, avg_all, avg_diff))
def evaluate_predictions():
    """Score the ``baseline`` transliteration against ``search_utils.chinese_names``.

    Each English name is run through ``baseline`` and converted with
    ``pinyin``; the first element of every resulting segment is joined into
    the predicted pinyin string and compared with the normalized,
    space-stripped target. Progress and a final summary are printed; nothing
    is returned.
    """
    all_names = search_utils.chinese_names
    total = all_names.shape[0]
    distance = 0
    diff_count = 0

    for row_i, row in all_names.iterrows():
        if row_i % 20 == 0:
            print("{}% complete".format(100 * row_i / total))

        english, _, _, target_pinyin, _, _, _ = row
        result = baseline(english)
        result_pinyin = pinyin(result)

        target_pinyin = ''.join(
            ch for ch in search_utils.normalize(target_pinyin) if ch != ' ')
        output_pinyin = ''.join(seg[0] for seg in result_pinyin)

        if output_pinyin != target_pinyin:
            diff_count += 1
            distance += edit_distance.edit_distance_pinyin(
                target_pinyin, output_pinyin)

    # Guard both averages: an empty table or a perfect score must not raise
    # ZeroDivisionError while printing the summary.
    avg_all = distance / total if total else 0.0
    avg_diff = distance / diff_count if diff_count else 0.0
    print(
        "Out of {} names, {} were different, with an average edit distance of {} ({} for just the different pairs)"
        .format(total, diff_count, avg_all, avg_diff))
def process_baseline(oracle_csv):
    """Compare the stored baseline pinyin with the two oracle annotations.

    Reads ``Sheet1`` of the oracle workbook (columns ``English``,
    ``Pinyin_O1``, ``Pinyin_O2``) and ``Sheet1`` of
    ``../data/proposal/BaselineResponses.xlsx`` (column ``Baseline``). For
    every row the pinyin edit distance from the baseline to each oracle is
    printed, and the mean of the two distances is accumulated.

    Args:
        oracle_csv: Path to the oracle workbook. Despite the name it is
            opened as an Excel file.

    Returns:
        A tuple ``(average_distance, diff_count, total)`` where
        ``average_distance`` is taken over ALL names, ``diff_count`` is the
        number of rows whose baseline pinyin differs from at least one
        oracle, and ``total`` is the number of names in the oracle sheet.
    """
    # Context managers make sure the workbook handles are closed.
    with pd.ExcelFile(oracle_csv) as oracle_book:
        df = oracle_book.parse('Sheet1')
    baseline_path = os.path.join(
        "..", "data", "proposal", "BaselineResponses.xlsx")
    with pd.ExcelFile(baseline_path) as baseline_book:
        df_baseline_pinyin = baseline_book.parse('Sheet1')

    names = df["English"]
    o1 = df["Pinyin_O1"]
    o2 = df["Pinyin_O2"]
    bp = df_baseline_pinyin["Baseline"]

    distance = 0
    diff_count = 0
    for name, name1, name2, baseline_pinyin in zip(names, o1, o2, bp):
        # Count a row as different when the BASELINE pinyin disagrees with an
        # oracle. The previous check compared the English name with the
        # oracle pinyin, which is practically always unequal and therefore
        # made the count equal to the number of rows.
        if baseline_pinyin != name1 or baseline_pinyin != name2:
            diff_count += 1

        baseline_guess = baseline.baseline(name)
        print(baseline_guess)

        dist_o1 = edit_distance.edit_distance_pinyin(baseline_pinyin, name1)
        print("Distance between", baseline_pinyin, "and", name1, ":", dist_o1)
        dist_o2 = edit_distance.edit_distance_pinyin(baseline_pinyin, name2)
        print("Distance between", baseline_pinyin, "and", name2, ":", dist_o2)
        distance += ((dist_o1 + dist_o2) / 2)

    # Take the average over ALL names; an empty sheet yields 0.0 instead of
    # raising ZeroDivisionError.
    total = len(names)
    average = distance / total if total else 0.0
    return (average, diff_count, total)
def process_oracle(oracle_csv):
    """Measure how much the two oracle annotations disagree with each other.

    Reads ``Sheet1`` of the oracle workbook and compares the ``Pinyin_O1``
    and ``Pinyin_O2`` columns row by row. Only differing pairs are printed
    and contribute to the accumulated distance.

    Args:
        oracle_csv: Path to the oracle workbook. Despite the name it is
            opened as an Excel file.

    Returns:
        A tuple ``(average_distance, diff_count, total)`` where
        ``average_distance`` is taken over ALL rows (not just the differing
        ones), ``diff_count`` is the number of differing rows and ``total``
        is the number of rows.
    """
    # Context manager makes sure the workbook handle is closed.
    with pd.ExcelFile(oracle_csv) as oracle_book:
        df = oracle_book.parse('Sheet1')

    o1 = df["Pinyin_O1"]
    o2 = df["Pinyin_O2"]

    distance = 0
    diff_count = 0
    for name1, name2 in zip(o1, o2):
        if name1 == name2:
            continue
        diff_count += 1
        # Per the original author's notes on edit_distance_pinyin:
        # 1. penalizes wrong tones as 0.5
        # 2. doesn't do the "count characters in common" thing because these
        #    are longer
        dist = edit_distance.edit_distance_pinyin(name1, name2)
        # Only print stuff that's different.
        print("Distance between", name1, "and", name2, ":", dist)
        distance += dist

    # Take the average over ALL names, not just the ones that were wrong; an
    # empty sheet yields 0.0 instead of raising ZeroDivisionError.
    total = len(o1)
    average = distance / total if total else 0.0
    return (average, diff_count, total)
def evaluateAllLines(encoder, decoder):
    """Evaluate the encoder/decoder on every entry of the module-level ``pairs``.

    Each ``pair[0]`` is decoded with ``evaluate``; the output and the target
    ``pair[1]`` are compared with spaces removed, and the pinyin edit
    distance of each mismatch is accumulated. A summary is printed; nothing
    is returned.

    Args:
        encoder: Encoder passed through to ``evaluate``.
        decoder: Decoder passed through to ``evaluate``.
    """
    total = len(pairs)
    distance = 0
    diff_count = 0

    for pair in pairs:
        decoded = evaluate(encoder, decoder, pair[0])
        # Drop the trailing <EOS> entry, then remove the spaces for edit
        # distance calculations, for consistency with the baseline.
        output_name = ''.join(tok for tok in decoded[:-1] if tok != ' ')
        target_name = ''.join(ch for ch in pair[1] if ch != ' ')

        if output_name != target_name:
            diff_count += 1
            distance += edit_distance.edit_distance_pinyin(
                target_name, output_name)

    # Guard both averages: empty `pairs` or a perfect score must not raise
    # ZeroDivisionError while printing the summary.
    avg_all = distance / total if total else 0.0
    avg_diff = distance / diff_count if diff_count else 0.0
    print(
        "Out of {} names, {} were different, with an average edit distance of {} ({} for just the different pairs)"
        .format(total, diff_count, avg_all, avg_diff))