def open_t(filename):
    """Load a translation model ``t`` previously written by ``save_t``.

    The file's first line holds the default probability; each following
    line maps a key pair (separated by "qq") to a probability, with
    " q:q " separating key from value.

    Returns a defaultdict keyed by 2-tuples of strings.
    """
    with open(get_path("/t_models/" + filename), "r") as model_file:
        default_value = float(model_file.readline())
        t = defaultdict(lambda: default_value)
        for row in model_file:
            row = row.strip("\n")
            key_part, value_part = row.split(" q:q ")
            t[tuple(key_part.split("qq"))] = float(value_part)
    return t
def get_translations_results(filename):
    """Parse a results log into per-fold lists of predictions.

    A line matching "Edit distance = N" marks the start of the next
    fold; every subsequent "predict: ..." line is appended to that
    fold's list. Any other line is reported on stdout.

    Returns: defaultdict mapping fold index (int, starting at 1) to a
    list of prediction strings.
    """
    predicts: dict = defaultdict(list)
    ed_re = re.compile(r"Edit distance = (\d+)\n")
    pred_re = re.compile(r"predict: (.*)\n")
    fold = 0
    with open(get_path(filename), "r") as file:
        for line in file:
            # match each pattern once per line (the original matched twice)
            ed_match = ed_re.match(line)
            pred_match = pred_re.match(line)
            if ed_match:
                # the edit-distance value itself is unused here; the line
                # only signals the start of a new fold
                fold += 1
            elif pred_match:
                predicts[fold].append(pred_match.group(1))
            else:
                print("non matched line: ", line)
    return predicts
def get_programming_symbols_map():
    """Build a map from programming symbols to sets of spoken names.

    Source: https://blog.codinghorror.com/ascii-pronunciation-rules-for-programmers/
    Parses data/programming_symbols.csv: a row whose first cell does not
    start with a letter begins a new symbol; remaining cells list names,
    one per embedded newline, with parenthesised remarks stripped.
    Composite keys such as "( )" are then split into individual symbols,
    pairing each with its half of every "open/close"-style name.
    """
    symbol_to_name = {}
    with open(get_path("data/programming_symbols.csv"), "r") as csvfile:
        csvfile.readline()  # skip the header row
        rows = csv.reader(csvfile, delimiter=",")
        current_symbol = ""
        for row in rows:
            if not row[0][0].isalpha():
                # new symbol row: commit the names gathered for the
                # previous symbol first
                if current_symbol:
                    if current_symbol == "\\\"":
                        symbol_to_name["\""] = current_names
                    else:
                        symbol_to_name[current_symbol] = current_names
                current_symbol = row[0]
                current_names = set()
                row = row[1:]
            for cell in row:
                current_names.update([
                    re.sub(r'\(.*\)', "", x.strip(" "))
                    for x in cell.split("\n")
                ])
        # NOTE(review): names collected for the FINAL symbol are never
        # committed to symbol_to_name — confirm the CSV ends with a
        # sentinel row, otherwise the last symbol is silently dropped.
    # Split composite keys like "( )" into individual symbols.
    keys_to_remove = []
    new_symbols = {}
    for key in symbol_to_name.keys():
        if " " in key:
            keys_to_remove.append(key)
            symbols = key.split(" ")
            values = [set(), set()]
            for name in symbol_to_name[key]:
                # (removed a no-op `name.replace(" / ", "/")` whose result
                # was discarded; the strip below makes it redundant anyway)
                individual = [x.strip(" ") for x in name.split("/")]
                text = re.search(r' .*$', individual[1])
                if text:
                    # carry a trailing noun ("bracket", "brace", ...) from
                    # the closing half over to the opening half as well
                    individual[0] += text.group(0)
                values[0].add(individual[0])
                values[1].add(individual[1])
            new_symbols[symbols[0]] = values[0]
            new_symbols[symbols[1]] = values[1]
    for key in keys_to_remove:
        symbol_to_name.pop(key)
    symbol_to_name.update(new_symbols)
    return symbol_to_name
def get_translations_omega(filename):
    """Parse a results log into per-omega lists of predictions.

    An "omega X" line starts a new group; subsequent "predict: ..."
    lines are collected into the current group. "Edit distance = N"
    lines are recognised but their value is not used here.

    Returns: dict mapping omega (float) to a list of prediction strings.
    """
    predicts: dict = {}
    omega = -1
    ed_re = re.compile(r"Edit distance = (\d+)\n")
    o_re = re.compile(r"omega ([0-9.]+)\n")
    pred_re = re.compile(r"predict: (.*)\n")
    omega_preds = []
    with open(get_path(filename), "r") as file:
        for line in file:
            # match each pattern once per line (the original matched twice)
            o_match = o_re.match(line)
            pred_match = pred_re.match(line)
            if ed_re.match(line):
                pass  # edit-distance value is not needed in this parser
            elif o_match:
                # flush predictions gathered under the previous omega
                if omega != -1:
                    predicts[omega] = omega_preds
                    omega_preds = []
                omega = float(o_match.group(1))
            elif pred_match:
                omega_preds.append(pred_match.group(1))
            # print(line)
    predicts[omega] = omega_preds
    return predicts
def open_phrase_table(filename):
    """Read a phrase table from /phrase_table/<filename>.

    Expected format: a first line "Default = <float>", then sections
    headed by "********** f = <phrase>" containing "<e>: <prob>" lines.
    Returns a nested defaultdict: f -> e -> probability, defaulting to
    the file's declared default value.
    """

    def report_error():
        print("ERROR: file not formed correctly")

    state = "DEFAULT"
    current_f = "qq"
    with open(get_path("/phrase_table/" + filename), 'r') as table_file:
        for raw_line in table_file:
            entry = raw_line.strip("\n")
            if state == "DEFAULT":
                # the very first line must declare the default probability
                if entry.startswith("Default = "):
                    default_val = float(entry[len("Default = "):])
                    phrase_table = defaultdict(
                        lambda: defaultdict(lambda: default_val))
                    state = "FIND_VALS"
                else:
                    report_error()
                    break
            elif state == "FIND_VALS":
                if entry.startswith("********** f"):
                    current_f = entry.split(" = ")[1]
                elif ": " in entry:
                    e, prob = entry.rsplit(": ", 1)
                    phrase_table[current_f][e] = float(prob)
    return phrase_table
def get_edit_distances_from_file(filename):
    """Collect (omega, edit_distance) pairs from a results log.

    Handles both separate "omega X" / "Edit distance = N" lines and the
    fused form "Edit distance = Nomega X" where the two were written
    without an intervening newline. An edit distance is attributed to
    the omega value most recently seen before it.
    """
    ed_re = re.compile(r"Edit distance = (\d+)\n")
    o_re = re.compile(r"omega ([0-9.]+)\n")
    both_re = re.compile(r"Edit distance = (\d+)omega ([0-9.]+)\n")
    results = []
    omega = -1
    with open(get_path(filename), "r") as file:
        for line in file:
            ed_match = ed_re.match(line)
            o_match = o_re.match(line)
            both_match = both_re.match(line)
            if ed_match:
                if omega != -1:
                    results.append((omega, int(ed_match.group(1))))
            elif o_match:
                omega = float(o_match.group(1))
            elif both_match:
                # fused line: record the distance under the previous
                # omega, then switch to the new one
                if omega != -1:
                    results.append((omega, int(both_match.group(1))))
                omega = float(both_match.group(2))
            # else:
            #     print(line)
    return results
def get_rule_based_translations_from_file(filename):
    """Return one whitespace-split token list per line of the file."""
    with open(get_path(filename), "r") as file:
        return [line.strip("\n").split(" ") for line in file]
# print(test_pair) splits = get_splits([test_pair], "enhanced") # for split in splits: # print(" ".join(split[0])) # print(" ".join(split[1])) # print() elif test_num == 15: results = get_results_from_file("logs/results_split_v2.txt", "split") print(results) results = get_results_for_traditional_files( "logs/results_split_v2.txt", "split") print(results) print(sum(results[0])) else: # RESULTS log_result_files = os.listdir(get_path("logs")) log_result_files = [ x for x in log_result_files if x.startswith("results") and not x.endswith("logs") ] for filename in log_result_files: # if filename != "results_enhanced_new.txt": # continue type_split = str(filename.split("_")[1]) if "enhanced" in filename: type_split = "enhanced" elif "split" in filename: type_split = "split" else: type_split = "none"
def get_programming_symbols_map():
    """Build a map from programming symbols to sets of spoken names.

    Parses data/programming_symbols.csv: a row whose first cell does
    not start with a letter begins a new symbol; the remaining cells
    list names (one per embedded newline, parenthesised remarks
    stripped). Composite keys such as "( )" are then split into the
    individual symbols, pairing each with its half of every
    "open/close"-style name.
    """
    # source = https://blog.codinghorror.com/ascii-pronunciation-rules-for-programmers/
    symbol_to_name = {}
    with open(get_path("data/programming_symbols.csv"), "r") as csvfile:
        csvfile.readline()  # skip the header row
        rows = csv.reader(csvfile, delimiter=",")
        current_symbol = ""
        for row in rows:
            # a first cell not starting with a letter introduces a new symbol
            if not row[0][0].isalpha():
                if current_symbol:
                    # the escaped double-quote symbol is stored unescaped
                    if current_symbol == "\\\"":
                        symbol_to_name["\""] = current_names
                    else:
                        symbol_to_name[current_symbol] = current_names
                current_symbol = row[0]
                current_names = set()
                row = row[1:]
            for cell in row:
                current_names.update([re.sub(r'\(.*\)', "", x.strip(" "))
                                      for x in cell.split("\n")])
    # Split composite keys like "( )" into their individual symbols.
    keys_to_remove = []
    new_symbols = {}
    for key in symbol_to_name.keys():
        if " " in key:
            keys_to_remove.append(key)
            symbols = key.split(" ")
            values = [set(), set()]
            for name in symbol_to_name[key]:
                # NOTE(review): str.replace returns a new string, so this
                # statement is a no-op as written — presumably intended as
                # `name = name.replace(" / ", "/")`; the strip below makes
                # it redundant in practice.
                name.replace(" / ", "/")
                individual = [x.strip(" ") for x in name.split("/")]
                # carry a trailing noun ("bracket", ...) from the closing
                # half of the name over to the opening half as well
                text = re.search(r' .*$', individual[1])
                if text:
                    individual[0] += text.group(0)
                values[0].add(individual[0])
                values[1].add(individual[1])
            new_symbols[symbols[0]] = values[0]
            new_symbols[symbols[1]] = values[1]
    for key in keys_to_remove:
        symbol_to_name.pop(key)
    symbol_to_name.update(new_symbols)
    # Hand-tuned vocabulary adjustments, currently disabled:
    # symbol_to_name["\\n"] = set(["newline", "backslash n"])
    # symbol_to_name["*"].add("multiplied by")
    # symbol_to_name["*"].add("multiply")
    # symbol_to_name["*"].add("times by")
    # symbol_to_name["%"].add("percent")
    # symbol_to_name["-"].add("subtract")
    # symbol_to_name["="].add("equal")
    # symbol_to_name["="].add("is")
    # symbol_to_name["="].remove("gets")
    # symbol_to_name["="].remove("takes")
    # symbol_to_name["="].add("is equal to")
    # symbol_to_name["="].add("is set to")
    # symbol_to_name["/"].add("divided by")
    # symbol_to_name["/"].add("divided")
    # symbol_to_name["/"].add("divide")
    # symbol_to_name["/"].add("div")
    # symbol_to_name[">"].add("is greater than")
    # symbol_to_name[">"].add("larger than")
    # symbol_to_name[">"].add("bigger than")
    # symbol_to_name["<"].remove("from")
    # symbol_to_name[">"].remove("into")
    # symbol_to_name["<"].add("is less than")
    # symbol_to_name["<"].add("smaller than")
    # symbol_to_name["("].add("open bracket")
    # symbol_to_name[")"].add("close bracket")
    # symbol_to_name["["].add("square bracket")
    # symbol_to_name["["].add("open square bracket")
    # symbol_to_name["["].remove("opening bracket")
    # symbol_to_name["]"].remove("closing bracket")
    # symbol_to_name["]"].add("close square bracket")
    # symbol_to_name[":"].remove("dots")
    return symbol_to_name
def save_t(t: defaultdict, filename):
    """Write a translation model to /t_models/<filename>.

    First line: the model's default probability; then one
    "<e>qq<f> q:q <prob>" line per entry (format read back by open_t).
    """
    with open(get_path("/t_models/" + filename), "w") as out:
        out.write(f"{t.default_factory()}\n")
        for key, prob in t.items():
            out.write(f"{key[0]}qq{key[1]} q:q {prob}\n")
if __name__ == "__main__": toks,deps = load_dep_parse.get_token_deps(base_dir=base_dir_2) # for tok, dep in zip(toks, deps): # out = get_output_string(tok,dep) # print(out) # for i in range (40,49): # print("***************",i) # print(get_output_string(toks[i],deps[i])) # toks,deps = load_dep_parse.get_token_deps(base_dir=base_dir_1) # with open(get_path("/results/traditional_train.txt"),"w+") as file: # for tok,dep in zip(toks[:49],deps[:49]): # file.write(get_output_string(tok,dep)) # file.write('\n') toks,deps = load_dep_parse.get_token_deps(base_dir=base_dir_1) with open(get_path("/results/traditional_test1.txt"),"w+") as file: for tok,dep in zip(toks[49:],deps[49:]): file.write(get_output_string(tok,dep)) file.write('\n') # toks,deps = load_dep_parse.get_token_deps(base_dir=base_dir_2) # with open(get_path("/results/traditional_test2.txt"),"w+") as file: # for tok,dep in zip(toks,deps): # file.write(get_output_string(tok,dep)) # file.write('\n')
import os

from tools.find_resource_in_project import get_path

# One-off maintenance script: rewrite every file under results/ so that
# each occurrence of the token "return" reads "output".
log_result_files = os.listdir(get_path("results"))
# log_result_files = [x for x in log_result_files
#                     if x.startswith("results") and not x.endswith("logs")]
# (removed an unused `x = False` flag that nothing read)
for filename in log_result_files:
    print(filename)
    with open(get_path("results/" + filename), "r") as file:
        text = file.read()
    text = text.replace("return", "output")
    # write the modified text back over the original file
    with open(get_path("results/" + filename), "w") as file:
        file.write(text)
from collections import defaultdict
from math import inf, log
from scipy.stats import norm
from tools.find_resource_in_project import get_path

# When True, presumably stops the alignment model from mapping variables
# to NULL — flag is consulted elsewhere in this module; TODO confirm.
PREVENT_VARIABLE_TO_NULL_MAP = True
# 1.96 is the z-score of a 95% normal interval, so this presumably turns
# a 95% half-width into a standard deviation — verify against callers.
D_SIGMA = lambda x: x / 1.96

# Cache of default distortion values keyed by (i, j, l, m),
# persisted at d_cache_path.
d_cache = {}
d_cache_path = get_path("/default_d_cache/cache.txt")


def load_d_cache():
    """Populate d_cache from the cache file ("i j l m value" per line)."""
    count = 0
    with open(d_cache_path, "r") as file:
        for line in file.readlines():
            count += 1
            # NOTE(review): str.strip returns a new string, so this line
            # is a no-op; int()/float() below tolerate the trailing
            # newline anyway.
            line.strip("\n")
            maps = line.split(" ")
            if len(maps) < 5:
                # report malformed lines before the indexing below fails
                print(count, maps, line)
            d_cache[(int(maps[0]), int(maps[1]), int(maps[2]), int(maps[3]))] = float(maps[4])


load_d_cache()
# Kept open in append mode for the process lifetime so new cache entries
# can be written as they are computed.
# NOTE(review): this handle is never explicitly closed.
d_cache_file = open(d_cache_path, "a")


def default_d(i, j, l, m):
    # Fast path: return the cached value when present.
    # (function continues beyond this chunk)
    if (i, j, l, m) in d_cache:
        return d_cache[(i, j, l, m)]