def open_t(filename):
    """Load a t-model from /t_models/<filename>.

    The first line of the file holds the default probability; each
    following line encodes one entry as "<k0>qq<k1> q:q <value>"
    (the format written by save_t).  Returns a defaultdict mapping
    key tuples to floats, falling back to the default for unseen keys.
    """
    with open(get_path("/t_models/" + filename), "r") as file:
        # Parse the default once instead of re-running float() on
        # every defaultdict miss (the original lambda kept the raw
        # string and converted it each call).
        default = float(file.readline())
        t = defaultdict(lambda: default)
        for line in file:  # stream the file; no need to materialize readlines()
            k, v = line.strip("\n").split(" q:q ")
            t[tuple(k.split("qq"))] = float(v)
    return t
def get_translations_results(filename):
    """Parse a results log into {fold_number: [predicted lines]}.

    A line "Edit distance = N" marks the start of a new fold; each
    "predict: ..." line is appended to the current fold's list.  Any
    other line is reported on stdout.
    """
    predicts: dict = defaultdict(list)
    ed_re = re.compile(r"Edit distance = (\d+)\n")
    pred_re = re.compile(r"predict: (.*)\n")
    fold = 0
    with open(get_path(filename), "r") as file:
        for line in file:
            # Match each pattern once per line (previously every
            # pattern was matched twice: in the test and in the body).
            if ed_re.match(line):
                # The edit-distance value itself is unused here; the
                # line only delimits folds.
                fold += 1
            else:
                pred_match = pred_re.match(line)
                if pred_match:
                    predicts[fold].append(pred_match.group(1))
                else:
                    print("non matched line: ", line)
    return predicts
Exemple #3
0
def get_programming_symbols_map():
    # source = https://blog.codinghorror.com/ascii-pronunciation-rules-for-programmers/
    """Map each programming symbol to the set of its spoken names.

    Reads data/programming_symbols.csv (header line skipped).  A row
    whose first cell does not start with a letter introduces a new
    symbol; its remaining cells, plus the cells of any continuation
    rows, list that symbol's names.  Keys containing a space (paired
    symbols such as brackets) are afterwards split into two separate
    entries, dividing each "open/close"-style name on '/'.
    """
    symbol_to_name = {}

    def _commit(symbol, names):
        # The CSV escapes the double quote as \" — store it unescaped.
        if symbol == "\\\"":
            symbol_to_name["\""] = names
        else:
            symbol_to_name[symbol] = names

    with open(get_path("data/programming_symbols.csv"), "r") as csvfile:
        csvfile.readline()  # skip the header row
        rows = csv.reader(csvfile, delimiter=",")
        current_symbol = ""
        current_names = set()  # initialized so a leading continuation row cannot NameError
        for row in rows:
            if not row[0][0].isalpha():
                # A new symbol starts: commit the previous one first.
                if current_symbol:
                    _commit(current_symbol, current_names)
                current_symbol = row[0]
                current_names = set()
                row = row[1:]
            for cell in row:
                # Drop parenthesised remarks and surrounding spaces;
                # cells may hold several names separated by newlines.
                current_names.update([
                    re.sub(r'\(.*\)', "", x.strip(" "))
                    for x in cell.split("\n")
                ])
        # BUG FIX: the final symbol group was never stored — the loop
        # only committed a group when the *next* symbol appeared.
        if current_symbol:
            _commit(current_symbol, current_names)
    keys_to_remove = []
    new_symbols = {}
    for key in symbol_to_name.keys():
        if " " in key:
            # Paired symbols share one key like "( )"; split them.
            keys_to_remove.append(key)
            symbols = key.split(" ")
            values = [set(), set()]
            for name in symbol_to_name[key]:
                # FIX: str.replace returns a new string; the original
                # discarded the result, making the call a no-op.
                name = name.replace(" / ", "/")
                individual = [x.strip(" ") for x in name.split("/")]
                # A trailing qualifier on the second half (e.g.
                # "open/close bracket") is copied onto the first.
                text = re.search(r' .*$', individual[1])
                if text:
                    individual[0] += text.group(0)
                values[0].add(individual[0])
                values[1].add(individual[1])
            new_symbols[symbols[0]] = values[0]
            new_symbols[symbols[1]] = values[1]
    for key in keys_to_remove:
        symbol_to_name.pop(key)
    symbol_to_name.update(new_symbols)
    return symbol_to_name
def get_translations_omega(filename):
    """Parse a results log into {omega_value: [predicted lines]}.

    Lines "omega X" begin a new group; "predict: ..." lines are
    collected into the current group.  "Edit distance" lines are
    recognised but carry no data used here.
    """
    predicts: dict = {}
    omega = -1  # sentinel: no "omega" line seen yet
    ed_re = re.compile(r"Edit distance = (\d+)\n")
    o_re = re.compile(r"omega ([0-9.]+)\n")
    pred_re = re.compile(r"predict: (.*)\n")
    omega_preds = []
    with open(get_path(filename), "r") as file:
        for line in file:
            # Each pattern is matched at most once per line (the
            # original matched in the test and again in the body).
            if ed_re.match(line):
                continue  # edit-distance lines are ignored
            o_match = o_re.match(line)
            if o_match:
                # Flush the finished group before starting a new one.
                if omega != -1:
                    predicts[omega] = omega_preds
                omega_preds = []
                omega = float(o_match.group(1))
                continue
            pred_match = pred_re.match(line)
            if pred_match:
                omega_preds.append(pred_match.group(1))
            # print(line)
    # The last group is closed by end-of-file, not by an omega line.
    predicts[omega] = omega_preds
    return predicts
def open_phrase_table(filename):
    """Load a phrase table from /phrase_table/<filename>.

    Expected format: a first line "Default = <float>", then sections
    headed "********** f = <f-phrase>" whose "<e-phrase>: <prob>"
    lines give translation probabilities.  Returns a nested
    defaultdict phrase_table[f][e] -> float whose missing entries
    fall back to the file's default value.
    """
    def report_error():
        # PEP 8 (E731): a def instead of a lambda bound to a name.
        print("ERROR: file not formed correctly")

    state = "DEFAULT"
    current_f = "qq"
    with open(get_path("/phrase_table/" + filename), 'r') as file:
        for line in file:
            line = line.strip("\n")
            if state == "DEFAULT":
                if line.startswith("Default = "):
                    default_val = float(line[len("Default = "):])
                    phrase_table = defaultdict(lambda: defaultdict(lambda: default_val))
                    state = "FIND_VALS"
                else:
                    # Malformed header: report and stop parsing.
                    # NOTE(review): phrase_table is unbound on this
                    # path, so the return below raises
                    # UnboundLocalError — behaviour preserved.
                    report_error()
                    break
            elif state == "FIND_VALS":
                if line.startswith("********** f"):
                    current_f = line.split(" = ")[1]
                elif ": " in line:
                    # rsplit guards against ": " inside the e-phrase.
                    e, prob = line.rsplit(": ", 1)
                    phrase_table[current_f][e] = float(prob)
    return phrase_table
def get_edit_distances_from_file(filename):
    """Collect (omega, edit_distance) pairs from a results log.

    "omega X" lines set the current omega; "Edit distance = N" lines
    record (omega, N) for it.  Pairs seen before the first omega line
    (sentinel -1) are dropped.
    """
    ed_re = re.compile(r"Edit distance = (\d+)\n")
    o_re = re.compile(r"omega ([0-9.]+)\n")
    # Some logs fuse both values onto one line with no separator.
    both_re = re.compile(r"Edit distance = (\d+)omega ([0-9.]+)\n")
    results = []
    omega = -1  # sentinel: no omega seen yet
    with open(get_path(filename), "r") as file:
        for line in file:
            # Match each pattern once per line (previously every
            # pattern was matched twice: in the test and in the body).
            ed_match = ed_re.match(line)
            if ed_match:
                ed = int(ed_match.group(1))
                if omega != -1:
                    results.append((omega, ed))
                continue
            o_match = o_re.match(line)
            if o_match:
                omega = float(o_match.group(1))
                continue
            both_match = both_re.match(line)
            if both_match:
                # Record the pair for the *previous* omega, then start
                # the new omega carried on the same line.
                ed = int(both_match.group(1))
                if omega != -1:
                    results.append((omega, ed))
                omega = float(both_match.group(2))
            # else:
            #     print(line)
    return results
def get_rule_based_translations_from_file(filename):
    """Read one space-tokenised prediction per line from *filename*.

    Returns a list of token lists, one per line of the file.
    """
    with open(get_path(filename), "r") as source:
        return [row.strip("\n").split(" ") for row in source]
        # print(test_pair)
        splits = get_splits([test_pair], "enhanced")
        # for split in splits:
        #     print(" ".join(split[0]))
        #     print(" ".join(split[1]))
        #     print()
    elif test_num == 15:
        results = get_results_from_file("logs/results_split_v2.txt", "split")
        print(results)
        results = get_results_for_traditional_files(
            "logs/results_split_v2.txt", "split")
        print(results)
        print(sum(results[0]))
    else:
        # RESULTS
        log_result_files = os.listdir(get_path("logs"))
        log_result_files = [
            x for x in log_result_files
            if x.startswith("results") and not x.endswith("logs")
        ]

        for filename in log_result_files:
            # if filename != "results_enhanced_new.txt":
            #     continue
            type_split = str(filename.split("_")[1])
            if "enhanced" in filename:
                type_split = "enhanced"
            elif "split" in filename:
                type_split = "split"
            else:
                type_split = "none"
def get_programming_symbols_map():
    # source = https://blog.codinghorror.com/ascii-pronunciation-rules-for-programmers/
    """Map each programming symbol to the set of its spoken names.

    Reads data/programming_symbols.csv (header line skipped).  A row
    whose first cell does not start with a letter introduces a new
    symbol; its remaining cells, plus the cells of any continuation
    rows, list that symbol's names.  Keys containing a space (paired
    symbols such as brackets) are afterwards split into two separate
    entries, dividing each "open/close"-style name on '/'.
    """
    symbol_to_name = {}

    def _commit(symbol, names):
        # The CSV escapes the double quote as \" — store it unescaped.
        if symbol == "\\\"":
            symbol_to_name["\""] = names
        else:
            symbol_to_name[symbol] = names

    with open(get_path("data/programming_symbols.csv"), "r") as csvfile:
        csvfile.readline()  # skip the header row
        rows = csv.reader(csvfile, delimiter=",")
        current_symbol = ""
        current_names = set()  # initialized so a leading continuation row cannot NameError
        for row in rows:
            if not row[0][0].isalpha():
                # A new symbol starts: commit the previous one first.
                if current_symbol:
                    _commit(current_symbol, current_names)
                current_symbol = row[0]
                current_names = set()
                row = row[1:]
            for cell in row:
                # Drop parenthesised remarks and surrounding spaces;
                # cells may hold several names separated by newlines.
                current_names.update([
                    re.sub(r'\(.*\)', "", x.strip(" "))
                    for x in cell.split("\n")
                ])
        # BUG FIX: the final symbol group was never stored — the loop
        # only committed a group when the *next* symbol appeared.
        if current_symbol:
            _commit(current_symbol, current_names)
    keys_to_remove = []
    new_symbols = {}
    for key in symbol_to_name.keys():
        if " " in key:
            # Paired symbols share one key like "( )"; split them.
            keys_to_remove.append(key)
            symbols = key.split(" ")
            values = [set(), set()]
            for name in symbol_to_name[key]:
                # FIX: str.replace returns a new string; the original
                # discarded the result, making the call a no-op.
                name = name.replace(" / ", "/")
                individual = [x.strip(" ") for x in name.split("/")]
                # A trailing qualifier on the second half (e.g.
                # "open/close bracket") is copied onto the first.
                text = re.search(r' .*$', individual[1])
                if text:
                    individual[0] += text.group(0)
                values[0].add(individual[0])
                values[1].add(individual[1])
            new_symbols[symbols[0]] = values[0]
            new_symbols[symbols[1]] = values[1]
    for key in keys_to_remove:
        symbol_to_name.pop(key)
    symbol_to_name.update(new_symbols)
    # symbol_to_name["\\n"] = set(["newline", "backslash n"])
    # symbol_to_name["*"].add("multiplied by")
    # symbol_to_name["*"].add("multiply")
    # symbol_to_name["*"].add("times by")
    # symbol_to_name["%"].add("percent")
    # symbol_to_name["-"].add("subtract")
    # symbol_to_name["="].add("equal")
    # symbol_to_name["="].add("is")
    # symbol_to_name["="].remove("gets")
    # symbol_to_name["="].remove("takes")
    # symbol_to_name["="].add("is equal to")
    # symbol_to_name["="].add("is set to")
    # symbol_to_name["/"].add("divided by")
    # symbol_to_name["/"].add("divided")
    # symbol_to_name["/"].add("divide")
    # symbol_to_name["/"].add("div")
    # symbol_to_name[">"].add("is greater than")
    # symbol_to_name[">"].add("larger than")
    # symbol_to_name[">"].add("bigger than")
    # symbol_to_name["<"].remove("from")
    # symbol_to_name[">"].remove("into")
    # symbol_to_name["<"].add("is less than")
    # symbol_to_name["<"].add("smaller than")
    # symbol_to_name["("].add("open bracket")
    # symbol_to_name[")"].add("close bracket")
    # symbol_to_name["["].add("square bracket")
    # symbol_to_name["["].add("open square bracket")
    # symbol_to_name["["].remove("opening bracket")
    # symbol_to_name["]"].remove("closing bracket")
    # symbol_to_name["]"].add("close square bracket")
    # symbol_to_name[":"].remove("dots")
    return symbol_to_name
def save_t(t: defaultdict, filename):
    """Write a t-model to /t_models/<filename>.

    Line one holds the model's default value; each later line encodes
    one entry as "<k0>qq<k1> q:q <value>" (the format open_t reads).
    """
    with open(get_path("/t_models/" + filename), "w") as out:
        out.write("{}\n".format(t.default_factory()))
        for key, value in t.items():
            record = "{}qq{} q:q {}\n".format(key[0], key[1], value)
            out.write(record)
Exemple #11
0
if __name__ == "__main__":
    # Build the rule-based translation output for the second data set.
    # NOTE(review): load_dep_parse, base_dir_1/base_dir_2 and
    # get_output_string come from elsewhere in the project.
    toks,deps = load_dep_parse.get_token_deps(base_dir=base_dir_2)
    # for tok, dep in zip(toks, deps):
    #     out = get_output_string(tok,dep)
    #     print(out)

    # for i in range (40,49):
    #     print("***************",i)
    #     print(get_output_string(toks[i],deps[i]))

    # toks,deps = load_dep_parse.get_token_deps(base_dir=base_dir_1)
    # with open(get_path("/results/traditional_train.txt"),"w+") as file:
    #     for tok,dep in zip(toks[:49],deps[:49]):
    #         file.write(get_output_string(tok,dep))
    #         file.write('\n')

    # Reload from the first base dir and write one output string per
    # (token, dependency) pair — pairs 49 onward form the test split
    # (the first 49 were used for the train file above, now disabled).
    toks,deps = load_dep_parse.get_token_deps(base_dir=base_dir_1)
    with open(get_path("/results/traditional_test1.txt"),"w+") as file:
        for tok,dep in zip(toks[49:],deps[49:]):
            file.write(get_output_string(tok,dep))
            file.write('\n')



    # toks,deps = load_dep_parse.get_token_deps(base_dir=base_dir_2)
    # with open(get_path("/results/traditional_test2.txt"),"w+") as file:
    #     for tok,dep in zip(toks,deps):
    #         file.write(get_output_string(tok,dep))
    #         file.write('\n')
Exemple #12
0
import os

from tools.find_resource_in_project import get_path

# Rewrite every file under results/, replacing the text "return" with
# "output" in place.
log_result_files = os.listdir(get_path("results"))
# log_result_files = [x for x in log_result_files if x.startswith("results") and not x.endswith("logs")]
# (removed unused flag variable `x = False` — it was never read)
for filename in log_result_files:
    print(filename)
    with open(get_path("results/" + filename), "r") as file:
        text = file.read()
    # NOTE(review): blanket textual replacement — this also rewrites
    # "return" occurring inside longer words; confirm that is intended.
    text = text.replace("return", "output")
    with open(get_path("results/" + filename), "w") as file:
        file.write(text)
from collections import defaultdict
from math import inf,log
from scipy.stats import norm

from tools.find_resource_in_project import get_path

PREVENT_VARIABLE_TO_NULL_MAP = True


def D_SIGMA(x):
    # PEP 8 (E731): named def instead of a lambda bound to a name;
    # the public name D_SIGMA is kept so callers are unaffected.
    # NOTE(review): 1.96 is presumably the 95% normal z-value used to
    # turn an interval half-width into a sigma — confirm.
    return x / 1.96


# Cache of precomputed values keyed by (i, j, l, m); populated from
# the cache file by load_d_cache() below.
d_cache = {}
d_cache_path = get_path("/default_d_cache/cache.txt")


def load_d_cache():
    """Populate the module-level d_cache from the file at d_cache_path.

    Each line holds five space-separated fields: i j l m value.  The
    first four become an int key tuple, the last the float value.
    """
    count = 0
    with open(d_cache_path, "r") as file:
        for line in file:
            count += 1
            # FIX: str.strip returns a new string; the original
            # discarded the result, leaving the trailing newline on
            # the last field (float() tolerated it, but parsing and
            # the diagnostic below were off).
            line = line.strip("\n")
            maps = line.split(" ")
            if len(maps) < 5:
                # Diagnostic for malformed lines; the indexing below
                # will still raise IndexError for them, as before.
                print(count, maps, line)
            d_cache[(int(maps[0]), int(maps[1]), int(maps[2]), int(maps[3]))] = float(maps[4])


# Warm the cache from disk at import time, then keep the cache file
# open in append mode so later computations can persist new entries.
# NOTE(review): the handle is deliberately held for the module's
# lifetime and never closed here.
load_d_cache()
d_cache_file = open(d_cache_path, "a")
def default_d(i,j,l,m):
    if (i,j,l,m) in d_cache:
        return d_cache[(i,j,l,m)]