def get_nonprojectivity_ratios():
    """Return, per language, the share of trees that are non-projective.

    The training treebanks are located with ``lang_utils.get_ud_paths``
    (``format_='conllu'``, ``coarse=False``), loaded with
    ``udtree.from_files`` and filtered with ``get_np_trees``.

    Returns:
        dict: language name -> ``len(non-projective trees) / len(all trees)``.
        A language whose treebank yields no trees maps to ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    lang_trees = lang_utils.get_ud_paths(
        '../resources/universaldependencies1-2/universal-dependencies-1.2/',
        type_='train', format_='conllu', coarse=False)

    ratios = {}
    for lang, path in lang_trees.items():
        # Materialise the trees: they are counted first and filtered second.
        trees = list(udtree.from_files(path))
        total = len(trees)
        non_projective = len(get_np_trees(trees))
        # Guard against an empty treebank, which has no defined ratio.
        ratios[lang] = non_projective / total if total else 0.0
    return ratios
#!/usr/local/bin/python3
# TODO: Convert conll-u to conll-x and then back again
# To run MaltOptimizer, you have to be in the MaltOptimizer directory:
# cd {project_base}/tools/MaltOptimizer-1.0.3
# ~/dev/miniconda/bin/python3 ../train_maltparser.py
#
# Trains one MaltParser model per entry of `train_files` by running the
# MaltParser jar in "learn" mode on the first training file of the language.
import sys
from subprocess import call
from os.path import join
from os import rename

import lang_utils

treebank_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/resources/universaldependencies1-2/universal-dependencies-1.2/"
train_files = lang_utils.get_ud_paths(treebank_base, type_="train", format_="conllx")
# Only Czech is kept; every other language returned above is dropped.
train_files = {"Czech": train_files['Czech']}

project_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/"
maltparser_path = join(project_base, "tools", "maltparser-1.8.1", "maltparser-1.8.1.jar")
maltoptimizer_path = join(project_base, "tools", "MaltOptimizer-1.0.3", "MaltOptimizer.jar")

base_cmd = ["java", "-Xmx8G"]  # 8G is only necessary for Czech
jar_path = ["-jar", maltparser_path]
mode = ["-m", "learn"]

for lang, train_file in train_files.items():
    print("Training language {}".format(lang))
    # Only the first path listed for the language is used as training input.
    training_path = ["-i", train_file[0]]
    model_path = ["-c", "ud-1.2." + lang]
    exit_code = call(base_cmd + jar_path + ["-grl", "root"] + model_path + mode + training_path)
    # Report a failed run instead of silently moving on; keep going so the
    # remaining languages are still trained.
    if exit_code != 0:
        print("MaltParser training failed for {} (exit code {})".format(lang, exit_code),
              file=sys.stderr)
# TODO: Convert conll-u to conll-x and then back again
#
# Parses the "dev" file of every language with its MaltParser model and
# writes the result to resources/maltdefault_output_dev_1-2/<lang>.conllx.
import sys
from subprocess import call
from os.path import join
from os import remove
from shutil import copyfile

import lang_utils

project_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/"
treebank_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/resources/universaldependencies1-2/universal-dependencies-1.2/"
test_files = lang_utils.get_ud_paths(treebank_base, type_="dev", format_="conllx")
maltparser_path = join(project_base, "tools", "maltparser-1.8.1", "maltparser-1.8.1.jar")

base_cmd = ["java", "-Xmx8G"]
jar_path = ["-jar", maltparser_path]
mode = ["-m", "parse"]

for lang, test_file in test_files.items():
    # Get language's parsing model
    print("Parsing {} from {}".format(lang, test_file))
    model_path = ["-c", "ud-1.2." + lang]
    local_model = model_path[1] + ".mco"
    # The model is copied into the working directory for the duration of the
    # run (presumably because MaltParser resolves the -c name there -- confirm).
    copyfile(join(project_base, "resources", "maltdefault_models_1-2", local_model), local_model)
    try:
        # NOTE(review): if get_ud_paths returns absolute paths, os.path.join
        # returns test_file[0] unchanged and the prefix is ignored -- confirm.
        test_path = ["-i", join(treebank_base, lang, test_file[0])]
        output_path = ["-o", join(project_base, "resources", "maltdefault_output_dev_1-2", lang + ".conllx")]
        exit_code = call(base_cmd + jar_path + model_path + mode + test_path + output_path)
        # Report a failed run but continue with the remaining languages.
        if exit_code != 0:
            print("MaltParser parsing failed for {} (exit code {})".format(lang, exit_code),
                  file=sys.stderr)
    finally:
        # Always delete the temporary model copy, even if launching java raised.
        remove(local_model)

# java -jar maltparser-1.8.1.jar -c es-model -m learn -i ../../resources/universaldependencies1-1/ud-treebanks-v1.1/UD_Spanish/es-ud-dev.conllx
for i, head in zip(tree.ids, tree.heads): if head-1 < 0: continue else: for j, inner_head in zip(tree.ids[i:head-1], tree.heads[i:head-1]): if inner_head < i or inner_head > head: is_non_projective = True return is_non_projective def get_np_trees(trees): np = [] for tree in trees: if is_non_projective(tree): np.append(tree) return np def get_non-projectivity_ratios(): lang_trees = lang_utils.get_ud_paths('../resources/universaldependencies1-2/universal-dependencies-1.2/', type_='train', format_='conllu', coarse=False) np = dict() tot = dict() for lang, path in lang_trees.items(): trees = list(udtree.from_files(path)) tot[lang] = len(trees) np[lang] = get_np_trees(trees) rel = {} for lang, n in zip(np.keys(), np.values()): rel[lang] = len(n) / tot[lang]
# TODO: Convert conll-u to conll-x and then back again
#
# Parses the "test" file (coarse=True) of every language with its MaltParser
# model and writes the result to
# resources/maltdefault_coarse_output_test_1-2/<lang>.conllx.
import sys
from subprocess import call
from os.path import join
from os import remove
from shutil import copyfile

import lang_utils

project_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/"
treebank_base = "/Users/jimmy/dev/edu/nlp-rod/udeval/resources/universaldependencies1-2/universal-dependencies-1.2/"
test_files = lang_utils.get_ud_paths(treebank_base, type_="test", format_="conllx", coarse=True)
maltparser_path = join(project_base, "tools", "maltparser-1.8.1", "maltparser-1.8.1.jar")

base_cmd = ["java", "-Xmx8G"]
jar_path = ["-jar", maltparser_path]
mode = ["-m", "parse"]

for lang, test_file in test_files.items():
    # Get language's parsing model
    print("Parsing {} from {}".format(lang, test_file))
    model_path = ["-c", "ud-1.2." + lang]
    local_model = model_path[1] + ".mco"
    # The model is copied into the working directory for the duration of the
    # run (presumably because MaltParser resolves the -c name there -- confirm).
    copyfile(join(project_base, "resources", "maltdefault_coarse_models_1-2", local_model), local_model)
    try:
        # NOTE(review): if get_ud_paths returns absolute paths, os.path.join
        # returns test_file[0] unchanged and the prefix is ignored -- confirm.
        test_path = ["-i", join(treebank_base, lang, test_file[0])]
        output_path = ["-o", join(project_base, "resources", "maltdefault_coarse_output_test_1-2", lang + ".conllx")]
        exit_code = call(base_cmd + jar_path + model_path + mode + test_path + output_path)
        # Report a failed run but continue with the remaining languages.
        if exit_code != 0:
            print("MaltParser parsing failed for {} (exit code {})".format(lang, exit_code),
                  file=sys.stderr)
    finally:
        # Always delete the temporary model copy, even if launching java raised.
        remove(local_model)

# java -jar maltparser-1.8.1.jar -c es-model -m learn -i ../../resources/universaldependencies1-1/ud-treebanks-v1.1/UD_Spanish/es-ud-dev.conllx
def convert(to_convert_files):
    """Write a CoNLL-X version of every treebank in *to_convert_files*.

    Args:
        to_convert_files: mapping of language name -> list of treebank file
            paths. All files of one language are read with
            ``udtree.from_files`` and written to a single output file in
            ``<project_base>/UD_<lang>/``.

    The output name is the first input file's base name (up to its first
    ``.``) plus ``.conllx``, or ``.coarse_deprels.conllx`` when the
    module-level ``fine_grained_deprels`` flag is false. Czech with more than
    one input file is written to ``cs-ud-train`` + ending.
    """
    # The ending depends only on the module-level flag, so compute it once.
    file_ending = ".conllx" if fine_grained_deprels else ".coarse_deprels.conllx"

    for lang, files in to_convert_files.items():
        if lang == "Czech" and len(files) > 1:
            # Several input files end up in one output file with a fixed name.
            file_name = "cs-ud-train" + file_ending
        else:
            file_name = files[0].split("/")[-1].split(".")[0] + file_ending
        outfile = join(project_base, "UD_" + lang, file_name)

        trees = udtree.from_files(files)
        # Explicit UTF-8 so non-ASCII word forms do not depend on the
        # platform's default encoding.
        with open(outfile, "w", encoding="utf-8") as w:
            for tree in trees:
                # Clear the DEPS and MISC values before writing.
                for word in tree.sentence_structure:
                    word['deps'] = None
                    word['misc'] = None
                if not any(tree.postags):
                    # Copy CPOSTAG to POSTAG
                    tree.postags = tree.cpostags
                w.write("\n".join(tree.to_conllx_format(fine_grained_deprels=fine_grained_deprels)) + "\n\n")


train_files = lang_utils.get_ud_paths(project_base, type_="train", format_="conllu", coarse=False)
dev_files = lang_utils.get_ud_paths(project_base, type_="dev", format_="conllu", coarse=False)
test_files = lang_utils.get_ud_paths(project_base, type_="test", format_="conllu", coarse=False)

convert(train_files)
convert(dev_files)
convert(test_files)