#!/usr/bin/env python3 from morphgnt.utils import load_yaml, sorted_items lexemes = load_yaml("lexemes.yaml") # skip these for now until we work out how to handle them SKIP = ["Ἀππίου", "Λιμήν", "Πάγος", "Πόλις", "Ταβέρνη", "Φόρον"] for lexeme, metadata in sorted_items(lexemes): if "full-citation-form" in metadata and lexeme not in SKIP: lexeme = lexeme.split("/")[0] citation_form = metadata["full-citation-form"] print("{}: {}".format(lexeme, citation_form)) for alt in citation_form.split(" / "): components = alt.split(", ") assert len(components) <= 6 if len(components) == 1: assert components[0] == lexeme elif len(components) == 2: assert components[0] == lexeme assert components[1] in ["ὁ", "ἡ", "τό"] elif len(components) == 3: if components[2].startswith(("acc.", "dat.", "pl.")): assert components[0] == lexeme assert components[1] in ["ὁ", "ἡ", "τό", "τά"] else: assert components[0] == lexeme assert components[2] in [ "ὁ", "ἡ", "τό", "ὁ/ἡ", "ὁ/τό", "οἱ", "αἱ", "τά" ]
#!/usr/bin/env python import sys from pyuca import Collator collator = Collator() from morphgnt.utils import load_yaml from morphgnt.utils import nfkc_normalize as n danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml") greenlee = {} with open("../data-cleanup/greenlee-morphology/morphemes-utf8.txt") as f: for line in f: key, value = line.strip().split("\t") greenlee[n(key.decode("utf-8")).split(",")[0]] = { "full-entry": n(key.decode("utf-8")), "components": n(value.decode("utf-8")), } words = [n(word) for word in set(danker.keys()).union(set(greenlee.keys()))] count = 0 for word in sorted(words, key=collator.sort_key): count += 1 print "{}:".format(word.encode("utf-8")) if word in danker: print " danker-full-entry: \"{}\"".format(danker[word]["full-entry"].encode("utf-8")) print " danker-components: \"{}\"".format(danker[word]["components"].encode("utf-8")) if word in greenlee:
#!/usr/bin/env python import sys from morphgnt import filesets from morphgnt.utils import load_yaml, sorted_items lexemes = load_yaml("lexemes.yaml") forms = load_yaml("forms.yaml") fs = filesets.load("filesets.yaml") for row in fs["sblgnt-lexemes"].rows(): lemma = row["lemma"].decode("utf-8") lexeme = lexemes.get(lemma) if lexeme is None: lemma = "{}/{}".format(row["lemma"], row["ccat-pos"].strip("-")).decode("utf-8") lexeme = lexemes.get(lemma) if lexeme: form = row["norm"].decode("utf-8") if isinstance(lexeme["pos"], list): print >> sys.stderr, lexeme if lexeme["pos"] in ["RA", "A", "N", "RR"]: gender = row["ccat-parse"][6] case_number = row["ccat-parse"][4:6] form_list = forms.setdefault(lemma, {}).setdefault(gender, {}).setdefault(case_number, {}).setdefault("forms", []) if {"form": form} not in form_list: form_list.append({"form": form}) elif lexeme["pos"] in ["RP1"]: case_number = row["ccat-parse"][4:6] form_list = forms.setdefault(lemma, {}).setdefault(case_number, {}).setdefault("forms", []) if {"form": form} not in form_list:
#!/usr/bin/env python3
"""Tabulate lexemes that look nominal-indeclinable.

A lexeme is listed when it is already in nominal-indeclinable.txt, or
its Dodson part-of-speech is N-PRI, or its Mounce morph-cat is n-3g(2).
Matched wordset entries are removed from the set as they are printed,
so the final print shows wordset entries that matched no lexeme.
"""

from morphgnt.utils import load_yaml, load_wordset, sorted_items

lexemes = load_yaml("lexemes.yaml")
already = load_wordset("nominal-indeclinable.txt")

ROW_TEMPLATE = "{:20}|{:45}|{:10}|{:10}|{:5}"

for headword, meta in sorted_items(lexemes):
    danker_entry = meta.get("danker-entry", "")
    dodson = meta.get("dodson-pos", "")
    mounce = meta.get("mounce-morphcat", "")
    listed = headword in already
    if not (listed or dodson == "N-PRI" or mounce == "n-3g(2)"):
        continue
    print(ROW_TEMPLATE.format(
        headword,
        danker_entry,
        dodson,
        mounce,
        "yes" if listed else "no",
    ))
    if listed:
        already.remove(headword)

# leftovers: wordset entries that never matched a lexeme
print(already)
#!/usr/bin/env python3 # coding: utf-8 from difflib import ndiff import sys import unicodedata from morphgnt.utils import load_yaml, sorted_items derivation = load_yaml("derivation.yaml") lexemes = load_yaml("lexemes.yaml") def strip_accents(s): return "".join((c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")) def diff(word1, word2): result = "" state = 0 add = "" sub = "" for x in ndiff(strip_accents(lexeme), strip_accents(other)): if state == 0: if x[:2] == " ": result += "." state = 1 elif x[:2] == "- ": sub += x[2:] state = 2 elif x[:2] == "+ ":
#!/usr/bin/env python3 # coding: utf-8 from difflib import ndiff import sys import unicodedata from morphgnt.utils import load_yaml, sorted_items derivation = load_yaml("derivation.yaml") lexemes = load_yaml("lexemes.yaml") def strip_accents(s): return "".join((c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")) def diff(word1, word2): result = "" state = 0 add = "" sub = "" for x in ndiff(strip_accents(lexeme), strip_accents(other)): if state == 0: if x[:2] == " ": result += "." state = 1 elif x[:2] == "- ": sub += x[2:] state = 2
#!/usr/bin/env python3 import sys from morphgnt.utils import load_yaml, sorted_items lexemes = load_yaml("lexemes.yaml") full_citation = load_yaml("../greek-vocab-assessment/headwords.txt") n_missed = [] non_n_found = [] used = [] for lexeme, metadata in sorted_items(lexemes): print("{}:".format(lexeme)) def q(metadata_name): if metadata_name in metadata: print(" {}: {}".format(metadata_name, metadata[metadata_name])) q("pos") if "full-citation-form" in metadata: print(" full-citation-form: {}".format( metadata["full-citation-form"])) else: if lexeme in full_citation: print(" full-citation-form: {}".format(full_citation[lexeme])) used.append(lexeme) if metadata["pos"] != "N": non_n_found.append(lexeme)
#!/usr/bin/env python3 import sys from morphgnt.utils import load_yaml, sorted_items lexemes = load_yaml("lexemes.yaml") full_citation = load_yaml("../greek-vocab-assessment/headwords.txt") n_missed = [] non_n_found = [] used = [] for lexeme, metadata in sorted_items(lexemes): print("{}:".format(lexeme)) def q(metadata_name): if metadata_name in metadata: print( " {}: {}".format( metadata_name, metadata[metadata_name] ) ) q("pos") if "full-citation-form" in metadata: print( " full-citation-form: {}".format( metadata["full-citation-form"]
#!/usr/bin/env python import sys from morphgnt.utils import load_yaml, load_wordset, sorted_items lexemes = load_yaml("lexemes.yaml") danker = load_yaml("../data-cleanup/danker-concise-lexicon/danker_headwords.yaml") missing_danker = load_wordset("missing_danker.txt") problems = [] skipped = 0 for lexeme, metadata in sorted_items(lexemes): print "{}:".format(lexeme.encode("utf-8")) def q(metadata_name): if metadata_name in metadata: print " {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8")) q("pos") q("bdag-headword") if "danker-entry" in metadata: print " {}: {}".format("danker-entry", metadata["danker-entry"].encode("utf-8")) else: if lexeme in missing_danker: skipped += 1 else: if lexeme in danker: entry = danker[lexeme]
#!/usr/bin/env python3 import re import unicodedata from morphgnt.utils import load_yaml, sorted_items from citation_form_data import CITATION_FORMS lexemes = load_yaml("../../lexemes.yaml") ACUTE = u"\u0301" GRAVE = u"\u0300" CIRCUMFLEX = u"\u0342" def strip_accents(w): return "".join( unicodedata.normalize( "NFC", "".join(component for component in unicodedata.normalize("NFD", ch) if component not in [ACUTE, GRAVE, CIRCUMFLEX])) for ch in w) DODSON_OVERRIDES = { "ἀφθορία": "N:F", "δοκιμασία": "N:F", "εἰδέα": "N:F", "οἰκετεία": "N:F", "ὀλιγοπιστία": "N:F",
#!/usr/bin/env python import sys from morphgnt.utils import load_yaml, load_wordset, sorted_items lexemes = load_yaml("lexemes.yaml") danker = load_yaml( "../data-cleanup/danker-concise-lexicon/danker_headwords.yaml") missing_danker = load_wordset("missing_danker.txt") problems = [] skipped = 0 for lexeme, metadata in sorted_items(lexemes): print "{}:".format(lexeme.encode("utf-8")) def q(metadata_name): if metadata_name in metadata: print " {}: {}".format( metadata_name, unicode(metadata[metadata_name]).encode("utf-8")) q("pos") q("bdag-headword") if "danker-entry" in metadata: print " {}: {}".format("danker-entry", metadata["danker-entry"].encode("utf-8")) else: if lexeme in missing_danker: