#!/usr/bin/env python import sys from morphgnt.utils import load_yaml, load_wordset, sorted_items from morphgnt.utils import nfkc_normalize as n lexemes = load_yaml("lexemes.yaml") missing_bdag = load_wordset("missing_bdag.txt") headwords = set() with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f: for line in f: headwords.add(n(line.strip().decode("utf-8"))) existing_not_in_headwords = [] missing_not_in_headwords = [] added = [] for lexeme, metadata in sorted_items(lexemes): print "{}:".format(lexeme.encode("utf-8")) def q(metadata_name): if metadata_name in metadata: print " {}: {}".format( metadata_name, unicode(metadata[metadata_name]).encode("utf-8")) q("pos") if "bdag-headword" in metadata: print " bdag-headword: {}".format(
import sys

from pyuca import Collator

collator = Collator()

from morphgnt.utils import load_yaml
from morphgnt.utils import nfkc_normalize as n

# Danker data loaded from YAML; below it is indexed by word and each entry
# is read for its "full-entry" and "components" values.
danker = load_yaml("../data-cleanup/danker-concise-lexicon/components.yaml")

# Greenlee data: each input line is "<key>\t<value>".  The dictionary key is
# the part of the normalized key column before the first comma.
greenlee = {}
with open("../data-cleanup/greenlee-morphology/morphemes-utf8.txt") as morphemes:
    for row in morphemes:
        key, value = row.strip().split("\t")
        entry = {
            "full-entry": n(key.decode("utf-8")),
            "components": n(value.decode("utf-8")),
        }
        headword = n(key.decode("utf-8")).split(",")[0]
        greenlee[headword] = entry

# Every word that appears in either source, normalized.
words = [n(word) for word in set(danker.keys()) | set(greenlee.keys())]

count = 0

# NOTE: print(...) with one parenthesised argument produces the same output
# as the print statement under Python 2.
for word in sorted(words, key=collator.sort_key):
    count += 1
    print("{}:".format(word.encode("utf-8")))
    if word in danker:
        danker_entry = danker[word]
        print(' danker-full-entry: "{}"'.format(danker_entry["full-entry"].encode("utf-8")))
        print(' danker-components: "{}"'.format(danker_entry["components"].encode("utf-8")))
    if word in greenlee:
        print(' greenlee-full-entry: "{}"'.format(greenlee[word]["full-entry"].encode("utf-8")))
from collections import defaultdict import sys from morphgnt.utils import load_yaml, load_wordset, sorted_items from morphgnt.utils import nfkc_normalize as n lexemes = load_yaml("lexemes.yaml") missing_dodson = load_wordset("missing_dodson.txt") dodson = defaultdict(list) with open("../data-cleanup/dodson-lexicon/dodson_lexicon.txt") as f: for line in f: strongs, gk, pos, greek, short_gloss, long_gloss = line.strip().decode( "utf-8").split("\t") head_word = n(greek.split(",")[0]) dodson[head_word].append({ "strongs": strongs, "gk": gk, "pos": pos, "greek": n(greek), "short-gloss": short_gloss, "long-gloss": long_gloss }) not_in_dodson = set() for lexeme, metadata in sorted_items(lexemes): print "{}:".format(lexeme.encode("utf-8")) def q(metadata_name): if metadata_name in metadata:
from morphgnt.utils import load_yaml, sorted_items, load_wordset
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_mounce = load_wordset("missing_mounce.txt")

problems = []
skipped = 0

# Each input line is "<gk>:<greek>:<morphcat>".  Entries are grouped under
# the integer that precedes any "?" in the first field; the stored values are
# the normalized second fields.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as morphcat_file:
    for row in morphcat_file:
        gk, greek, morphcat = row.strip().decode("utf-8").split(":")
        gk_number = int(gk.split("?")[0])
        mounce[gk_number].append(n(greek))

# NOTE: print(...) with one parenthesised argument produces the same output
# as the print statement under Python 2.
for lexeme, metadata in sorted_items(lexemes):
    print("{}:".format(lexeme.encode("utf-8")))

    def q(metadata_name):
        # Echo one metadata field of the current lexeme if it is present.
        # Returns True when the field was printed, None otherwise.
        if metadata_name not in metadata:
            return None
        print(" {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8")))
        return True

    for existing_field in ("pos", "bdag-headword", "danker-entry", "dodson-entry"):
        q(existing_field)
#!/usr/bin/env python import sys from morphgnt.utils import load_yaml, load_wordset, sorted_items from morphgnt.utils import nfkc_normalize as n lexemes = load_yaml("lexemes.yaml") missing_bdag = load_wordset("missing_bdag.txt") headwords = set() with open("../data-cleanup/bdag-headwords/bdag_headwords.txt") as f: for line in f: headwords.add(n(line.strip().decode("utf-8"))) existing_not_in_headwords = [] missing_not_in_headwords = [] added = [] for lexeme, metadata in sorted_items(lexemes): print "{}:".format(lexeme.encode("utf-8")) def q(metadata_name): if metadata_name in metadata: print " {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8")) q("pos") if "bdag-headword" in metadata: print " bdag-headword: {}".format(metadata["bdag-headword"].encode("utf-8")) if metadata["bdag-headword"] not in headwords:
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_dodson = load_wordset("missing_dodson.txt")

# Each input line has six tab-separated fields.  Entries are grouped under
# the normalized text that precedes the first comma of the "greek" field.
dodson = defaultdict(list)
with open("../data-cleanup/dodson-lexicon/dodson_lexicon.txt") as lexicon:
    for row in lexicon:
        fields = row.strip().decode("utf-8").split("\t")
        strongs, gk, pos, greek, short_gloss, long_gloss = fields
        head_word = n(greek.split(",")[0])
        dodson[head_word].append({
            "strongs": strongs,
            "gk": gk,
            "pos": pos,
            "greek": n(greek),
            "short-gloss": short_gloss,
            "long-gloss": long_gloss,
        })

not_in_dodson = set()

# NOTE: print(...) with one parenthesised argument produces the same output
# as the print statement under Python 2.
for lexeme, metadata in sorted_items(lexemes):
    print("{}:".format(lexeme.encode("utf-8")))
import sys from morphgnt.utils import load_yaml, sorted_items, load_wordset from morphgnt.utils import nfkc_normalize as n lexemes = load_yaml("lexemes.yaml") missing_mounce = load_wordset("missing_mounce.txt") problems = [] skipped = 0 mounce = defaultdict(list) with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as f: for line in f: gk, greek, morphcat = line.strip().decode("utf-8").split(":") mounce[int(gk.split("?")[0])].append(n(greek)) for lexeme, metadata in sorted_items(lexemes): print "{}:".format(lexeme.encode("utf-8")) def q(metadata_name): if metadata_name in metadata: print " {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8")) return True q("pos") q("bdag-headword") q("danker-entry") q("dodson-entry") if not q("mounce-headword"):
#!/usr/bin/env python

from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_morphcat = load_wordset("missing_morphcat.txt")

# Each input line is "<gk>:<greek>:<morphcat>".  Entries are grouped under
# the normalized second field and keep the first and third fields as read.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as morphcat_file:
    for row in morphcat_file:
        gk, greek, morphcat = row.strip().decode("utf-8").split(":")
        greek = n(greek)
        mounce[greek].append({"gk": gk, "morphcat": morphcat})

problems = []
skipped = 0

# NOTE: print(...) with one parenthesised argument produces the same output
# as the print statement under Python 2.
for lexeme, metadata in sorted_items(lexemes):
    print("{}:".format(lexeme.encode("utf-8")))

    def q(metadata_name):
        # Echo one metadata field of the current lexeme if it is present.
        # Returns True when the field was printed, None otherwise.
        if metadata_name not in metadata:
            return None
        print(" {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8")))
        return True
from collections import defaultdict
import sys

from morphgnt.utils import load_yaml, load_wordset, sorted_items
from morphgnt.utils import nfkc_normalize as n

lexemes = load_yaml("lexemes.yaml")
missing_morphcat = load_wordset("missing_morphcat.txt")

# Each input line is "<gk>:<greek>:<morphcat>".  Entries are grouped under
# the normalized second field and keep the first and third fields as read.
mounce = defaultdict(list)
with open("../data-cleanup/mounce-morphcat/mounce-tauber-morphcat-utf8.txt") as morphcat_file:
    for row in morphcat_file:
        gk, greek, morphcat = row.strip().decode("utf-8").split(":")
        greek = n(greek)
        mounce[greek].append({"gk": gk, "morphcat": morphcat})

problems = []
skipped = 0

# NOTE: print(...) with one parenthesised argument produces the same output
# as the print statement under Python 2.
for lexeme, metadata in sorted_items(lexemes):
    print("{}:".format(lexeme.encode("utf-8")))

    def q(metadata_name):
        # Echo one metadata field of the current lexeme if it is present.
        # The if-block is kept as the last visible construct because the
        # definition may continue beyond this fragment.
        if metadata_name in metadata:
            print(" {}: {}".format(metadata_name, unicode(metadata[metadata_name]).encode("utf-8")))