# Example 1
from flexpy.Corpus import Corpus
from flexpy.tags.RtLexEntry import RtLexEntry
from flexpy.LexEntry import LexEntry
from flexpy.FlexPyUtil import get_tone_letters_from_string

# Location and name of the FieldWorks (FLEx) project to analyze.
# NOTE(review): hard-coded to a local machine path — adjust per installation.
project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "IxpantepecMixtec"

# Load the corpus with punctuation tokens excluded, so iteration below
# sees only lexical material.
corpus = Corpus(project_dir, project_name, include_punctuation=False)
tag_dict = corpus.tag_dict

# desired result:
# list of lexical items, each place it occurs in 1st position
# and each place it occurs in 2nd position
# show baseline tone pattern and perturbed tone pattern of both
# look at this big list to see what jumps out

# Accumulator keyed (presumably) by LexEntry GUID — populated in the loop
# body below, which continues past this excerpt.
by_lex_guid = {}

for text in corpus.texts:
    # print(f"current text is {text}")

    # iterate over MORPHEMES in the text (not across paragraph boundaries)
    # know the LexEntry that each of them belongs to
    # Texts without paragraph content are skipped rather than crashing.
    if text.paragraphs is None:
        # print(f"skipping text {text} due to lack of paragraphs")
        continue
    for pg in text.paragraphs:
        # print("\n---- new paragraph ----\n")
        # print(f"current paragraph is: {pg}")
        # print(f"run texts is: {pg.run_texts}")
# Example 2
        "Yangulam",
    ]
    print("found {} languages".format(len(language_names)))
    # Paths to the local FLEx projects and to the cached TSV wordlists.
    # NOTE(review): hard-coded to a local machine — adjust per installation.
    project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
    lexicons = {}
    wordlist_dir = "/home/wesley/flexpy/flexpy/language_data/MadangWordlists/"
    # Load (or build, then load) one lexicon per language name.
    for ln in language_names:
        print("-- loading language {}".format(ln))
        filename = "{}.tsv".format(ln)
        fp = os.path.join(wordlist_dir, filename)
        if os.path.exists(fp):
            # Cached wordlist already on disk: load it directly.
            lexicon = load_lexicon_from_tsv(fp, ln)
            lexicons[ln] = lexicon
        else:
            # No cached TSV yet: export one from the FLEx corpus, then load
            # it back so both branches produce the same Lexicon object type.
            print("creating tsv for lexicon of {}".format(ln))
            corpus = Corpus(project_dir, ln, include_punctuation=False)
            write_lexicon_tsv(corpus, fp)
            assert os.path.exists(fp)  # should be there now that we wrote it
            lexicon = load_lexicon_from_tsv(fp, ln)
            lexicons[ln] = lexicon

        print("{} has {} lexeme entries".format(ln, len(lexicon.lexemes)))

    test_show_similarity(lexicons)
    # Pick two distinct lexicons at random for the pairwise comparison below.
    lexicon1, lexicon2 = random.sample(list(lexicons.values()), 2)
    glosses = [
        "man",
        "woman",
        "canoe",
        "sun",
        "red",