def convert(book_num):
    """Convert one MorphGNT book to an NLTK tagged corpus file.

    Every row from ``pysblgnt.morphgnt_rows(book_num)`` becomes a
    ``word/tag`` token.  The tag is the row's ``ccat-pos`` value with
    leading/trailing hyphens stripped; when the ``ccat-parse`` value is
    non-empty after removing all hyphens, ``-`` plus that value is appended.

    When the last character of the row's ``text`` is in ``punctuation`` it is
    emitted as its own ``mark/mark`` token, and every such mark except a
    comma is followed by a newline token.

    The tokens are joined with single spaces and written, UTF-8 encoded, to
    ``sblgnt-corpus/<name>``, where ``<name>`` is the second ``/``-separated
    component of ``pysblgnt.morphgnt_filename(book_num)`` with everything
    from its last ``-`` onwards removed.  The output directory is created
    when missing.

    Args:
        book_num: book number handed to the ``pysblgnt`` helpers.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('sblgnt-corpus', exist_ok=True)

    # Use the built-in filename function, but omit the leading path
    book_path = pysblgnt.morphgnt_filename(book_num).split("/")[1]
    out_path = 'sblgnt-corpus/' + book_path.rsplit('-', 1)[0]
    print("Converting " + out_path)
    tokens = []

    for line in pysblgnt.morphgnt_rows(book_num):
        pos = line["ccat-pos"].strip('-')
        parse = line["ccat-parse"].replace('-', '')
        tag = pos + '-' + parse if parse else pos
        tokens.append(line["word"] + '/' + tag)

        # Deal with punctuation
        last_char = line["text"][-1]
        if last_char in punctuation:
            tokens.append(last_char + '/' + last_char)
            # Anything other than a comma ends the current output line.
            if last_char != ",":
                tokens.append("\n")

    # The corpus is Greek text: write it as UTF-8 regardless of the locale
    # default, and let the context manager close the file even on error.
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.write(' '.join(tokens))
def convert(book_num):
    """Convert one MorphGNT book to an NLTK tagged corpus file.

    Every row from ``pysblgnt.morphgnt_rows(book_num)`` becomes a
    ``word/tag`` token.  The tag is the row's ``ccat-pos`` value with
    leading/trailing hyphens stripped; when the ``ccat-parse`` value is
    non-empty after removing all hyphens, ``-`` plus that value is appended.

    When the last character of the row's ``text`` is in ``punctuation`` it is
    emitted as its own ``mark/mark`` token, and every such mark except a
    comma is followed by a newline token.

    The tokens are joined with single spaces and written, UTF-8 encoded, to
    ``sblgnt-corpus/<name>``, where ``<name>`` is the second ``/``-separated
    component of ``pysblgnt.morphgnt_filename(book_num)`` with everything
    from its last ``-`` onwards removed.  The output directory is created
    when missing.

    Args:
        book_num: book number handed to the ``pysblgnt`` helpers.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('sblgnt-corpus', exist_ok=True)

    # Use the built-in filename function, but omit the leading path
    book_path = pysblgnt.morphgnt_filename(book_num).split("/")[1]
    out_path = 'sblgnt-corpus/' + book_path.rsplit('-', 1)[0]
    print("Converting " + out_path)
    tokens = []

    for line in pysblgnt.morphgnt_rows(book_num):
        pos = line["ccat-pos"].strip('-')
        parse = line["ccat-parse"].replace('-', '')
        tag = pos + '-' + parse if parse else pos
        tokens.append(line["word"] + '/' + tag)

        # Deal with punctuation
        last_char = line["text"][-1]
        if last_char in punctuation:
            tokens.append(last_char + '/' + last_char)
            # Anything other than a comma ends the current output line.
            if last_char != ",":
                tokens.append("\n")

    # The corpus is Greek text: write it as UTF-8 regardless of the locale
    # default, and let the context manager close the file even on error.
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.write(' '.join(tokens))
Example #3
0
def rows_by_verses_by_chapters_for_book(book_num):
    """Group the rows of a book by chapter and, within a chapter, by verse.

    The chapter and verse numbers of a row are parsed from characters 2-3
    and 4-5 (0-based) of ``row["bcv"]``.  A new group is started whenever the
    number differs from that of the previous row.

    Args:
        book_num: book number handed to ``morphgnt_rows``.

    Returns:
        A list of ``(chapter, verses)`` tuples in the order encountered,
        where ``chapter`` is ``ref.Chapter(book_num, c)`` and ``verses`` is a
        list of ``(verse, rows)`` tuples with ``verse`` taken from
        ``chapter.verse(v)`` and ``rows`` the list of rows in that verse.
        An empty list is returned when the book yields no rows.
    """
    chapters = []
    chapter = None
    verses = None  # (chapter, [verse groups]) currently being filled
    rows = None    # (verse, [rows]) currently being filled

    # None (rather than 0) as the "nothing seen yet" marker, so the very
    # first row always opens a chapter and a verse whatever its numbers are.
    last_chapter = None
    last_verse = None

    for row in morphgnt_rows(book_num):
        c = int(row["bcv"][2:4])
        v = int(row["bcv"][4:6])

        if c != last_chapter:
            # Flush the chapter that just ended, including its last verse.
            if verses is not None:
                verses[1].append(rows)
                chapters.append(verses)

            chapter = ref.Chapter(book_num, c)
            verses = (chapter, [])
            rows = None
            last_chapter = c
            last_verse = None

        if v != last_verse:
            if rows is not None:
                verses[1].append(rows)
            rows = (chapter.verse(v), [])
            last_verse = v

        rows[1].append(row)

    # Flush the final chapter; skipped when there were no rows at all
    # (previously this dereferenced None and raised TypeError).
    if verses is not None:
        verses[1].append(rows)
        chapters.append(verses)

    return chapters
Example #4
0
def rows_for_verse(verse):
    """Return every row of the verse's book that belongs to that verse.

    ``verse.tup`` supplies ``(book_num, chapter_num, verse_num)``.  A row
    matches when the integers parsed from characters 2-3 and 4-5 (0-based)
    of its ``"bcv"`` value equal the chapter and verse numbers.
    """
    book_num, chapter_num, verse_num = verse.tup
    wanted = (chapter_num, verse_num)

    return [
        entry
        for entry in morphgnt_rows(book_num)
        if (int(entry["bcv"][2:4]), int(entry["bcv"][4:6])) == wanted
    ]
Example #5
0
def calc_freqs(data, tokens, stopwords):
    """Create a frequency map out of the new testament

    freqs
        key: lexical form of word
        value: frequency count

    A fresh ``collections.Counter`` is stored under ``'freqs'`` in
    ``data[0]`` (all books together) and in ``data[1]`` .. ``data[27]`` (one
    per book).  Each row's ``'lemma'`` is counted in both, except rows whose
    lemma is in ``stopwords``, which are skipped entirely.

    The most recent counted rows are kept in a sliding window whose length
    is the largest ``token.max_size()``.  After each row, the first token
    for which ``token.matches(window)`` returns rows is credited: the lemma
    count of every returned row is decremented and ``token.name`` is
    incremented, so the token replaces its component words.  The window is
    not reset between books.

    Args:
        data: indexable container of dicts, mutated in place.
        tokens: iterable of objects providing ``max_size()``,
            ``matches(rows)`` and ``name``; may be empty.
        stopwords: container of lemmas to ignore.

    Returns:
        None.
    """
    row_queue = []
    # default=0 lets the function run with no tokens at all (plain word
    # counts) instead of max() raising ValueError on an empty sequence.
    row_queue_len = max((token.max_size() for token in tokens), default=0)

    all_books = data[0]
    all_books['freqs'] = collections.Counter()

    for book_num in range(1, 28):
        book = data[book_num]
        book['freqs'] = collections.Counter()

        for row in pysblgnt.morphgnt_rows(book_num):
            lex = row['lemma']
            if lex in stopwords:
                continue
            all_books['freqs'][lex] += 1
            book['freqs'][lex] += 1

            # See if we have finished a token
            row_queue.append(row)
            if len(row_queue) > row_queue_len:
                row_queue.pop(0)

            for token in tokens:
                rows = token.matches(row_queue)
                if not rows:
                    continue
                # Swap the component lemmas for the token itself.
                for match_row in rows:
                    sublex = match_row['lemma']
                    all_books['freqs'][sublex] -= 1
                    book['freqs'][sublex] -= 1
                all_books['freqs'][token.name] += 1
                book['freqs'][token.name] += 1
                # Only the first matching token is credited per row.
                break
Example #6
0
def rows_by_verses_for_chapter(chapter):
    """Group the rows of one chapter by verse.

    ``chapter.tup`` supplies ``(book_num, chapter_num)``.  The chapter and
    verse numbers of a row are parsed from characters 2-3 and 4-5 (0-based)
    of ``row["bcv"]``; rows of other chapters are ignored.

    Returns:
        A list of ``(verse, rows)`` tuples in the order encountered, where
        ``verse`` is ``chapter.verse(v)`` and ``rows`` is the list of rows in
        that verse.  An empty list is returned when no row belongs to the
        chapter.
    """
    book_num, chapter_num = chapter.tup

    verses = []
    rows = None        # (verse, [rows]) currently being filled
    last_verse = None  # None so the first matching row always opens a verse

    for row in morphgnt_rows(book_num):
        c = int(row["bcv"][2:4])
        v = int(row["bcv"][4:6])

        if c != chapter_num:
            continue

        if v != last_verse:
            if rows is not None:
                verses.append(rows)
            rows = (chapter.verse(v), [])
            last_verse = v

        rows[1].append(row)

    # Flush the last verse; skipped when nothing matched so the result is
    # [] rather than [None].
    if rows is not None:
        verses.append(rows)

    return verses
Example #7
0
# Script fragment: walks the verb rows of the selected books.
# NOTE(review): `args`, `GreekInflexion`, `morphgnt_rows`, `bcv_tuple` and
# `convert_parse` are defined outside this excerpt, and the loop body below
# appears to be cut off after the IGNORE_LIST check.
ginflexion = GreekInflexion(args.stemming, args.lexicon)

debug = False

# Running tallies; only total_count is updated in the visible part of the loop.
incorrect_count = 0
total_count = 0

# Lemmas whose verb rows are skipped by the loop below.
IGNORE_LIST = [
    "κουμ",
    "εφφαθα",
    "σαβαχθάνι",
    "θά",
]

for book_num in args.books:
    for row in morphgnt_rows(book_num):
        b, c, v = bcv_tuple(row["bcv"])
        # Only rows tagged "V-" in the ccat-pos column are considered.
        if row["ccat-pos"] == "V-":
            # Counted before the IGNORE_LIST check, so ignored lemmas are
            # still included in total_count.
            total_count += 1

            lemma = row["lemma"]
            key = convert_parse(row["ccat-parse"])
            form = row["norm"]

            # need to just do this in MorphGNT itself
            # For these two parse keys a "(ν)" in the normalised form is
            # replaced by a plain "ν".
            if key in ["AAO.3P", "PAO.3P"]:
                form = form.replace("(ν)", "ν")

            if lemma in IGNORE_LIST:
                continue
Example #8
0
def get_morphgnt(verses):
    """
    yield entries from MorphGNT for the given verses.

    verses is a list of verse-ranges where a verse-range is either a single
    verse-id or a tuple (start-verse-id, end-verse-id). A verse-id is the
    BBCCVV (book-chapter-verse) code used in the first column of MorphGNT.

    e.g. [("012801", "012815")] will yield Matthew 28:1-15.

    Each entry is a ``(kind, payload)`` tuple.  Per verse-range the stream
    is bracketed by ``VERSE_RANGE_START`` / ``VERSE_RANGE_END`` (payload
    ``(start, end)``).  Inside it, every book from the start book to the end
    book gets ``BOOK_START``, then ``CHAPTER_START`` / ``VERSE_START`` /
    ``WORD`` / ``VERSE_END`` / ``CHAPTER_END`` events for the rows in range
    (payloads are the book, chapter or verse number, or the row for
    ``WORD``), then ``BOOK_END``.  When iteration stops at a row past the
    end verse, the book is closed with ``VERSE_END``,
    ``CHAPTER_END_PARTIAL`` and ``BOOK_END_PARTIAL`` instead.

    A verse or chapter end event is only emitted for a verse or chapter that
    was actually started, so a book that contributes no rows (start verse
    not found, or no rows at all) yields just ``BOOK_START`` and the
    matching book end event.
    """
    for verse_range in verses:
        if isinstance(verse_range, (list, tuple)):
            start, end = verse_range
        else:
            start = end = verse_range

        yield ("VERSE_RANGE_START", (start, end))

        start_book, start_chapter, start_verse = bcv_tuple(start)
        end_book, end_chapter, end_verse = bcv_tuple(end)

        state = 0  # 0 = not started, 1 = in progress, 2 = ended

        for book_num in range(start_book, end_book + 1):

            yield ("BOOK_START", book_num)

            # Last chapter / verse for which a START event was emitted in
            # this book; None until the first in-range row is reached.
            prev_chapter = prev_verse = None

            for row in morphgnt_rows(book_num):
                b, c, v = bcv_tuple(row["bcv"])
                if state == 0:
                    # Skip rows until the exact start verse shows up.
                    if (start_book, start_chapter, start_verse) == (b, c, v):
                        state = 1
                    else:
                        continue

                # Past the end verse within the end chapter.
                if (end_book, end_chapter) == (b, c) and end_verse < v:
                    state = 2
                    break

                # Past the end chapter within the end book.
                if end_book == b and end_chapter < c:
                    state = 2
                    break

                if c != prev_chapter:
                    if prev_chapter:
                        if prev_verse:
                            yield ("VERSE_END", prev_verse)
                        yield ("CHAPTER_END", prev_chapter)
                    yield ("CHAPTER_START", c)
                    prev_chapter = c
                    prev_verse = None

                if v != prev_verse:
                    if prev_verse:
                        yield ("VERSE_END", prev_verse)
                    yield ("VERSE_START", v)
                    prev_verse = v

                yield ("WORD", row)

            if state == 2:
                # Stopped mid-book: close whatever is open as partial.
                if prev_verse is not None:
                    yield ("VERSE_END", prev_verse)
                if prev_chapter is not None:
                    yield ("CHAPTER_END_PARTIAL", prev_chapter)
                yield ("BOOK_END_PARTIAL", book_num)
                break

            # Ran off the end of the book.  Close using the tracked values
            # rather than the loop variables v / c, which are unbound when
            # the book produced no rows and stale when nothing was started.
            if prev_verse is not None:
                yield ("VERSE_END", prev_verse)
            if prev_chapter is not None:
                yield ("CHAPTER_END", prev_chapter)
            yield ("BOOK_END", book_num)

        yield ("VERSE_RANGE_END", (start, end))
    "χρή",
]


# Script fragment: derives a parse key for verb rows of books 1-27.
# NOTE(review): `Lexicon`, `Endings`, `morphgnt_rows` and `IGNORE_LIST` are
# defined outside this excerpt, and the if/elif chain at the end appears to
# be cut off.

# Paths of the YAML inputs loaded below.
LEXICON_FILE = "lexicons/morphgnt.yaml"
ENDINGS_FILE = "stemming.yaml"


# Loaded at import time, i.e. also when this module is imported rather than
# run as a script.
lexicon = Lexicon(LEXICON_FILE)
endings = Endings(ENDINGS_FILE)


if __name__ == "__main__":

    for book_num in range(1, 28):
        for row in morphgnt_rows(book_num):
            ccat_pos = row["ccat-pos"]
            ccat_parse = row["ccat-parse"]
            form = row["norm"]
            lemma = row["lemma"]

            # Only rows tagged "V-" are processed.
            if ccat_pos != "V-":
                continue

            if lemma in IGNORE_LIST:
                continue

            # The character at index 3 of the parse code selects the key
            # layout (presumably the mood column, "N" = infinitive and
            # "P" = participle — confirm against the MorphGNT docs).
            if ccat_parse[3] == "N":
                # Key is characters 1-3 of the parse code.
                parse = ccat_parse[1:4]
            elif ccat_parse[3] == "P":
                # Key is characters 1-3, a dot, then characters 4-6.
                parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
Example #10
0
from collections import defaultdict

from pysblgnt import morphgnt_rows

from morphgnt_utils import bcv_tuple

# Book number handed to morphgnt_rows() by the extraction loop in this script.
# NOTE(review): 4 is presumably John in MorphGNT numbering — confirm.
BOOK_NUM = 4


def convert_parse(ccat_parse):
    """Return characters 4 through 6 (0-based) of a ccat-parse code.

    Shorter inputs yield whatever part of that window exists, possibly the
    empty string.
    """
    window = slice(4, 7)
    return ccat_parse[window]


# lemma -> parse key (from convert_parse) -> set of "norm  # robinson" strings
NOMINALS = defaultdict(lambda: defaultdict(set))

# Collect rows tagged "N-" or "A-" from chapter 2, verses up to 11, of the
# selected book.
for row in morphgnt_rows(BOOK_NUM):
    b, c, v = bcv_tuple(row["bcv"])
    if c == 2 and v <= 11:
        if row["ccat-pos"] in ["N-", "A-"]:
            NOMINALS[row["lemma"]][convert_parse(row["ccat-parse"])].add(
                row["norm"] + "  # " + row["robinson"])

# Print one "-" list entry per lemma with its forms keyed by parse.
for lemma in NOMINALS:
    print("-")
    print("    lemma: {}".format(lemma))
    print("    test_length: false")
    print()
    for parse, forms in NOMINALS[lemma].items():
        # Sort the set so the output is identical from run to run; plain
        # set iteration order of strings changes with hash randomisation.
        print("    {}: {}".format(parse, "/".join(sorted(forms))))
    print()
def analyses(book_num):
    """Yield one accentuation analysis dict per row of a book.

    For every row from ``morphgnt_rows(book_num)`` the ``text``, ``word`` and
    ``norm`` columns are compared to classify how the written word differs
    from its normalised form.  Rows whose ``(word, norm)`` pair is in
    ``INCONSISTENCIES`` are skipped.

    Each yielded dict has these keys:

    * ``diff`` -- lower-cased word differs from the (adjusted) norm.
    * ``capitalized`` -- cleaned text starts upper-case while norm does not.
    * ``parenthetical`` -- raw text starts with ``(``.
    * ``proclitic`` / ``enclitic`` / ``dissyllabic_enclitic`` -- norm is in
      ``PROCLITICS`` / ``ENCLITICS`` / (enclitic and) ``DISSYLLABIC_ENCLITICS``.
    * ``elision`` / ``movable`` -- ``(word, norm)`` is in ``ELISION`` /
      ``MOVABLE``.
    * ``final_grave`` -- word has a grave and equals norm once the grave is
      turned into an acute.
    * ``extra_accent`` -- word has more than one accent and equals norm once
      the last accent is stripped.
    * ``proclitic_extra_accent`` / ``enclitic_extra_accent`` /
      ``enclitic_lost_accent`` -- accent differences explained by clisis.
    * ``pre_esti_exception`` -- word is one of a fixed list of five words.
    * ``paroxytone_esti`` -- word is ``ἔστι`` or ``ἔστιν``.
    * ``punc`` -- trailing punctuation character of the text, or ``None``.
    * ``accent_type`` -- ``"--"``, ``"1A"``, a code built from
      ``get_accent_type``, or one of the markers ``"??"``, ``"##"``, ``"UF"``.
    * ``word`` / ``norm`` -- the lower-cased word and adjusted norm.
    * ``row`` -- the original row.

    The ``assert`` statements are sanity checks on the data; a failure means
    a row does not fit any of the expected patterns.
    """
    # Editorial marks stripped from the text column.  A raw string is used
    # because \[ \] \( \) are invalid escape sequences in a normal literal
    # (a warning on current Pythons); the pattern itself is unchanged.
    markup = re.compile(r"[⸀⸂⸁⸄⸃\[\]\(\)⟦⟧12]")

    for row in morphgnt_rows(book_num):
        text = row["text"]
        text1 = markup.sub("", text)
        text2 = text1.lower()
        word = row["word"]
        norm = row["norm"]

        if (word, norm) in INCONSISTENCIES:
            continue

        capitalized = (
            text1[0] == text2[0].upper()
            and
            norm[0] != norm[0].upper()
        )

        parenthetical = (text[0] == "(")

        word = word.lower()
        norm = norm.lower()

        # Resolve an optional final letter written as "(ν)" or "(ς)" in the
        # norm to whichever variant the actual word uses (accents ignored).
        for letter in ("ν", "ς"):
            if norm.endswith("(" + letter + ")"):
                stem = norm[:-3]
                if strip_accents(word) == strip_accents(stem):
                    norm = stem
                elif strip_accents(word) == strip_accents(stem) + letter:
                    norm = stem + letter

        diff = (word != norm)

        elision = (word, norm) in ELISION
        movable = (word, norm) in MOVABLE

        final_grave = bool(
            has_grave(word) and grave_to_acute(word) == norm
        )

        extra_accent = (
            count_accents(word) > 1 and strip_last_accent(word) == norm
        )

        # word2 is the form whose accent gets classified below.  It is only
        # ever adjusted for an extra accent: the original code also assigned
        # the grave-to-acute form here but then unconditionally overwrote
        # it, and the final-grave case is handled by the "1A" branch below.
        word2 = strip_last_accent(word) if extra_accent else word

        proclitic = norm in PROCLITICS
        proclitic_extra_accent = (
            proclitic and word != norm and strip_accents(word) == norm
        )

        enclitic = norm in ENCLITICS
        enclitic_lost_accent = (
            enclitic and word != norm and strip_last_accent(norm) == word
        )
        dissyllabic_enclitic = enclitic and norm in DISSYLLABIC_ENCLITICS

        pre_esti_exception = word in ["οὐκ", "καὶ", "τοῦτ’", "ἀλλ’", "εἰ"]

        # U+0301 is the combining acute, U+0342 the combining perispomeni.
        decomposed = unicodedata.normalize("NFD", word2)
        if "\u0301" not in decomposed and "\u0342" not in decomposed:
            # Neither accent present: either a final grave or no accent.
            accent_type = "1A" if final_grave else "--"
        else:
            accent_type2 = get_accent_type(word2)
            accent_type = str(accent_type2[0]) + \
                {"\u0301": "A", "\u0342": "C"}[accent_type2[1]]

        enclitic_extra_accent = False
        paroxytone_esti = False
        if word in ["ἔστι", "ἔστιν"]:
            paroxytone_esti = True
            # Placeholder marker; it is yielded as-is by this function.
            accent_type = "??"
        elif proclitic_extra_accent:
            accent_type = "##"
        elif norm in UNACCENTED_FOREIGN:
            assert norm == strip_accents(norm)
            accent_type = "UF"
        else:
            if not elision and not movable and not final_grave and \
                    not extra_accent and not enclitic_lost_accent:
                # No other explanation applies, so any remaining difference
                # must come from an enclitic carrying an extra accent.
                if diff:
                    assert enclitic
                    enclitic_extra_accent = True
                elif word in INDEF_ENCLITICS and word != strip_accents(word):
                    enclitic_extra_accent = True
            if norm == strip_accents(norm):
                assert proclitic or enclitic

        # Whatever surrounds the word in the cleaned text must be a leading
        # dash, a punctuation mark followed by a dash, or one trailing mark.
        punc = None
        if text2 != word:
            if text2[0] == "—":
                assert text2[1:] == word
            elif text2[-2:] in [";—", ".—", ",—"]:
                assert text2[:-2] == word
                punc = text2[-2]
            else:
                assert text2[:-1] == word, (text2, word)
                punc = text2[-1]

        yield {
            "diff": diff,
            "capitalized": capitalized,
            "parenthetical": parenthetical,
            "proclitic": proclitic,
            "enclitic": enclitic,
            "dissyllabic_enclitic": dissyllabic_enclitic,
            "elision": elision,
            "movable": movable,
            "final_grave": final_grave,
            "extra_accent": extra_accent,
            "proclitic_extra_accent": proclitic_extra_accent,
            "enclitic_extra_accent": enclitic_extra_accent,
            "enclitic_lost_accent": enclitic_lost_accent,
            "pre_esti_exception": pre_esti_exception,
            "paroxytone_esti": paroxytone_esti,
            "punc": punc,
            "accent_type": accent_type,
            "word": word,
            "norm": norm,
            "row": row,
        }
Example #12
0
def get_morphgnt(verses):
    """
    yield entries from MorphGNT for the given verses.

    verses is a list of verse-ranges where a verse-range is either a single
    verse-id or a tuple (start-verse-id, end-verse-id). A verse-id is the
    BBCCVV (book-chapter-verse) code used in the first column of MorphGNT.

    e.g. [("012801", "012815")] will yield Matthew 28:1-15.

    Each entry is a ``(kind, payload)`` tuple.  Per verse-range the stream
    is bracketed by ``VERSE_RANGE_START`` / ``VERSE_RANGE_END`` (payload
    ``(start, end)``).  Inside it, every book from the start book to the end
    book gets ``BOOK_START``, then ``CHAPTER_START`` / ``VERSE_START`` /
    ``WORD`` / ``VERSE_END`` / ``CHAPTER_END`` events for the rows in range
    (payloads are the book, chapter or verse number, or the row for
    ``WORD``), then ``BOOK_END``.  When iteration stops at a row past the
    end verse, the book is closed with ``VERSE_END``,
    ``CHAPTER_END_PARTIAL`` and ``BOOK_END_PARTIAL`` instead.

    A verse or chapter end event is only emitted for a verse or chapter that
    was actually started, so a book that contributes no rows (start verse
    not found, or no rows at all) yields just ``BOOK_START`` and the
    matching book end event.
    """
    for verse_range in verses:
        if isinstance(verse_range, (list, tuple)):
            start, end = verse_range
        else:
            start = end = verse_range

        yield ("VERSE_RANGE_START", (start, end))

        start_book, start_chapter, start_verse = bcv_tuple(start)
        end_book, end_chapter, end_verse = bcv_tuple(end)

        state = 0  # 0 = not started, 1 = in progress, 2 = ended

        for book_num in range(start_book, end_book + 1):

            yield ("BOOK_START", book_num)

            # Last chapter / verse for which a START event was emitted in
            # this book; None until the first in-range row is reached.
            prev_chapter = prev_verse = None

            for row in morphgnt_rows(book_num):
                b, c, v = bcv_tuple(row["bcv"])
                if state == 0:
                    # Skip rows until the exact start verse shows up.
                    if (start_book, start_chapter, start_verse) == (b, c, v):
                        state = 1
                    else:
                        continue

                # Past the end verse within the end chapter.
                if (end_book, end_chapter) == (b, c) and end_verse < v:
                    state = 2
                    break

                # Past the end chapter within the end book.
                if end_book == b and end_chapter < c:
                    state = 2
                    break

                if c != prev_chapter:
                    if prev_chapter:
                        if prev_verse:
                            yield ("VERSE_END", prev_verse)
                        yield ("CHAPTER_END", prev_chapter)
                    yield ("CHAPTER_START", c)
                    prev_chapter = c
                    prev_verse = None

                if v != prev_verse:
                    if prev_verse:
                        yield ("VERSE_END", prev_verse)
                    yield ("VERSE_START", v)
                    prev_verse = v

                yield ("WORD", row)

            if state == 2:
                # Stopped mid-book: close whatever is open as partial.
                if prev_verse is not None:
                    yield ("VERSE_END", prev_verse)
                if prev_chapter is not None:
                    yield ("CHAPTER_END_PARTIAL", prev_chapter)
                yield ("BOOK_END_PARTIAL", book_num)
                break

            # Ran off the end of the book.  Close using the tracked values
            # rather than the loop variables v / c, which are unbound when
            # the book produced no rows and stale when nothing was started.
            if prev_verse is not None:
                yield ("VERSE_END", prev_verse)
            if prev_chapter is not None:
                yield ("CHAPTER_END", prev_chapter)
            yield ("BOOK_END", book_num)

        yield ("VERSE_RANGE_END", (start, end))