def convert(book_num):
    """Convert one MorphGNT book to an NLTK tagged corpus file.

    Every row of the book becomes a ``word/tag`` token, where the tag is the
    ``ccat-pos`` value (hyphens stripped from both ends) optionally followed
    by ``-`` and the ``ccat-parse`` value with all hyphens removed.  When the
    last character of the row's ``text`` is in ``punctuation``, a
    ``punct/punct`` token is added as well, and any punctuation other than a
    comma is followed by a newline token.  Tokens are joined with single
    spaces and written to ``sblgnt-corpus/<name>``.

    Args:
        book_num: book number understood by ``pysblgnt``.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('sblgnt-corpus', exist_ok=True)

    # Use the built-in filename function, but omit the leading path
    book_path = pysblgnt.morphgnt_filename(book_num).split("/")[1]
    out_path = 'sblgnt-corpus/' + book_path.rsplit('-', 1)[0]
    print("Converting " + out_path)

    tokens = []
    for line in pysblgnt.morphgnt_rows(book_num):
        pos = line["ccat-pos"].strip('-')
        parse = line["ccat-parse"].replace('-', '')
        tag = pos + '-' + parse if parse else pos
        tokens.append(line["word"] + '/' + tag)

        # Deal with punctuation
        last_char = line["text"][-1]
        if last_char in punctuation:
            tokens.append(last_char + '/' + last_char)
            if last_char != ",":
                tokens.append("\n")

    # The context manager closes the file even if write() raises; the
    # explicit encoding keeps the Greek text independent of the locale.
    with open(out_path, 'w', encoding='utf-8') as out_file:
        out_file.write(' '.join(tokens))
def rows_by_verses_by_chapters_for_book(book_num):
    """Group the rows of a book by chapter and, within a chapter, by verse.

    Chapter and verse numbers are read from characters 2-3 and 4-5 of each
    row's ``"bcv"`` value.  A new group starts whenever the number differs
    from that of the previous row.

    Args:
        book_num: book number passed to ``morphgnt_rows``.

    Returns:
        A list of ``(chapter, verses)`` tuples, where ``chapter`` is a
        ``ref.Chapter`` and ``verses`` is a list of ``(verse, rows)`` tuples
        (``verse`` comes from ``chapter.verse(v)``).  The list is empty when
        the book yields no rows.
    """
    chapters = []
    verses = None  # (chapter, [verse groups]) currently being filled
    rows = None  # (verse, [rows]) currently being filled
    # None sentinels cannot collide with any chapter or verse number.
    last_chapter = None
    last_verse = None

    for row in morphgnt_rows(book_num):
        c = int(row["bcv"][2:4])
        v = int(row["bcv"][4:6])

        if c != last_chapter:
            # Close the previous chapter, including its last open verse.
            if verses is not None:
                verses[1].append(rows)
                chapters.append(verses)
            chapter = ref.Chapter(book_num, c)
            verses = (chapter, [])
            rows = None
            last_chapter = c
            last_verse = None

        if v != last_verse:
            if rows is not None:
                verses[1].append(rows)
            rows = (chapter.verse(v), [])
            last_verse = v

        rows[1].append(row)

    # Flush the final chapter; skipped when no rows were seen at all.
    if verses is not None:
        verses[1].append(rows)
        chapters.append(verses)
    return chapters
def rows_for_verse(verse):
    """Return the rows of the verse's book that belong to that verse.

    ``verse.tup`` is unpacked as ``(book_num, chapter_num, verse_num)``.
    Each row's chapter and verse are parsed from characters 2-3 and 4-5 of
    its ``"bcv"`` value and compared against the requested pair.
    """
    book_num, chapter_num, verse_num = verse.tup
    wanted = (chapter_num, verse_num)
    return [
        entry
        for entry in morphgnt_rows(book_num)
        if (int(entry["bcv"][2:4]), int(entry["bcv"][4:6])) == wanted
    ]
def calc_freqs(data, tokens, stopwords):
    """Create a frequency map out of the new testament.

    freqs
        key: lexical form of word
        value: frequency count

    A fresh ``collections.Counter`` is stored under ``'freqs'`` in
    ``data[0]`` (all books together) and in ``data[1]`` .. ``data[27]`` (one
    per book).  Every row whose lemma is not in ``stopwords`` increments both
    counters.  When a token matches the most recent rows, the lemmas of the
    matched rows are decremented again and ``token.name`` is counted instead.

    Args:
        data: indexable collection; items 0..27 must support item assignment.
        tokens: iterable of objects providing ``max_size()``,
            ``matches(rows)`` and ``name``.  It is iterated more than once.
        stopwords: container of lemmas to ignore entirely.

    Returns:
        None; ``data`` is updated in place.
    """
    row_queue = []
    # default=0 keeps an empty ``tokens`` collection from raising ValueError.
    row_queue_len = max((token.max_size() for token in tokens), default=0)

    all_books = data[0]
    all_books['freqs'] = collections.Counter()

    # NOTE(review): row_queue is not reset between books, so a token could
    # match rows spanning two books -- confirm whether that is intended.
    for i in range(27):
        book = data[i + 1]
        book['freqs'] = collections.Counter()

        for row in pysblgnt.morphgnt_rows(i + 1):
            lex = row['lemma']
            if lex in stopwords:
                continue

            all_books['freqs'][lex] += 1
            book['freqs'][lex] += 1

            # See if we have finished a token
            row_queue.append(row)
            if len(row_queue) > row_queue_len:
                row_queue.pop(0)

            for token in tokens:
                rows = token.matches(row_queue)
                if not rows:
                    continue
                # Replace the individual lemma counts with one token count.
                for match_row in rows:
                    sublex = match_row['lemma']
                    all_books['freqs'][sublex] -= 1
                    book['freqs'][sublex] -= 1
                all_books['freqs'][token.name] += 1
                book['freqs'][token.name] += 1
                break  # only the first matching token is counted
def rows_by_verses_for_chapter(chapter):
    """Group the rows of one chapter by verse.

    ``chapter.tup`` is unpacked as ``(book_num, chapter_num)``.  Rows of that
    book whose chapter number (characters 2-3 of ``"bcv"``) equals
    ``chapter_num`` are grouped by verse number (characters 4-5); a new group
    starts whenever the verse number changes.

    Returns:
        A list of ``(verse, rows)`` tuples, ``verse`` being
        ``chapter.verse(v)``.  The list is empty when no row belongs to the
        chapter.
    """
    book_num, chapter_num = chapter.tup
    verses = []
    current = None  # (verse, [rows]) currently being filled
    last_verse = None  # None cannot collide with any verse number

    for row in morphgnt_rows(book_num):
        c = int(row["bcv"][2:4])
        v = int(row["bcv"][4:6])
        if c != chapter_num:
            continue
        if v != last_verse:
            if current is not None:
                verses.append(current)
            current = (chapter.verse(v), [])
            last_verse = v
        current[1].append(row)

    # Flush the last verse; without this guard a chapter with no rows would
    # produce [None].
    if current is not None:
        verses.append(current)
    return verses
# Inflexion engine built from the stemming and lexicon arguments.
ginflexion = GreekInflexion(args.stemming, args.lexicon)

debug = False

# Counters; only total_count is updated in the code visible here.
incorrect_count = 0
total_count = 0

# Lemmas for which the rest of the loop body is skipped.
IGNORE_LIST = [
    "κουμ",
    "εφφαθα",
    "σαβαχθάνι",
    "θά",
]

for book_num in args.books:
    for row in morphgnt_rows(book_num):
        b, c, v = bcv_tuple(row["bcv"])
        # Only rows whose part-of-speech code is "V-" are processed.
        if row["ccat-pos"] == "V-":
            total_count += 1

            lemma = row["lemma"]
            key = convert_parse(row["ccat-parse"])
            form = row["norm"]

            # need to just do this in MorphGNT itself
            # (for these two parse keys the optional "(ν)" is made a plain "ν")
            if key in ["AAO.3P", "PAO.3P"]:
                form = form.replace("(ν)", "ν")

            # NOTE(review): the code that uses lemma/key/form presumably
            # follows this check but is not visible here.
            if lemma in IGNORE_LIST:
                continue
def get_morphgnt(verses):
    """
    Generate ``(event, payload)`` tuples from MorphGNT for the given verses.

    ``verses`` is a list of verse-ranges; each verse-range is either a single
    verse-id or a ``(start-verse-id, end-verse-id)`` list/tuple.  A verse-id
    is the BBCCVV (book-chapter-verse) code found in the first column of
    MorphGNT, so ``[("012801", "012815")]`` produces Matthew 28:1-15.

    Events emitted: VERSE_RANGE_START / VERSE_RANGE_END (payload
    ``(start, end)``), BOOK_START / BOOK_END / BOOK_END_PARTIAL (book
    number), CHAPTER_START / CHAPTER_END / CHAPTER_END_PARTIAL (chapter
    number), VERSE_START / VERSE_END (verse number) and WORD (the row).
    """
    for item in verses:
        if isinstance(item, (list, tuple)):
            start, end = item
        else:
            start = end = item

        span = (start, end)
        yield ("VERSE_RANGE_START", span)

        start_book, start_chapter, start_verse = bcv_tuple(start)
        end_book, end_chapter, end_verse = bcv_tuple(end)
        first_ref = (start_book, start_chapter, start_verse)

        started = False   # becomes True at the first row of the range
        finished = False  # becomes True at the first row past the range

        for book_num in range(start_book, end_book + 1):
            yield ("BOOK_START", book_num)

            open_chapter = open_verse = None

            for row in morphgnt_rows(book_num):
                b, c, v = bcv_tuple(row["bcv"])

                # Skip everything that precedes the starting verse.
                if not started:
                    if (b, c, v) != first_ref:
                        continue
                    started = True

                # In the final book, stop at the first row beyond the end
                # (a later verse of the end chapter, or a later chapter).
                if b == end_book and (c, v) > (end_chapter, end_verse):
                    finished = True
                    break

                if c != open_chapter:
                    if open_chapter:
                        if open_verse:
                            yield ("VERSE_END", open_verse)
                        yield ("CHAPTER_END", open_chapter)
                    yield ("CHAPTER_START", c)
                    open_chapter, open_verse = c, None

                if v != open_verse:
                    if open_verse:
                        yield ("VERSE_END", open_verse)
                    yield ("VERSE_START", v)
                    open_verse = v

                yield ("WORD", row)

            if finished:
                # The range ended before the book's rows ran out.
                yield ("VERSE_END", open_verse)
                yield ("CHAPTER_END_PARTIAL", open_chapter)
                yield ("BOOK_END_PARTIAL", book_num)
                break

            # The rows ran out: close using the last row's chapter and verse.
            yield ("VERSE_END", v)
            yield ("CHAPTER_END", c)
            yield ("BOOK_END", book_num)

        yield ("VERSE_RANGE_END", span)
"χρή", ] LEXICON_FILE = "lexicons/morphgnt.yaml" ENDINGS_FILE = "stemming.yaml" lexicon = Lexicon(LEXICON_FILE) endings = Endings(ENDINGS_FILE) if __name__ == "__main__": for book_num in range(1, 28): for row in morphgnt_rows(book_num): ccat_pos = row["ccat-pos"] ccat_parse = row["ccat-parse"] form = row["norm"] lemma = row["lemma"] if ccat_pos != "V-": continue if lemma in IGNORE_LIST: continue if ccat_parse[3] == "N": parse = ccat_parse[1:4] elif ccat_parse[3] == "P": parse = ccat_parse[1:4] + "." + ccat_parse[4:7]
from collections import defaultdict

from pysblgnt import morphgnt_rows

from morphgnt_utils import bcv_tuple

# Book whose rows are scanned.
BOOK_NUM = 4


def convert_parse(ccat_parse):
    """Return characters 4-6 of a ccat-parse code.

    NOTE(review): presumably the case/number/gender slots -- confirm against
    the MorphGNT column documentation.
    """
    return ccat_parse[4:7]


# lemma -> parse key -> set of "norm # robinson" strings
NOMINALS = defaultdict(lambda: defaultdict(set))

# Collect rows with part-of-speech "N-" or "A-" from chapter 2, verses up to
# and including 11, of the selected book.
for row in morphgnt_rows(BOOK_NUM):
    b, c, v = bcv_tuple(row["bcv"])
    if c == 2 and v <= 11 and row["ccat-pos"] in ["N-", "A-"]:
        NOMINALS[row["lemma"]][convert_parse(row["ccat-parse"])].add(
            row["norm"] + " # " + row["robinson"])

# Print one "-" entry per lemma, followed by one line per parse key.
for lemma, forms_by_parse in NOMINALS.items():
    print("-")
    print(" lemma: {}".format(lemma))
    print(" test_length: false")
    print()
    for parse, forms in forms_by_parse.items():
        # Sets have no stable iteration order for strings across runs
        # (hash randomisation); sorting makes the output reproducible.
        print(" {}: {}".format(parse, "/".join(sorted(forms))))
    print()
def analyses(book_num):
    """Yield one accentuation analysis dict per MorphGNT row of a book.

    For every row returned by ``morphgnt_rows(book_num)`` -- except those
    whose ``(word, norm)`` pair is listed in ``INCONSISTENCIES`` -- the
    lower-cased ``word`` is compared with the lower-cased ``norm`` and the
    differences are classified.

    Yields:
        dict with the keys ``diff``, ``capitalized``, ``parenthetical``,
        ``proclitic``, ``enclitic``, ``dissyllabic_enclitic``, ``elision``,
        ``movable``, ``final_grave``, ``extra_accent``,
        ``proclitic_extra_accent``, ``enclitic_extra_accent``,
        ``enclitic_lost_accent``, ``pre_esti_exception``,
        ``paroxytone_esti``, ``punc``, ``accent_type``, ``word``, ``norm``
        and ``row``.

    Raises:
        AssertionError: when a row violates one of the data expectations
            checked below.
    """
    for row in morphgnt_rows(book_num):
        text = row["text"]
        # Strip the listed marker, bracket and digit characters from the
        # text.  Raw string: "\[" and "\(" are invalid escapes in a normal
        # string literal and trigger a warning on current Python versions.
        text1 = re.sub(r"[⸀⸂⸁⸄⸃\[\]\(\)⟦⟧12]", "", text)
        text2 = text1.lower()
        word = row["word"]
        norm = row["norm"]

        if (word, norm) in INCONSISTENCIES:
            continue

        # The text starts with a capital although the normalised form
        # does not.
        capitalized = (
            text1[0] == text2[0].upper() and
            norm[0] != norm[0].upper()
        )
        parenthetical = (text[0] == "(")

        word = word.lower()
        norm = norm.lower()

        # Resolve an optional final "(ν)" / "(ς)" in norm according to
        # whether the word actually carries the letter.
        for suffix, letter in (("(ν)", "ν"), ("(ς)", "ς")):
            if norm.endswith(suffix):
                stem = norm[:-3]
                if strip_accents(word) == strip_accents(stem):
                    norm = stem
                elif strip_accents(word) == strip_accents(stem) + letter:
                    norm = stem + letter

        diff = (word != norm)
        elision = (word, norm) in ELISION
        movable = (word, norm) in MOVABLE

        # The word only differs from norm by a grave in place of the acute.
        final_grave = bool(
            has_grave(word) and grave_to_acute(word) == norm
        )

        # word2 is the form whose accent is classified below.  It is the
        # word without its last accent when the word has an extra accent,
        # and the unchanged word otherwise (also when final_grave is set;
        # that case is covered by the "1A" branch further down).
        if count_accents(word) > 1 and strip_last_accent(word) == norm:
            extra_accent = True
            word2 = strip_last_accent(word)
        else:
            extra_accent = False
            word2 = word

        proclitic = norm in PROCLITICS
        proclitic_extra_accent = (
            proclitic and word != norm and strip_accents(word) == norm
        )

        enclitic = norm in ENCLITICS
        enclitic_lost_accent = (
            enclitic and word != norm and strip_last_accent(norm) == word
        )
        dissyllabic_enclitic = enclitic and norm in DISSYLLABIC_ENCLITICS

        pre_esti_exception = word in ["οὐκ", "καὶ", "τοῦτ’", "ἀλλ’", "εἰ"]

        # U+0301 is the combining acute, U+0342 the combining perispomeni.
        decomposed = unicodedata.normalize("NFD", word2)
        if "\u0301" not in decomposed and "\u0342" not in decomposed:
            accent_type = "1A" if final_grave else "--"
        else:
            accent_type2 = get_accent_type(word2)
            accent_type = str(accent_type2[0]) + \
                {"\u0301": "A", "\u0342": "C"}[accent_type2[1]]

        enclitic_extra_accent = False
        paroxytone_esti = False

        if word in ["ἔστι", "ἔστιν"]:
            paroxytone_esti = True
            accent_type = "??"  # temporary: will change below
        elif proclitic_extra_accent:
            accent_type = "##"
        elif norm in UNACCENTED_FOREIGN:
            assert norm == strip_accents(norm)
            accent_type = "UF"
        else:
            if not elision and not movable and not final_grave and \
                    not extra_accent and not enclitic_lost_accent:
                if diff:
                    # Any difference not explained above is expected to be
                    # an enclitic carrying an additional accent.
                    assert enclitic
                    enclitic_extra_accent = True
                elif word in INDEF_ENCLITICS and word != strip_accents(word):
                    enclitic_extra_accent = True

            # NOTE(review): nesting level of this check reconstructed from
            # whitespace-collapsed source -- confirm against the original.
            if norm == strip_accents(norm):
                assert proclitic or enclitic

        # Whatever surrounds the word in the cleaned text is a leading dash
        # or trailing punctuation; only trailing punctuation is recorded.
        punc = None
        if text2 != word:
            if text2[0] == "—":
                assert text2[1:] == word
            elif text2[-2:] in [";—", ".—", ",—"]:
                assert text2[:-2] == word
                punc = text2[-2]
            else:
                assert text2[:-1] == word, (text2, word)
                punc = text2[-1]

        yield {
            "diff": diff,
            "capitalized": capitalized,
            "parenthetical": parenthetical,
            "proclitic": proclitic,
            "enclitic": enclitic,
            "dissyllabic_enclitic": dissyllabic_enclitic,
            "elision": elision,
            "movable": movable,
            "final_grave": final_grave,
            "extra_accent": extra_accent,
            "proclitic_extra_accent": proclitic_extra_accent,
            "enclitic_extra_accent": enclitic_extra_accent,
            "enclitic_lost_accent": enclitic_lost_accent,
            "pre_esti_exception": pre_esti_exception,
            "paroxytone_esti": paroxytone_esti,
            "punc": punc,
            "accent_type": accent_type,
            "word": word,
            "norm": norm,
            "row": row,
        }
def get_morphgnt(verses):
    """
    Walk MorphGNT for the requested verses and yield structural events.

    ``verses`` is a list of verse-ranges.  A verse-range is either one
    verse-id or a list/tuple ``(start-verse-id, end-verse-id)``; a verse-id
    is the BBCCVV (book-chapter-verse) code from the first column of
    MorphGNT.  Example: ``[("012801", "012815")]`` yields Matthew 28:1-15.

    Every yielded item is a ``(name, payload)`` tuple.  Names are
    VERSE_RANGE_START, BOOK_START, CHAPTER_START, VERSE_START, WORD,
    VERSE_END, CHAPTER_END, CHAPTER_END_PARTIAL, BOOK_END, BOOK_END_PARTIAL
    and VERSE_RANGE_END.
    """
    for requested in verses:
        if isinstance(requested, (list, tuple)):
            start, end = requested
        else:
            start = end = requested

        yield ("VERSE_RANGE_START", (start, end))

        start_book, start_chapter, start_verse = bcv_tuple(start)
        end_book, end_chapter, end_verse = bcv_tuple(end)

        in_range = False  # flips to True once the start verse is reached

        for book_num in range(start_book, end_book + 1):
            yield ("BOOK_START", book_num)

            prev_chapter = None
            prev_verse = None

            for row in morphgnt_rows(book_num):
                b, c, v = bcv_tuple(row["bcv"])

                if not in_range:
                    if (b, c, v) == (start_book, start_chapter, start_verse):
                        in_range = True
                    else:
                        continue

                # Leave the loop at the first row past the end of the range.
                if b == end_book:
                    if c > end_chapter:
                        break
                    if c == end_chapter and v > end_verse:
                        break

                if c != prev_chapter:
                    if prev_chapter:
                        if prev_verse:
                            yield ("VERSE_END", prev_verse)
                        yield ("CHAPTER_END", prev_chapter)
                    yield ("CHAPTER_START", c)
                    prev_chapter = c
                    prev_verse = None

                if v != prev_verse:
                    if prev_verse:
                        yield ("VERSE_END", prev_verse)
                    yield ("VERSE_START", v)
                    prev_verse = v

                yield ("WORD", row)
            else:
                # No break: the rows ran out, so close with the last row's
                # chapter and verse and move on to the next book.
                yield ("VERSE_END", v)
                yield ("CHAPTER_END", c)
                yield ("BOOK_END", book_num)
                continue

            # Reached only after a break: the range ended inside this book.
            yield ("VERSE_END", prev_verse)
            yield ("CHAPTER_END_PARTIAL", prev_chapter)
            yield ("BOOK_END_PARTIAL", book_num)
            break

        yield ("VERSE_RANGE_END", (start, end))