def get_perakim(type, tag, tag_reg):
    """
    Collect the per-section data of every Mishnah tractate that has a text file.

    For each index in the 'Mishnah' category the Hebrew book title is taken,
    the word u'משנה' is replaced by ``type`` and ``<title>.txt`` is looked up
    in the current directory.  Tractates without such a file are skipped.

    :param type: identifies if this is Mishnah, yachin or boaz (substituted
        into the Hebrew title to build the file name)
    :param tag: the tag to identify the start of a new perek
    :param tag_reg: regular expression for the tag
    :return: a dictionary; keys are the derived Hebrew names, values are
        whatever ``TagTester.grab_by_section()`` returns for that file
    """
    results = {}

    # get a list of all tractates
    for tractate in library.get_indexes_in_category('Mishnah'):
        name = Ref(tractate).he_book().replace(u'משנה', type)
        file_name = u'{}.txt'.format(name)

        # if file doesn't exist, skip
        if not os.path.isfile(file_name):
            continue

        # The context manager closes the file even if TagTester raises.
        with codecs.open(file_name, 'r', 'utf-8') as text_file:
            data_tag = TagTester(tag, text_file, tag_reg, name)
            results[name] = data_tag.grab_by_section()

    # The docstring always promised this dictionary; hand it back.
    return results
def checkDappim(files):
    """
    Report @22 daf markers that do not increase from one to the next.

    For every entry in ``files`` the file ``<entry>.txt`` is opened and given
    to a ``TagTester``; its ``daf_processor()`` result is unpacked into a
    sequence of values and a parallel sequence of the raw tag strings.  A tag
    is flagged when its value is not greater than the previous one (the
    comparison is seeded with 2).  The report is written to
    ``daf_issues.txt``, which is truncated on every call.

    :param files: iterable of file names without the ``.txt`` extension
    """
    def _listing(entries):
        # Each entry keeps its trailing ", " so the report format is unchanged.
        return "".join(
            entry.replace("\n", "").replace("@22", "") + ", "
            for entry in entries)

    reg = u'@22\[?[\u05d0-\u05ea\s"]+\]?'

    with open('daf_issues.txt', 'w') as errors:
        for file in files:
            print(file)
            errors.write("\n" + file + "\n")

            # Close the input as soon as the tags have been extracted.
            with open(file + ".txt") as open_file:
                tt = TagTester("@22", open_file, reg=reg)
                num_array, string_array = tt.daf_processor()

            flagged = []
            prev_value = 2
            for count, this_value in enumerate(num_array):
                if this_value <= prev_value:
                    flagged.append(string_array[count])
                prev_value = this_value

            errors.write("Flagged mistakes: " + "\n")
            errors.write(_listing(flagged).encode('utf-8') + "\n")
            errors.write("All Dappim in this Masechet: " + "\n")
            errors.write(_listing(string_array).encode('utf-8') + "\n")
def compare_mishna_to_yachin(tractate_list):
    """
    Compare the @44 tags of each Mishnah file with the @11 tags of its Yachin file.

    For every tractate the Hebrew title is derived twice, with u'משנה'
    replaced by u'משניות' and by u'יכין', and the two ``.txt`` files are
    opened.  Both ``TagTester`` objects get the same ``segment_tag`` pattern
    and are passed to ``compare_tags_to_comments`` together with the report
    file ``tag_match_up.txt`` (opened in append mode for each tractate).
    If either input file cannot be opened, a 'missing file' line is appended
    and the tractate is skipped.

    :param tractate_list: iterable of tractate names accepted by ``Ref``
    """
    seg_tag = u'@00(?:פרק |פ)([א-ת,"]{1,3})'

    for tractate in tractate_list:
        name = Ref(tractate).he_book()
        m_name = name.replace(u'משנה', u'משניות')
        y_name = name.replace(u'משנה', u'יכין')

        with codecs.open('tag_match_up.txt', 'a', 'utf-8') as output:
            m_file = y_file = None
            try:
                try:
                    m_file = codecs.open(u'{}.txt'.format(m_name), 'r', 'utf-8')
                    y_file = codecs.open(u'{}.txt'.format(y_name), 'r', 'utf-8')
                except IOError:
                    output.write(u'missing file {}\n'.format(name))
                    continue

                m_tag = TagTester(u'@44', m_file, name=m_name)
                y_tag = TagTester(u'@11', y_file, name=y_name)
                m_tag.segment_tag = seg_tag
                y_tag.segment_tag = seg_tag
                compare_tags_to_comments(m_tag, y_tag, output)
            finally:
                # Release whichever inputs were opened, including the case
                # where the first open succeeded and the second one failed.
                for handle in (m_file, y_file):
                    if handle is not None:
                        handle.close()
def check_segments():
    """
    Print the places where @44 segment numbering breaks inside a section.

    Reads the module-level ``filename``.  The @30 headers are collected first,
    then a second ``TagTester`` (on the same file object) is asked repeatedly
    for the @44 captures until it reports ``eof``; each call yields one
    section's worth of titles.  Within a section the gematria value of each
    title is compared with a running counter that starts at 1; on a mismatch
    the header at ``sec_number - 1`` and the expected Hebrew numeral are
    printed and counting resumes from the value that was found.
    """
    mitzvah_reg = u'@30מצוה ([\u05d0-\u05ea"]{1,5})'

    infile = codecs.open(filename, 'r', 'utf-8')
    headers = TagTester(u'@30', infile, mitzvah_reg).grab_each_header()
    tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)')
    segments = []
    while not tester.eof:
        # presumably (stop pattern, capture group) -- confirm against TagTester
        segments.append(tester.grab_each_header(mitzvah_reg, 1))
    infile.close()

    for sec_number, section in enumerate(segments):
        expected = 1
        for raw_title in section:
            value = util.getGematria(raw_title.replace(u'"', u''))
            if value != expected:
                # NOTE(review): for sec_number == 0 this reads headers[-1].
                print(headers[sec_number - 1])
                print(util.numToHeb(expected))
            # Whether or not it matched, the next title should follow `value`.
            expected = value + 1
def checkDappim(files):
    """
    Write a report of @22 daf markers that fail to increase.

    Each entry of ``files`` names a ``<entry>.txt`` file that is parsed by a
    ``TagTester``; ``daf_processor()`` supplies a sequence of values and the
    matching raw tag strings.  A tag whose value is less than or equal to the
    preceding value (starting from 2) is flagged.  For every file the flagged
    tags and the complete tag list are written to ``daf_issues.txt``, which
    is overwritten on each call.

    :param files: iterable of file names without the ``.txt`` extension
    """
    def _cleaned(entries):
        # Strip newlines and the tag itself; every item is followed by ", "
        # (including the last) to keep the existing report layout.
        return "".join(
            item.replace("\n", "").replace("@22", "") + ", "
            for item in entries)

    reg = u'@22\[?[\u05d0-\u05ea\s"]+\]?'

    with open('daf_issues.txt', 'w') as errors:
        for file in files:
            print(file)
            errors.write("\n" + file + "\n")

            # The input handle is closed once the tags have been read.
            with open(file + ".txt") as open_file:
                tt = TagTester("@22", open_file, reg=reg)
                num_array, string_array = tt.daf_processor()

            flagged = []
            prev_value = 2
            for count, this_value in enumerate(num_array):
                if this_value <= prev_value:
                    flagged.append(string_array[count])
                prev_value = this_value

            errors.write("Flagged mistakes: " + "\n")
            errors.write(_cleaned(flagged).encode('utf-8') + "\n")
            errors.write("All Dappim in this Masechet: " + "\n")
            errors.write(_cleaned(string_array).encode('utf-8') + "\n")
def test_insert_chapters(filename, expected):
    """
    Check that a file contains the expected number of u'^@22\\u05d0( |$)' headers.

    :param filename: path of the utf-8 text file to inspect
    :param expected: number of headers ``grab_each_header()`` should return
    :return: True when the counts agree, False otherwise
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        found = TagTester(u'@22', infile, u'^@22\u05d0( |$)').grab_each_header()
        return len(found) == expected
def check_segments():
    """
    Report gaps in the @44 numbering of every section of ``filename``.

    ``filename`` is a module-level name.  One ``TagTester`` gathers the @30
    headers; a second one, built on the same open file, is polled with
    ``grab_each_header`` until its ``eof`` flag is set, producing one list of
    titles per section.  Titles are converted with ``util.getGematria`` and
    checked against a counter starting at 1.  When they disagree, the header
    at index ``sec_number - 1`` and the expected numeral are printed, and the
    counter continues from the value actually found.
    """
    header_pattern = u'@30מצוה ([\u05d0-\u05ea"]{1,5})'
    segment_pattern = u'@44\(([\u05d0-\u05ea]{1,2})\)'

    infile = codecs.open(filename, 'r', 'utf-8')
    headers = TagTester(u'@30', infile, header_pattern).grab_each_header()
    tester = TagTester(u'@44', infile, segment_pattern)
    segments = []
    while not tester.eof:
        one_section = tester.grab_each_header(header_pattern, 1)
        segments.append(one_section)
    infile.close()

    sec_number = 0
    for section in segments:
        index = 1
        for title in section:
            count = util.getGematria(title.replace(u'"', u''))
            if count == index:
                index += 1
                continue
            # NOTE(review): sec_number - 1 is -1 for the first section.
            print(headers[sec_number - 1])
            print(util.numToHeb(index))
            index = count + 1
        sec_number += 1
def checkPerakim():
    """
    Run the perek-order check on the hard-coded file list (currently "Yoma").

    Each file is opened, wrapped in a ``TagTester`` for the @00 tag and
    checked with ``in_order_one_section(1, perek_checker)``.  The file name is
    printed before it is processed.
    """
    files = ["Yoma"]
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'

    for file in files:
        print(file)
        # Context manager so the handle is released after each check.
        with open(file) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            tt.in_order_one_section(1, perek_checker)
def checkPerakim():
    """
    Run the perek-order check on every ``*.txt`` file in the current directory.

    Files whose name contains "intro" are skipped.  Each remaining file is
    printed, opened and handed to a ``TagTester`` for the @00 tag (with
    ``perek_checker`` as its third positional argument), after which
    ``in_order_one_section(1)`` is called.
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'

    for file in glob.glob(u"*.txt"):
        if "intro" in file:
            continue
        print(file)
        # Context manager so one handle is not leaked per file.
        with open(file) as open_file:
            tt = TagTester("@00", open_file, perek_checker, reg=reg)
            tt.in_order_one_section(1)
def checkMishnayot():
    """
    Run the @22 ordering check on every ``*.txt`` file in the current directory.

    Files whose name contains "intro" are skipped.  Each remaining file is
    printed, opened and handed to a ``TagTester`` for the @22 tag (with
    ``perek_checker`` as its third positional argument), after which
    ``in_order_many_sections(end_tag="@00")`` is called.
    """
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'

    for file in glob.glob(u"*.txt"):
        if "intro" in file:
            continue
        print(file)
        # Context manager so one handle is not leaked per file.
        with open(file) as open_file:
            tt = TagTester("@22", open_file, perek_checker, reg=reg)
            tt.in_order_many_sections(end_tag="@00")
def checkPerakim(files):
    """
    Run the perek-order check on each named file and print the outcome.

    "Berakhot" is skipped.  For every other entry ``<entry>.txt`` is opened,
    wrapped in a ``TagTester`` for the @00 tag, and the value returned by
    ``in_order_one_section(1, perek_checker)`` is printed after the file name.

    :param files: iterable of file names without the ``.txt`` extension
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'

    for file in files:
        if file == "Berakhot":
            continue
        print(file)
        # Context manager so the handle is released after each check.
        with open(file + ".txt") as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            result = tt.in_order_one_section(1, perek_checker)
        print(result)
def checkPerakim():
    """
    Check perek order in every ``*.txt`` file; break into pdb on failure.

    Files whose name contains "intro" are skipped.  For each remaining file a
    ``TagTester`` for the @00 tag runs ``in_order_one_section(1)``.  When the
    first element of the result is not "SUCCESS" the debugger is started so
    the file can be inspected interactively.
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'

    for file in glob.glob(u"*.txt"):
        if "intro" in file:
            continue
        print(file)
        # The handle stays open while debugging and is closed afterwards.
        with open(file) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            result = tt.in_order_one_section(1)
            if result[0] != "SUCCESS":
                pdb.set_trace()
def checkPerakim(files):
    """
    Print the perek-order check result for each named file.

    Entries equal to "Berakhot" are skipped.  Every other entry is printed,
    ``<entry>.txt`` is opened and given to a ``TagTester`` for the @00 tag,
    and the return value of ``in_order_one_section(1, perek_checker)`` is
    printed.

    :param files: iterable of file names without the ``.txt`` extension
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'

    for file in files:
        if file == "Berakhot":
            continue
        print(file)
        # Close each input once its check has run instead of leaking it.
        with open(file + ".txt") as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            result = tt.in_order_one_section(1, perek_checker)
        print(result)
def checkMishnayot():
    """
    Check @22 ordering in every ``*.txt`` file; break into pdb on failure.

    Files whose name contains "intro" are skipped.  For each remaining file a
    ``TagTester`` for the @22 tag runs
    ``in_order_many_sections(end_tag="@00")``.  When the first element of the
    result is not "SUCCESS" the debugger is started.
    """
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'

    for file in glob.glob(u"*.txt"):
        if "intro" in file:
            continue
        print(file)
        # The handle stays open while debugging and is closed afterwards.
        with open(file) as open_file:
            tt = TagTester("@22", open_file, reg=reg)
            result = tt.in_order_many_sections(end_tag="@00")
            if result[0] != "SUCCESS":
                pdb.set_trace()
def check_chapters():
    """
    Sort the cards into those whose chapter count matches the Mishnah and the rest.

    For each card returned by ``get_cards()`` the number of u'@00\\u05e4\\u05e8\\u05e7'
    headers found in ``<card>.txt`` is compared with the number of subrefs of
    the corresponding ``Ref`` (the card name without the 'Rambam ' prefix).
    'Rambam Pirkei Avot' is always counted as good.

    :return: dict with the keys 'good' and 'bad', each a list of card names
    """
    report = {'good': [], 'bad': []}

    for card in get_cards():
        mishnah_ref = Ref(card.replace('Rambam ', ''))
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tags = TagTester(u'@00', infile, u'@00\u05e4\u05e8\u05e7').grab_each_header()
            counts_agree = len(tags) == len(mishnah_ref.all_subrefs())
            bucket = 'good' if counts_agree or card == 'Rambam Pirkei Avot' else 'bad'
            report[bucket].append(card)

    return report
def check_chapters():
    """
    Print the Hebrew numeral of every place the @30 numbering breaks.

    Reads 'Minchat_Chinuch.txt', takes capture group 1 of each @30 header,
    strips the double quotes and converts it with ``util.getGematria``.  The
    values are compared with a counter that starts at 1; on a mismatch the
    expected numeral is printed and counting resumes after the value found.
    """
    with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch:
        tester = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})')
        expected = 1
        for raw_header in tester.grab_each_header(capture_group=1):
            value = util.getGematria(raw_header.replace(u'"', u''))
            if value != expected:
                print(util.numToHeb(expected))
            # In both cases the next header should be one past this value.
            expected = value + 1
def check_chapters():
    """
    Classify each card by whether its chapter-header count matches the Mishnah.

    Every card from ``get_cards()`` is read from ``<card>.txt``; the number of
    u'@00\\u05e4\\u05e8\\u05e7' headers is compared with
    ``len(Ref(...).all_subrefs())`` for the card name stripped of 'Rambam '.
    A match, or the card 'Rambam Pirkei Avot', counts as good.

    :return: ``{'good': [...], 'bad': [...]}`` listing card names
    """
    passed = []
    failed = []

    for card in get_cards():
        m_ref = Ref(card.replace('Rambam ', ''))
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tester = TagTester(u'@00', infile, u'@00\u05e4\u05e8\u05e7')
            header_count = len(tester.grab_each_header())
            if header_count != len(m_ref.all_subrefs()) and card != 'Rambam Pirkei Avot':
                failed.append(card)
                continue
            passed.append(card)

    return {'good': passed, 'bad': failed}
def check_tags_on_category(category, tag, tag_regex, check_function):
    """
    Check that all the tags in category run in order

    Iterates over the module-level ``tractates``.  For each one the Hebrew
    title has u'משנה' replaced by ``category`` to build the file name.  The
    tags are split per perek with ``get_tags_by_perek`` and every perek is
    passed to ``check_function``.  Results go to ``<category>_tags.txt``,
    which is overwritten on each call.

    :param category: משניות, יכין or whatever is needed to identify the files
    :param tag: exact tag string handed to ``TagTester``
    :param tag_regex: regular expression handed to ``TagTester``
    :param check_function: called as ``check_function(perek, message, output)``;
        a falsy return value marks the tractate as having problems
    """
    seg_reg = u'@00(?:פרק |פ)([א-ת,"]{1,3})'

    with codecs.open(u'{}_tags.txt'.format(category), 'w', 'utf-8') as output:
        for tractate in tractates:
            name = Ref(tractate).he_book().replace(u'משנה', category)

            try:
                in_file = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8')
            except IOError:
                output.write(u'{}.txt does not exist\n'.format(name))
                continue

            # Close every input once it has been checked.
            with in_file:
                # create TagTester object for each file
                tag_object = TagTester(tag, in_file, tag_regex, name)

                # get tags in array
                whole_book = get_tags_by_perek(tag_object, seg_reg, 1)

                # Every perek is checked (no short-circuit) so that each
                # problem gets reported by check_function.
                perfect = True
                for index, perek in enumerate(whole_book):
                    message = u'{} פרק {}'.format(name, index + 1)
                    if not check_function(perek, message, output):
                        perfect = False

            if perfect:
                output.write(u'{}-אין בעיות\n'.format(name))
def tag_matches_regex(exact_tag, expression, output_file_name):
    """
    Boaz tags are all over the place. Given a tag, make sure all appearances
    of the tag can be grabbed by the regular expression.

    For each Mishnah index the Boaz file name is derived from the Hebrew title
    (u'משנה' replaced by u'בועז').  The reported number of issues is
    ``tester.appearances`` minus the sum of the counts in ``tester.types``.

    :param exact_tag: Exact string defining a tag (e.g. @00)
    :param expression: Regular expression with which to grab the tag
    :param output_file_name: file to write results
    """
    results = codecs.open(output_file_name, 'w', 'utf-8')

    for book in library.get_indexes_in_category('Mishnah'):
        name = Ref(book).he_book().replace(u'משנה', u'בועז')
        path = u'{}.txt'.format(name)

        if os.path.isfile(path):
            input_file = codecs.open(path, 'r', 'utf-8')
            tester = TagTester(exact_tag, input_file, expression)
            matched = sum(tester.types[key] for key in tester.types.keys())
            unmatched = tester.appearances - matched
            results.write(u'{} found {} issues\n'.format(name, unmatched))
            input_file.close()
        else:
            results.write(u'missing boaz {}\n'.format(book))

    results.close()
def get_TYT_perek_lengths():
    """
    Count the perakim found in every ``*.txt`` file of the current directory.

    Files whose name contains "intro" are skipped.  Each remaining file is
    checked with ``TagTester.in_order_one_section(1)``; when the first element
    of the result is "SUCCESS" the length of the second element is recorded.
    Files that do not succeed are left out of the result rather than being
    given a stale or undefined count.

    The key is "Pirkei Avot" for file names containing "avot"; otherwise it is
    "Mishnah " plus the title-cased file name with underscores turned into
    spaces.

    :return: dict mapping the derived tractate name to its number of perakim
    """
    TYT_lengths = {}
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'

    for file in glob.glob(u"*.txt"):
        if "intro" in file:
            continue

        # Context manager so one handle is not leaked per file.
        with open(file) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            TYT_perakim = tt.in_order_one_section(1)

        if TYT_perakim[0] != "SUCCESS":
            # No trustworthy count for this file.
            continue

        if "avot" in file:
            mishnah_name = "Pirkei Avot"
        else:
            mishnah_name = "Mishnah " + file.replace(".txt", "").title()
            mishnah_name = mishnah_name.replace("_", " ")

        TYT_lengths[mishnah_name] = len(TYT_perakim[1])

    return TYT_lengths
def get_TYT_perek_lengths():
    """
    Map every tractate file in the current directory to its number of perakim.

    Every ``*.txt`` file without "intro" in its name is run through
    ``TagTester.in_order_one_section(1)``.  Only results whose first element
    is "SUCCESS" contribute an entry (the length of the second element); a
    failing file is skipped so it can never pick up the previous file's count
    or hit an undefined variable.

    Keys are "Pirkei Avot" for names containing "avot", otherwise "Mishnah "
    followed by the title-cased file name with "_" replaced by " ".

    :return: dict of derived tractate name -> perek count
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'
    TYT_lengths = {}

    for file in glob.glob(u"*.txt"):
        if "intro" in file:
            continue

        # Release the handle as soon as the tags have been read.
        with open(file) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            TYT_perakim = tt.in_order_one_section(1)

        if TYT_perakim[0] == "SUCCESS":
            if "avot" in file:
                mishnah_name = "Pirkei Avot"
            else:
                mishnah_name = "Mishnah " + file.replace(".txt", "").title()
                mishnah_name = mishnah_name.replace("_", " ")
            TYT_lengths[mishnah_name] = len(TYT_perakim[1])

    return TYT_lengths
def check_mishnayot():
    """
    Check the @22 numbering of every card and print a summary.

    For each card from ``get_cards()`` the file ``<card>.txt`` is checked with
    ``TagTester.in_order_many_sections(end_tag=u'@00', capture_group=1)``.
    Cards whose result starts with 'SUCCESS' are counted as successes; any
    other card is reported immediately (name and ``len(result[1])``) and
    remembered, so that the failure total and the closing list of failed
    cards reflect what actually happened.
    """
    cards = get_cards()
    success, failure = [], []

    for card in cards:
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tester = TagTester(u'@22', infile, u'@22([\u05d0-\u05ea]{1,2})')
            result = tester.in_order_many_sections(end_tag=u'@00', capture_group=1)

            if result[0] == 'SUCCESS':
                success.append(card)
            else:
                # Record the card; without this the summary always showed 0.
                failure.append(card)
                print('failure: {}'.format(card))
                print(len(result[1]))

    print('successes: {}'.format(len(success)))
    print('failures: {}'.format(len(failure)))
    print('total: {}'.format(len(cards)))
    for item in failure:
        print(item)
def get_num_TYTs_per_perek():
    """
    Collect the @22 headers of every tractate file, grouped per perek.

    Every ``*.txt`` file without "intro" in its name is run through
    ``TagTester.in_order_many_sections(end_tag="@00")``.  On "SUCCESS" the
    second element of the result is used as the list of perakim; otherwise
    the debugger is started (and, if execution is resumed, the raw result is
    used as-is).

    Keys are "Pirkei Avot" for names containing "avot", otherwise "Mishnah "
    followed by the title-cased file name with "_" replaced by " ".

    :return: tuple ``(num_TYTs, actual_TYTs)``; the first maps each tractate
        to a list with the length of every perek entry, the second maps it to
        the entries themselves
    """
    num_TYTs = {}
    actual_TYTs = {}
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'

    for file in glob.glob(u"*.txt"):
        if "intro" in file:
            continue

        # The handle stays open while debugging and is closed afterwards.
        with open(file) as open_file:
            tt = TagTester("@22", open_file, reg=reg)
            headers = tt.in_order_many_sections(end_tag="@00")
            if headers[0] == "SUCCESS":
                headers = headers[1]
            else:
                pdb.set_trace()

        if "avot" in file:
            masechet = "Pirkei Avot"
        else:
            masechet = "Mishnah "+file.replace(".txt", "").replace("_"," ").title()

        actual_TYTs[masechet] = headers
        num_TYTs[masechet] = [len(perek) for perek in headers]

    return num_TYTs, actual_TYTs
def tag_starts_line(tag, category):
    """
    Make sure a tag always begins a new line

    Iterates over the module-level ``tractates``; for each one the Hebrew
    title has u'משנה' replaced by ``category`` to build the file name.  The
    outcome of ``TagTester.does_start_line()`` is printed per file, and files
    that cannot be opened are reported and skipped.

    :param tag: regular expression with which to find tag
    :param category: Identifier for the files (i.e משניות, יכין etc.)
    """
    for tractate in tractates:
        name = Ref(tractate).he_book().replace(u'משנה', category)

        try:
            in_file = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8')
        except IOError:
            print(u'cannot find {}'.format(name))
            continue

        # Close every input once it has been checked.
        with in_file:
            # instantiate TagTester
            tester = TagTester(tag, in_file)
            starts_line = tester.does_start_line()

        if starts_line:
            print(u'{} is okay!'.format(name))
        else:
            print(u'problem with {}'.format(name))
def get_num_TYTs_per_perek():
    """
    Gather, per tractate file, the @22 headers of each perek and their counts.

    Files matching ``*.txt`` whose name contains "intro" are ignored.  For the
    others ``TagTester.in_order_many_sections(end_tag="@00")`` is called; when
    its first element is "SUCCESS" the second element is taken as the list of
    perakim, otherwise pdb is entered (the raw result is used if execution is
    resumed).

    A file name containing "avot" maps to "Pirkei Avot"; any other name maps
    to "Mishnah " plus the title-cased name with "_" replaced by " ".

    :return: ``(num_TYTs, actual_TYTs)`` -- per tractate, the list of
        ``len(perek)`` values and the perek entries themselves
    """
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'
    num_TYTs = {}
    actual_TYTs = {}

    for file in glob.glob(u"*.txt"):
        if "intro" in file:
            continue

        # Keep the file open for the debugger, close it right after.
        with open(file) as open_file:
            tt = TagTester("@22", open_file, reg=reg)
            headers = tt.in_order_many_sections(end_tag="@00")
            if headers[0] == "SUCCESS":
                headers = headers[1]
            else:
                pdb.set_trace()

        if "avot" in file:
            masechet = "Pirkei Avot"
        else:
            masechet = "Mishnah " + file.replace(".txt", "").replace(
                "_", " ").title()

        num_TYTs[masechet] = [len(perek) for perek in headers]
        actual_TYTs[masechet] = headers

    return num_TYTs, actual_TYTs
def check_chapters(category, chap_reg):
    """
    Report tractates whose number of @00 chapters differs from the Mishnah.

    Iterates over the module-level ``tractates``; for each one the Hebrew
    title has u'משנה' replaced by ``category`` to build the file name.  The
    chapters are collected with ``get_tags_by_perek`` and their number is
    compared with ``len(ref.all_subrefs())``.  Mismatches and missing files
    are written to 'chapters.txt', which is overwritten on each call.

    :param category: identifier substituted into the Hebrew title
    :param chap_reg: regular expression for the chapter tag, handed to
        ``TagTester``
    """
    with codecs.open('chapters.txt', 'w', 'utf-8') as output:
        for tractate in tractates:
            ref = Ref(tractate)
            name = ref.he_book().replace(u'משנה', category)

            try:
                in_file = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8')
            except IOError:
                output.write(u'{}.txt does not exist\n'.format(name))
                continue

            # Close every input once its chapters have been counted.
            with in_file:
                chap_tag = TagTester(u'@00', in_file, chap_reg, name)
                chapters = get_tags_by_perek(chap_tag, chap_tag.reg, capture_group=1)
                chapter_count = len(chapters)

            if chapter_count != len(ref.all_subrefs()):
                output.write(u'Chapter mismatch {}\n'.format(tractate))
def test_accercy(tag, filename):
    """
    Print the ``appearances`` attribute of a ``TagTester`` built for ``tag``.

    :param tag: tag handed to ``TagTester``
    :param filename: path of the utf-8 text file to scan
    """
    with codecs.open(filename, 'r', 'utf-8') as source:
        print(TagTester(tag, source).appearances)