Ejemplo n.º 1
0
    def process_book(self, mishnah_title):
        """Find each Mishnah quoted in a Talmud volume and record where it sits.

        The Talmud volume is read line by line.  A line matching
        ``self.matni_re``, or the first line read after a chapter start, is
        treated as the opening of a quoted Mishnah.  That opening is
        fuzzy-matched against the text at the Mishnah pointer; on success the
        tail of the quoted passage (the lines read before one that matches
        ``self.gemarah_re`` or contains the Hadran phrase) is matched against
        the mishnayot left in the chapter to find where the quotation ends.

        Side effects:
          * every match is written as a row to ``self.csv_writer`` and counted
            in ``self.matched_count``;
          * mishnayot still unmatched when a chapter closes are written as
            three-column rows and counted in ``self.unmatched_count``;
          * progress is written to ``self.log``, errors also to
            ``self.error_log`` and stdout.

        :param mishnah_title: title of the Mishnah volume.  The Talmud title is
            ``mishnah_title[8:]`` (presumably this strips a "Mishnah " prefix;
            confirm against callers).  Spaces become underscores in both.
        :return: None
        """
        bavli = TalmudVolume(re.sub(" ", "_", mishnah_title[8:]))
        mishnah = MishnahVolume(re.sub(" ", "_", mishnah_title))

        # A line containing this phrase ("Hadran alakh") is treated as the end of a perek.
        hadran = u'\u05d4\u05d3\u05e8\u05df \u05e2\u05dc\u05da'

        perek_start = True
        mishnayot_end = False
        while bavli.has_more():
            # Once the chapter's mishnayot are used up there is nothing left to compare against.
            current_mishnah = mishnah.get_current_mishnah() if not mishnayot_end else ""

            (starting_daf, starting_line, line) = bavli.get_next_line()

            keyword = self.matni_re.match(line)
            if keyword or perek_start:  # Mishnah keyword, or the first line of a perek
                self.log.write(u"Found Mishnah start at {}:{}\n{}\n".format(starting_daf, starting_line, line))
                ending_daf = starting_daf
                ending_line = starting_line

                # A perek may open without the keyword; in that case `line` is used as is.
                if keyword:
                    after_keyword = keyword.group(2)
                    if len(after_keyword) == 0:  # bare keyword - the text starts on the next line
                        (ending_daf, ending_line, line) = bavli.get_next_line()
                    else:  # keyword followed by content - trim the keyword off
                        line = after_keyword

                line = self.replace_roshei_tevot(line)
                if fuzz.partial_ratio(line, current_mishnah) > 60:
                    self.log.write(u"Matched a starting line in the Mishnah: {}\n{}\n".format(line, current_mishnah))

                    mishnah_line_match_length, mishnah_line_match_threshold, max_lines = self.get_match_thresholds(bavli.title, mishnah.current_chapter)

                    starting_mishnah = mishnah.current_mishnah
                    if mishnayot_end:
                        msg = u"Error: Found too many mishnayot in {} {}!\n".format(bavli.title, mishnah.current_chapter)
                        print(msg)
                        self.log.write(msg)
                        self.error_log.write(msg)
                        self.csv_writer.writerow([bavli.title, mishnah.current_chapter, u"?", u"?", starting_daf, starting_line])

                    # Read forward to the line that ends the quoted Mishnah ('Gemara' marker or Hadran).
                    # NOTE(review): this loop does not consult bavli.has_more(); confirm
                    # get_next_line() is safe if the volume ends inside a Mishnah.
                    lines_in_match = 1
                    while not self.gemarah_re.search(line) and hadran not in line:
                        (_daf, _line_no, line) = bavli.get_next_line()
                        lines_in_match += 1

                    # Never ask for more lines than were read for this Mishnah.
                    lines_to_get = min(max_lines, lines_in_match)
                    (ending_daf, ending_line, last_bavli_segment) = bavli.get_previous_lines(lines_to_get)
                    # Only the tail of the quoted passage is used to locate its end.
                    last_bavli_segment = last_bavli_segment.strip()[-mishnah_line_match_length:]

                    # Open up Roshei Teivot
                    last_bavli_segment = self.replace_roshei_tevot(last_bavli_segment)

                    ending_mishnah = None
                    for i in range(mishnah.number_left_in_chapter() + 1):
                        candidate = mishnah.get_next_mishnah(i)
                        assert len(last_bavli_segment) < len(candidate)
                        (ratio, offset_start, offset_ending) = fuzz.partial_with_place(candidate, last_bavli_segment)
                        if ratio < mishnah_line_match_threshold:
                            self.log.write(u"Failed to match last Talmud line to Mishnah: \n{}\n{}\n\n".format(last_bavli_segment, candidate))
                            continue
                        self.log.write(u"Succeeded to match last Talmud line to Mishanh: \n{}\n{}\n\n".format(last_bavli_segment, candidate))
                        ending_mishnah = mishnah.current_mishnah + i
                        if offset_ending < len(candidate) - self.end_of_mishnah_fudge_character_length:
                            # Match ended in the middle of a mishnah (the fudge length is the
                            # close-enough-to-end allowance): resume from just past the match.
                            mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah, offset_ending + 1)
                            self.log.write(u"Internal match {} in {}\n - advanced Mishnah offset: {}, {}, {}\n".format(last_bavli_segment, candidate, mishnah.current_chapter, ending_mishnah, offset_ending + 1))
                        elif i == mishnah.number_left_in_chapter():
                            # Match ended at the end of the chapter's last mishnah.
                            self.log.write(u"Reached end of mishnayot in chapter {}.\n".format(str(mishnah.current_chapter)))
                            mishnayot_end = True
                        else:
                            # Match ended at the end of a mishnah with more to follow.
                            mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah + 1)
                            self.log.write(u"Advanced to next Mishnah: {}, {}\n".format(mishnah.current_chapter, ending_mishnah + 1))
                        break  # the first mishnah that clears the threshold wins

                    match = [bavli.title, mishnah.current_chapter, starting_mishnah, ending_mishnah, starting_daf, starting_line, ending_daf, ending_line]
                    match_text = ", ".join([str(field) for field in match])

                    if ending_mishnah is None:
                        self.log.write(u"saw unmatched Mishna in Talmud: {}\n".format(match_text))
                    else:
                        self.csv_writer.writerow(match)
                        self.matched_count += 1
                        self.log.write(u"Match! {}\n".format(match_text))
                else:
                    # Terminated with a newline so the entry does not run into the next log line.
                    self.log.write(u"Talmud Mishna start: {}\n - did not match next Mishna: {}\n".format(line, current_mishnah))
            if perek_start:
                perek_start = False

            # Check for Hadran
            if hadran in line:
                if not mishnayot_end:
                    msg = u'Error: Mishna did not reach the end of chapter! ("{}", {}).\t{} remain.'.format(bavli.title, mishnah.current_chapter, mishnah.number_left_in_chapter() + 1)
                    print(msg)
                    self.log.write(msg + u"\n")
                    self.error_log.write(msg + u"\n")
                    for n in mishnah.remaining_mishnah_numbers():
                        self.unmatched_count += 1
                        self.csv_writer.writerow([bavli.title, mishnah.current_chapter, n])
                self.log.write(u"End of perek: {} {} on {} {}\n".format(bavli.title, mishnah.current_chapter, bavli.get_current_line()[0], bavli.get_current_line()[1]))
                try:
                    # Advance to next chapter, reset indicators
                    next_chapter = self.get_next_bavli_chapter(bavli.title, mishnah.current_chapter)
                    mishnah.advance_pointer(next_chapter)
                    perek_start = True
                    mishnayot_end = False
                except PointerException:
                    # No chapter to advance to: the book is finished.
                    self.log.write(u"End of book: {} {} on {} {}\n".format(bavli.title, mishnah.current_chapter, bavli.get_current_line()[0], bavli.get_current_line()[1]))
                    break
Ejemplo n.º 2
0
    def process_book(self, mishnah_title):
        """Find each Mishnah quoted in a Talmud volume and record where it sits.

        The Talmud volume is read line by line.  A line matching
        ``self.matni_re``, or the first line read after a chapter start, is
        treated as the opening of a quoted Mishnah.  That opening is
        fuzzy-matched against the text at the Mishnah pointer; on success the
        tail of the quoted passage (the lines read before one that matches
        ``self.gemarah_re`` or contains the Hadran phrase) is matched against
        the mishnayot left in the chapter to find where the quotation ends.

        Side effects:
          * every match is written as a row to ``self.csv_writer`` and counted
            in ``self.matched_count``;
          * mishnayot still unmatched when a chapter closes are written as
            three-column rows and counted in ``self.unmatched_count``;
          * progress is written to ``self.log``, errors also to
            ``self.error_log`` and stdout.

        :param mishnah_title: title of the Mishnah volume.  The Talmud title is
            ``mishnah_title[8:]`` (presumably this strips a "Mishnah " prefix;
            confirm against callers).  Spaces become underscores in both.
        :return: None
        """
        bavli = TalmudVolume(re.sub(" ", "_", mishnah_title[8:]))
        mishnah = MishnahVolume(re.sub(" ", "_", mishnah_title))

        # A line containing this phrase ("Hadran alakh") is treated as the end of a perek.
        hadran = '\u05d4\u05d3\u05e8\u05df \u05e2\u05dc\u05da'

        perek_start = True
        mishnayot_end = False
        while bavli.has_more():
            # Once the chapter's mishnayot are used up there is nothing left to compare against.
            current_mishnah = "" if mishnayot_end else mishnah.get_current_mishnah()

            (starting_daf, starting_line, line) = bavli.get_next_line()

            keyword = self.matni_re.match(line)
            if keyword or perek_start:  # Mishnah keyword, or the first line of a perek
                self.log.write("Found Mishnah start at {}:{}\n{}\n".format(
                    starting_daf, starting_line, line))
                ending_daf = starting_daf
                ending_line = starting_line

                # A perek may open without the keyword; in that case `line` is used as is.
                if keyword:
                    after_keyword = keyword.group(2)
                    if len(after_keyword) == 0:
                        # Bare keyword - the text starts on the next line.
                        (ending_daf, ending_line, line) = bavli.get_next_line()
                    else:
                        # Keyword followed by content - trim the keyword off.
                        line = after_keyword

                line = self.replace_roshei_tevot(line)
                if fuzz.partial_ratio(line, current_mishnah) > 60:
                    self.log.write(
                        "Matched a starting line in the Mishnah: {}\n{}\n".format(
                            line, current_mishnah))

                    (mishnah_line_match_length, mishnah_line_match_threshold,
                     max_lines) = self.get_match_thresholds(
                         bavli.title, mishnah.current_chapter)

                    starting_mishnah = mishnah.current_mishnah
                    if mishnayot_end:
                        msg = "Error: Found too many mishnayot in {} {}!\n".format(
                            bavli.title, mishnah.current_chapter)
                        print(msg)
                        self.log.write(msg)
                        self.error_log.write(msg)
                        self.csv_writer.writerow([
                            bavli.title, mishnah.current_chapter, "?", "?",
                            starting_daf, starting_line
                        ])

                    # Read forward to the line that ends the quoted Mishnah
                    # ('Gemara' marker or Hadran).
                    # NOTE(review): this loop does not consult bavli.has_more();
                    # confirm get_next_line() is safe if the volume ends inside
                    # a Mishnah.
                    lines_in_match = 1
                    while not self.gemarah_re.search(line) and hadran not in line:
                        (_daf, _line_no, line) = bavli.get_next_line()
                        lines_in_match += 1

                    # Never ask for more lines than were read for this Mishnah.
                    lines_to_get = min(max_lines, lines_in_match)
                    (ending_daf, ending_line,
                     last_bavli_segment) = bavli.get_previous_lines(lines_to_get)
                    # Only the tail of the quoted passage is used to locate its end.
                    last_bavli_segment = last_bavli_segment.strip()[
                        -mishnah_line_match_length:]

                    # Open up Roshei Teivot
                    last_bavli_segment = self.replace_roshei_tevot(
                        last_bavli_segment)

                    ending_mishnah = None
                    for i in range(mishnah.number_left_in_chapter() + 1):
                        candidate = mishnah.get_next_mishnah(i)
                        assert len(last_bavli_segment) < len(candidate)
                        (ratio, offset_start,
                         offset_ending) = fuzz.partial_with_place(
                             candidate, last_bavli_segment)
                        if ratio < mishnah_line_match_threshold:
                            self.log.write(
                                "Failed to match last Talmud line to Mishnah: \n{}\n{}\n\n"
                                .format(last_bavli_segment, candidate))
                            continue
                        self.log.write(
                            "Succeeded to match last Talmud line to Mishanh: \n{}\n{}\n\n"
                            .format(last_bavli_segment, candidate))
                        ending_mishnah = mishnah.current_mishnah + i
                        if offset_ending < len(
                                candidate
                        ) - self.end_of_mishnah_fudge_character_length:
                            # Match ended in the middle of a mishnah (the fudge
                            # length is the close-enough-to-end allowance):
                            # resume from just past the match.
                            mishnah.advance_pointer(mishnah.current_chapter,
                                                    ending_mishnah,
                                                    offset_ending + 1)
                            self.log.write(
                                "Internal match {} in {}\n - advanced Mishnah offset: {}, {}, {}\n"
                                .format(last_bavli_segment, candidate,
                                        mishnah.current_chapter,
                                        ending_mishnah, offset_ending + 1))
                        elif i == mishnah.number_left_in_chapter():
                            # Match ended at the end of the chapter's last mishnah.
                            self.log.write(
                                "Reached end of mishnayot in chapter {}.\n".format(
                                    str(mishnah.current_chapter)))
                            mishnayot_end = True
                        else:
                            # Match ended at the end of a mishnah with more to follow.
                            mishnah.advance_pointer(mishnah.current_chapter,
                                                    ending_mishnah + 1)
                            self.log.write(
                                "Advanced to next Mishnah: {}, {}\n".format(
                                    mishnah.current_chapter,
                                    ending_mishnah + 1))
                        break  # the first mishnah that clears the threshold wins

                    match = [
                        bavli.title, mishnah.current_chapter, starting_mishnah,
                        ending_mishnah, starting_daf, starting_line,
                        ending_daf, ending_line
                    ]
                    match_text = ", ".join([str(field) for field in match])

                    if ending_mishnah is None:
                        self.log.write(
                            "saw unmatched Mishna in Talmud: {}\n".format(
                                match_text))
                    else:
                        self.csv_writer.writerow(match)
                        self.matched_count += 1
                        self.log.write("Match! {}\n".format(match_text))
                else:
                    # Terminated with a newline so the entry does not run into
                    # the next log line.
                    self.log.write(
                        "Talmud Mishna start: {}\n - did not match next Mishna: {}\n"
                        .format(line, current_mishnah))
            if perek_start:
                perek_start = False

            # Check for Hadran
            if hadran in line:
                if not mishnayot_end:
                    msg = 'Error: Mishna did not reach the end of chapter! ("{}", {}).\t{} remain.'.format(
                        bavli.title, mishnah.current_chapter,
                        mishnah.number_left_in_chapter() + 1)
                    print(msg)
                    self.log.write(msg + "\n")
                    self.error_log.write(msg + "\n")
                    for n in mishnah.remaining_mishnah_numbers():
                        self.unmatched_count += 1
                        self.csv_writer.writerow(
                            [bavli.title, mishnah.current_chapter, n])
                self.log.write("End of perek: {} {} on {} {}\n".format(
                    bavli.title, mishnah.current_chapter,
                    bavli.get_current_line()[0],
                    bavli.get_current_line()[1]))
                try:
                    # Advance to next chapter, reset indicators
                    next_chapter = self.get_next_bavli_chapter(
                        bavli.title, mishnah.current_chapter)
                    mishnah.advance_pointer(next_chapter)
                    perek_start = True
                    mishnayot_end = False
                except PointerException:
                    # No chapter to advance to: the book is finished.
                    self.log.write("End of book: {} {} on {} {}\n".format(
                        bavli.title, mishnah.current_chapter,
                        bavli.get_current_line()[0],
                        bavli.get_current_line()[1]))
                    break
Ejemplo n.º 3
0
def process_book(bavli, mishnah, csv_writer):
    """Find each Mishnah quoted in a Talmud volume and write one CSV row per match.

    The Talmud volume is read line by line.  A line matching the module-level
    ``matni_re``, or the first line read after a chapter start, is treated as
    the opening of a quoted Mishnah.  That opening is fuzzy-matched against
    the text at the Mishnah pointer; on success the two lines preceding the
    end of the quotation are matched against the mishnayot left in the chapter
    to find where the quotation ends.

    The end of a quotation is the first line that matches ``gemarah_re`` or
    contains the Hadran phrase.  Stopping on Hadran keeps a Mishnah that closes
    a chapter (with no Gemara after it) from being read into the next chapter,
    and lets the chapter-end check below see the Hadran line.

    Progress is written to the module-level ``log``, failures also to
    ``error_log`` and stdout.

    :param bavli: Talmud volume being read (provides has_more, get_next_line,
        get_previous_line, get_current_line and title).
    :param mishnah: Mishnah volume whose pointer is advanced as matches are found.
    :param csv_writer: writer that receives one row per successful match.
    :return: None
    """
    # A line containing this phrase ("Hadran alakh") is treated as the end of a perek.
    hadran = u'\u05d4\u05d3\u05e8\u05df \u05e2\u05dc\u05da'

    perek_start = True
    mishnayot_end = False
    while bavli.has_more():
        # Once the chapter's mishnayot are used up there is nothing left to compare against.
        current_mishnah = mishnah.get_current_mishnah() if not mishnayot_end else ""

        (starting_daf, starting_line, line) = bavli.get_next_line()
        keyword = matni_re.match(line)
        if keyword or perek_start:  # Mishnah keyword, or the first line of a perek
            log.write(u"Found Mishnah start at {}:{}\n{}\n".format(starting_daf, starting_line, line))
            if perek_start or len(keyword.group(2)) > 6:
                # The Mishnah text starts on this line.
                # NOTE(review): at a perek start that matched the keyword, `line`
                # becomes group(2) even when that is very short - confirm intended.
                ending_daf = starting_daf
                ending_line = starting_line
                if keyword:
                    line = keyword.group(2)
            else:
                # Little or nothing follows the keyword: the text starts on the next line.
                (ending_daf, ending_line, line) = bavli.get_next_line()
            if fuzz.partial_ratio(line, current_mishnah) > 60:
                log.write(u"Matched a starting line in the Mishnah: {}\n{}\n".format(line, current_mishnah))
                starting_mishnah = mishnah.current_mishnah
                if mishnayot_end:
                    msg = u"Error: Found too many mishnayot in {} {}!\n".format(bavli.title, mishnah.current_chapter)
                    print(msg)
                    log.write(msg)
                    error_log.write(msg)

                # Read forward to the line that ends the quoted Mishnah.
                # NOTE(review): this loop does not consult bavli.has_more(); confirm
                # get_next_line() is safe if the volume ends inside a Mishnah.
                while not gemarah_re.search(line) and hadran not in line:
                    (_daf, _line_no, line) = bavli.get_next_line()

                (ending_daf, ending_line, previous_line) = bavli.get_previous_line()
                (_daf, _line_no, previous_previous_line) = bavli.get_previous_line(2)
                last_bavli_segment = previous_previous_line.strip() + u" " + previous_line.strip()
                # Keep at most 29 characters from the tail, leaving out the very last character.
                last_bavli_segment = last_bavli_segment[-30:-1]
                ending_mishnah = None
                for i in range(mishnah.number_left_in_chapter() + 1):
                    candidate = mishnah.get_next_mishnah(i)
                    assert len(last_bavli_segment) < len(candidate)
                    (ratio, offset_start, offset_ending) = fuzz.partial_with_place(candidate, last_bavli_segment)
                    if ratio < 60:
                        failure = u"Failed to match last Talmud line to Mishnah: {}\n{}\n".format(last_bavli_segment, candidate)
                        log.write(failure)
                        error_log.write(failure)
                        continue
                    log.write(u"Succeeded to match last Talmud line to Mishanh: {}\n{}\n".format(last_bavli_segment, candidate))
                    ending_mishnah = mishnah.current_mishnah + i
                    if offset_ending < len(candidate) - 10:
                        # Match ended in the middle of a mishnah (10 is the
                        # close-enough-to-end fudge factor): resume just past the match.
                        mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah, offset_ending + 1)
                        log.write(u"Advanced Mishnah offset: {}, {}, {}\n".format(mishnah.current_chapter, ending_mishnah, offset_ending + 1))
                    elif i == mishnah.number_left_in_chapter():
                        # Match ended at the end of the chapter's last mishnah.
                        log.write(u"Reached end of mishnayot in chapter {} is {}\n".format(str(mishnah.current_chapter), str(len(mishnah.get_current_chapter_text()))))
                        mishnayot_end = True
                    else:
                        # Match ended at the end of a mishnah with more to follow.
                        mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah + 1)
                        log.write(u"Advanced to next Mishnah: {}, {}\n".format(mishnah.current_chapter, ending_mishnah + 1))
                    break  # the first mishnah that clears the threshold wins

                match = [bavli.title, mishnah.current_chapter, starting_mishnah, ending_mishnah, starting_daf, starting_line, ending_daf, ending_line]
                match_text = ", ".join([str(field) for field in match])

                if ending_mishnah is None:
                    msg = u"saw unmatched Mishna in Talmud: {}\n".format(match_text)
                    print(msg)
                    log.write(msg)
                    error_log.write(msg)
                else:
                    print("")  # blank separator line on stdout
                    csv_writer.writerow(match)
                    msg = u"Match! {}\n".format(match_text)
                    print(msg)
                    log.write(msg)

        if perek_start:
            perek_start = False

        # Check for Hadran
        if hadran in line:
            if not mishnayot_end:
                msg = u"Error: Mishna did not reach the end of chapter! {} {}\n".format(mishnah.title, mishnah.current_chapter)
                print(msg)
                log.write(msg)
            log.write(u"End of perek: {} {} on {} {}\n".format(bavli.title, mishnah.current_chapter, bavli.get_current_line()[0], bavli.get_current_line()[1]))
            try:
                # Advance to next chapter, reset indicators
                mishnah.advance_pointer(mishnah.current_chapter + 1)
                perek_start = True
                mishnayot_end = False
            except PointerException:
                # No chapter to advance to: the book is finished.
                log.write(u"End of book: {} {} on {} {}\n".format(bavli.title, mishnah.current_chapter, bavli.get_current_line()[0], bavli.get_current_line()[1]))
                break