Ejemplo n.º 1
0
class Commentary:
    """A single commentary on one Mishneh Torah book, built up row by row.

    The instance keeps the commentator and book titles (English and Hebrew),
    the identifiers used to recognise rows that belong to it, and a
    ``JaggedTextArray`` (``self.ja``) that receives the parsed text segments.
    """

    def __init__(self, **kwargs):
        """Store titles and identifiers and start with an empty text array.

        Required keyword arguments (a missing key raises ``KeyError``):
            en_title:  stored as ``self.commentator``.
            he_title:  stored as ``self.he_commentator``.
            book:      stored as ``self.book``.
            he_book:   stored as ``self.he_book``.
            bid:       stored as ``self.book_id``.
            HilchotId: stored as ``self.hilchot_id``.
        """
        self.commentator = kwargs['en_title']
        self.he_commentator = kwargs['he_title']
        self.book = kwargs['book']
        self.he_book = kwargs['he_book']
        self.book_id = kwargs['bid']
        self.hilchot_id = kwargs['HilchotId']
        self.ja = JaggedTextArray()

    @classmethod
    def build_from_row(cls, row) -> 'Commentary':
        """Alternate constructor: create a commentary from a source row.

        Reads ``HilchotId``, ``en_title``, ``name``, ``bid`` and (through
        ``book_titles_from_row``) ``Hilchot`` from ``row``.

        Raises:
            CommentaryError: when ``row['HilchotId']`` equals 2.
        """
        if row['HilchotId'] == 2:
            raise CommentaryError

        en_title, he_title = cls.book_titles_from_row(row)
        init_args = {
            'en_title': row['en_title'],
            'he_title': row['name'],
            'book': en_title,
            'he_book': he_title,
            'bid': row['bid'],
            'HilchotId': row['HilchotId'],
        }
        return cls(**init_args)

    @staticmethod
    def book_titles_from_row(row) -> tuple:
        """Return ``(english_title, hebrew_title)`` of the base book for ``row``.

        The Hebrew title is assembled from a fixed prefix and
        ``row['Hilchot']``; the English title is obtained by resolving that
        Hebrew title with ``sef_obj.Ref`` and taking its ``normal()`` form.
        """
        mishneh_torah = 'משנה תורה'
        full_title = f'{mishneh_torah}, {row["Hilchot"]}'
        en_title = sef_obj.Ref(full_title).normal()
        return en_title, full_title

    def is_part_of_commentary(self, row) -> bool:
        """Tell whether ``row`` carries this commentary's ``bid`` and ``HilchotId``."""
        return (self.book_id, self.hilchot_id) == (row['bid'],
                                                   row['HilchotId'])

    def add_segment(self, segment: str, indices: tuple) -> None:
        """Store ``segment`` as the next element under ``indices`` in ``self.ja``.

        The position is whatever ``self.ja.sub_array_length(indices)``
        reports; a ``None`` result is treated as 0 (presumably the sub-array
        does not exist yet -- confirm against ``JaggedTextArray``).
        """
        final_index = self.ja.sub_array_length(indices)
        if final_index is None:
            final_index = 0
        indices = indices + (final_index, )
        self.ja.set_element(indices, segment)

    def add_segments_from_row(self, row) -> None:
        """Parse ``row['text']`` into segments and add each one to ``self.ja``."""
        segments = self.build_segments(row['text'])
        indices = self.get_indices_for_row(row)
        for segment in segments:
            self.add_segment(segment, indices)

    @staticmethod
    def get_ja(title, he_title) -> dict:
        """Build, validate and serialize a three-level schema node.

        The node gets ``title`` / ``he_title`` as primary titles and the
        structure ``Chapter > Halakhah > Comment``.
        """
        ja = sef_obj.JaggedArrayNode()
        ja.add_primary_titles(title, he_title)
        ja.add_structure(['Chapter', 'Halakhah', 'Comment'])
        ja.validate()
        return ja.serialize()

    def generate_index(self) -> dict:
        """Return the index record describing this commentary."""
        title = f'{self.commentator} on {self.book}'
        he_title = f'{self.he_commentator} על {self.he_book}'

        return {
            'title': title,
            'categories': self.get_category(),
            'dependence': 'Commentary',
            'base_text_titles': [self.book],
            'schema': self.get_ja(title, he_title),
            'collective_title': self.commentator,
            'base_text_mapping': 'many_to_one'
        }

    def build_version(self) -> dict:
        """Return the version record holding the accumulated Hebrew text."""
        return {
            'versionTitle': 'Friedberg Edition',
            'versionSource': 'https://fjms.genizah.org',
            'language': 'he',
            'text': self.ja.array()
        }

    @staticmethod
    def build_segments(segment: str) -> list:
        """Convert one raw markup string into a list of cleaned segments.

        The input is wrapped in a ``<root>`` element and parsed as XML. Spans
        are then normalised, footnote marker/note pairs (classes ``R`` and
        ``N``) become ``<sup>`` + ``<i class="footnote">``, and the remaining
        ``B``/``Z``/``S`` spans become ``<b>``/``<quote>``/``<small>``. The
        serialized result is tidied and split by ``_finalize_markup``.

        Returns:
            A list of markup strings; always at least one element.
        """
        segment_xml = '<root>{}</root>'.format(segment)
        segment_soup = BeautifulSoup(segment_xml, 'xml')
        segment_root = segment_soup.root

        # when the parser reports a list of classes, keep only the last one
        for span in segment_root.find_all('span'):
            klass = span.get('class', '')
            if klass and isinstance(klass, list):
                span['class'] = span['class'][-1]

        # consolidate duplicate tags and unwrap meaningless tags
        # NOTE(review): a span without a previous sibling is skipped entirely,
        # so it is neither padded nor unwrapped/merged -- confirm intended.
        for span in segment_root.find_all('span'):
            previous = span.previous_sibling
            if not previous:
                continue

            # pad the span's text with a leading space; runs of whitespace
            # are collapsed again during the final clean-up
            if span.string:
                span.string.replace_with(
                    NavigableString(' {}'.format(span.string)))

            if span.get('class', '') == '':
                # a span without a class carries no markup information
                span.unwrap()

            elif span.name == previous.name and span.get(
                    'class') == previous.get('class'):
                # same tag and class as the preceding sibling: fold this
                # span's contents into that sibling
                previous.append(span)
                span.unwrap()

        # handle footnotes: pair the first remaining marker with the first
        # remaining note until either kind runs out
        while True:
            marker = segment_root.find('span', attrs={'class': 'R'})
            note_tag = segment_root.find('span', attrs={'class': 'N'})
            if marker and note_tag:
                marker.name = 'sup'
                del marker['class']
                note_text = note_tag.text
                # drop the marker text (plus one whitespace char) that
                # prefixes the note body
                note_text = re.sub(r'^{}\s'.format(re.escape(marker.text)), '',
                                   note_text)
                new_note = segment_soup.new_tag('i')
                new_note['class'] = 'footnote'
                new_note.string = note_text
                marker.insert_after(new_note)
                note_tag.decompose()
            else:
                break

        # 'quote' is a temporary tag name: it marks where segments may be
        # split and is turned into 'b' by _finalize_markup
        markup = segment_root.find_all('span', class_=re.compile('[BZS]'))
        for b in markup:
            if b['class'] == 'S':
                b.name = 'small'
            elif b['class'] == 'Z':
                b.name = 'quote'
            else:
                b.name = 'b'
            del b['class']

        return Commentary._finalize_markup(segment_root.decode_contents())

    @staticmethod
    def _finalize_markup(segment_text: str) -> list:
        """Tidy whitespace and line breaks, then split before leading quotes.

        Leading/trailing whitespace and trailing ``<br/>`` tags are removed,
        whitespace runs collapse to one space, and whitespace around
        ``<br/>`` is dropped. The text is split at every ``<br/>`` that is
        immediately followed by ``<quote>``; afterwards the temporary
        ``quote`` tags are renamed to ``b``. Only tag names are renamed, so
        the word "quote" inside the text itself is left alone.
        """
        segment_text = re.sub(r'^\s+|\s+$', '', segment_text)
        segment_text = re.sub(r'\s{2,}', ' ', segment_text)
        segment_text = re.sub(r'\s*<br/>\s*', '<br/>', segment_text)
        segment_text = re.sub(r'\s*(<br/>)+$', '', segment_text)

        # break on quotes which immediately follow a break
        broken_segments = re.split(r'<br/>(?=<quote>)', segment_text)
        return [
            re.sub(r'<(/?)quote\b', r'<\g<1>b', seg)
            for seg in broken_segments
        ]

    @staticmethod
    def get_indices_for_row(row) -> tuple:
        """Return the ``(PerekId, HalachaId)`` pair of ``row`` as array indices.

        Positive ids are shifted down by one; zero and negative ids are
        passed through unchanged.
        """
        def adjust(value: int) -> int:
            return value - 1 if value > 0 else value

        return adjust(row['PerekId']), adjust(row['HalachaId'])

    def get_term_data(self) -> tuple:
        """Return ``(commentator, he_commentator)``."""
        return self.commentator, self.he_commentator

    def get_category(self) -> tuple:
        """Return the category path for this commentary.

        The last element is the final category of the base book's index as
        reported by ``sef_obj.library.get_index``.
        """
        rambam_index = sef_obj.library.get_index(self.book)
        return ('Halakhah', 'Mishneh Torah', 'Commentary', self.commentator,
                rambam_index.categories[-1])