class Commentary:
    """One commentary on one book (Hilchot) of the Mishneh Torah.

    Text segments are accumulated in a ``JaggedTextArray`` and addressed as
    (chapter, halakhah, comment); the instance can then emit the index
    record and the version record describing that text.

    Attributes (all set in ``__init__``):
        commentator / he_commentator: English / Hebrew commentator title.
        book / he_book: English / Hebrew title of the base book.
        book_id: value of the ``bid`` column this commentary was built from.
        hilchot_id: value of the ``HilchotId`` column it was built from.
        ja: the ``JaggedTextArray`` holding the collected segments.
    """

    def __init__(self, **kwargs):
        """Store titles and ids; every key below is required.

        Expected keys: ``en_title``, ``he_title``, ``book``, ``he_book``,
        ``bid`` and ``HilchotId``.

        Raises:
            KeyError: if any of the expected keys is missing.
        """
        self.commentator = kwargs['en_title']
        self.he_commentator = kwargs['he_title']
        self.book = kwargs['book']
        self.he_book = kwargs['he_book']
        self.book_id = kwargs['bid']
        self.hilchot_id = kwargs['HilchotId']
        self.ja = JaggedTextArray()

    @classmethod
    def build_from_row(cls, row):
        """Alternate constructor: create a commentary from a source row.

        ``row`` must support item access for ``HilchotId``, ``Hilchot``,
        ``en_title``, ``name`` and ``bid``.

        Raises:
            CommentaryError: when ``row['HilchotId'] == 2``.
        """
        # NOTE(review): why HilchotId 2 is rejected is not visible in this
        # file - confirm against the data source before changing.
        if row['HilchotId'] == 2:
            raise CommentaryError
        en_title, he_title = cls.book_titles_from_row(row)
        init_args = {
            'en_title': row['en_title'],
            'he_title': row['name'],
            'book': en_title,
            'he_book': he_title,
            'bid': row['bid'],
            'HilchotId': row['HilchotId'],
        }
        return cls(**init_args)

    @staticmethod
    def book_titles_from_row(row):
        """Return ``(english_title, hebrew_title)`` of the row's base book.

        The Hebrew title is built from ``row['Hilchot']``; the English title
        is whatever ``sef_obj.Ref(...).normal()`` yields for that Hebrew
        title.
        """
        mishneh_torah = 'משנה תורה'
        full_title = f'{mishneh_torah}, {row["Hilchot"]}'
        en_title = sef_obj.Ref(full_title).normal()
        return en_title, full_title

    def is_part_of_commentary(self, row):
        """Return True when ``row`` has this commentary's ``bid`` and ``HilchotId``."""
        return (self.book_id, self.hilchot_id) == (row['bid'], row['HilchotId'])

    def add_segment(self, segment: str, indices: tuple) -> None:
        """Append ``segment`` as the next comment at ``indices``.

        ``indices`` addresses the parent section; the comment position is
        the current length of that section (0 when
        ``sub_array_length`` reports ``None``).
        """
        final_index = self.ja.sub_array_length(indices)
        if final_index is None:
            final_index = 0
        indices = indices + (final_index, )
        self.ja.set_element(indices, segment)

    def add_segments_from_row(self, row):
        """Parse ``row['text']`` into segments and append each one in order."""
        segments = self.build_segments(row['text'])
        indices = self.get_indices_for_row(row)
        for segment in segments:
            self.add_segment(segment, indices)

    @staticmethod
    def get_ja(title, he_title) -> dict:
        """Return a serialized, validated Chapter/Halakhah/Comment schema node."""
        ja = sef_obj.JaggedArrayNode()
        ja.add_primary_titles(title, he_title)
        ja.add_structure(['Chapter', 'Halakhah', 'Comment'])
        ja.validate()
        return ja.serialize()

    def generate_index(self) -> dict:
        """Return the index record describing this commentary."""
        title = f'{self.commentator} on {self.book}'
        he_title = f'{self.he_commentator} על {self.he_book}'
        return {
            'title': title,
            'categories': self.get_category(),
            'dependence': 'Commentary',
            'base_text_titles': [self.book],
            'schema': self.get_ja(title, he_title),
            'collective_title': self.commentator,
            'base_text_mapping': 'many_to_one'
        }

    def build_version(self) -> dict:
        """Return the Hebrew version record wrapping the collected text."""
        return {
            'versionTitle': 'Friedberg Edition',
            'versionSource': 'https://fjms.genizah.org',
            'language': 'he',
            'text': self.ja.array()
        }

    @staticmethod
    def build_segments(segment: str) -> list:
        """Convert one raw markup fragment into a list of HTML segments.

        The fragment is wrapped in a ``<root>`` element and parsed as XML.
        Its ``<span class=...>`` markup is rewritten into footnotes
        (``<sup>`` + ``<i class="footnote">``), ``<small>`` and ``<b>`` tags;
        whitespace and line breaks are normalized; and the result is split
        wherever a line break is immediately followed by a quote.

        Returns:
            A list of HTML strings, one per resulting segment.
        """
        segment_xml = '<root>{}</root>'.format(segment)
        segment_soup = BeautifulSoup(segment_xml, 'xml')
        segment_root = segment_soup.root

        # clear out multiple classes - we're only interested in the last
        # entry when the parser hands the class attribute back as a list
        for span in segment_root.find_all('span'):
            klass = span.get('class', '')
            if klass and isinstance(klass, list):
                span['class'] = klass[-1]

        # consolidate duplicate tags and unwrap meaningless tags
        for span in segment_root.find_all('span'):
            previous = span.previous_sibling
            # spans with no (or an empty) preceding sibling are left untouched
            if not previous:
                continue

            # prefix the span's text with a space so it cannot fuse with the
            # preceding text; duplicate whitespace is collapsed later on
            if span.string:
                span.string.replace_with(
                    NavigableString(' {}'.format(span.string)))

            if span.get('class', '') == '':
                # a span without a class carries no formatting: keep only
                # its contents
                span.unwrap()

            elif span.name == previous.name and span.get(
                    'class') == previous.get('class'):
                # same tag and class as the previous sibling: fold this
                # span's contents into it
                previous.append(span)
                span.unwrap()

        # handle footnotes: pair the first remaining marker (class R) with
        # the first remaining note (class N) until either kind runs out
        while True:
            marker = segment_root.find('span', attrs={'class': 'R'})
            note_tag = segment_root.find('span', attrs={'class': 'N'})
            if marker and note_tag:
                marker.name = 'sup'
                del marker['class']
                note_text = note_tag.text
                # the note repeats its own marker up front; drop that copy
                note_text = re.sub(
                    r'^{}\s'.format(re.escape(marker.text)), '', note_text)
                new_note = segment_soup.new_tag('i')
                new_note['class'] = 'footnote'
                new_note.string = note_text
                marker.insert_after(new_note)
                note_tag.decompose()
            else:
                break

        # translate the remaining formatting classes into tags; 'quote' is a
        # temporary tag name that _finalize_segments turns into <b> after
        # it has been used as a split point
        markup = segment_root.find_all('span', class_=re.compile('[BZS]'))
        for b in markup:
            if b['class'] == 'S':
                b.name = 'small'
            elif b['class'] == 'Z':
                b.name = 'quote'
            else:
                b.name = 'b'
            del b['class']

        return Commentary._finalize_segments(segment_root.decode_contents())

    @staticmethod
    def _finalize_segments(segment_text: str) -> list:
        """Normalize whitespace and breaks, split on quotes, emit ``<b>`` tags.

        Leading/trailing whitespace is removed, whitespace runs collapse to a
        single space, whitespace around ``<br/>`` is removed and trailing
        ``<br/>`` tags are dropped. The text is then split at every
        ``<br/>`` directly followed by ``<quote>``, and the temporary
        ``quote`` tags are renamed to ``b``.
        """
        segment_text = re.sub(r'^\s+|\s+$', '', segment_text)
        segment_text = re.sub(r'\s{2,}', ' ', segment_text)
        segment_text = re.sub(r'\s*<br/>\s*', '<br/>', segment_text)
        segment_text = re.sub(r'\s*(<br/>)+$', '', segment_text)

        # break on quotes which immediately follow a break
        broken_segments = re.split(r'<br/>(?=<quote>)', segment_text)

        # Rename only the tags themselves: a bare 'quote' -> 'b' substitution
        # would also rewrite the word "quote" wherever it occurs in the text.
        return [
            re.sub(r'<(/?)quote\b', r'<\1b', seg) for seg in broken_segments
        ]

    @staticmethod
    def get_indices_for_row(row) -> tuple:
        """Return ``(PerekId, HalachaId)`` with each positive value reduced by one.

        Values that are zero or negative are passed through unchanged.
        """
        def adjust(value: int):
            return value - 1 if value > 0 else value
        return adjust(row['PerekId']), adjust(row['HalachaId'])

    def get_term_data(self) -> tuple:
        """Return ``(english, hebrew)`` commentator titles."""
        return self.commentator, self.he_commentator

    def get_category(self) -> tuple:
        """Return the category path for this commentary's index.

        The last element is the final category of the base book's own index
        as reported by ``sef_obj.library.get_index``.
        """
        rambam_index = sef_obj.library.get_index(self.book)
        return ('Halakhah', 'Mishneh Torah', 'Commentary', self.commentator,
                rambam_index.categories[-1])