def hagahot_alignment(ja_smk, ja_raph, ja_hagahot): ja_smk = JaggedArray(ja_smk) ja_raph = JaggedArray(ja_raph) ja_hagahot = JaggedArray(ja_hagahot) # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())): dict_lst = [] dict = {u'siman': [], u'smk': [], u'raph': []} for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())): # print numToHeb(i+1) dict['siman'] = numToHeb(i + 1) for i, smk_line in enumerate(seg[0]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line) if hag_lett: dict['smk'].extend([(hag_l, i + 1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] # print 'RAPH' for i, raph_line in enumerate(seg[1]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line) if hag_lett: dict['raph'].extend([(hag_l, i + 1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] dict_lst.append(dict) dict = {u'siman': [], u'smk': [], u'raph': []} return dict_lst
def parse_unlinked(self):
    """Parse comments lacking explicit links into a 3-deep jagged array.

    Each phrase is placed at [chapter-1][verse-1][running index], where the
    running index counts comments already seen for that (chapter, verse)
    pair. Every placement is also recorded in the module-level
    unlinkedCommentStore.

    :raises AttributeError: if a phrase carries no subchap label.
    :return: nested-list form of the populated JaggedArray.
    """
    result = JaggedArray([[[]]])
    seen = Counter()
    for chapter in self.get_chapter():
        chapter_number = chapter.num
        for phrase in chapter.get_phrase():
            verse = phrase.subchap
            if verse is None:
                raise AttributeError(
                    u'Unlabeled phrase in {} chapter {}'.format(
                        self.get_author(), chapter_number))
            position = seen[(chapter_number, verse)]
            result.set_element(
                [int(chapter_number) - 1, int(verse) - 1, position],
                phrase.as_string())
            seen[(chapter_number, verse)] += 1
            unlinkedCommentStore.append({
                'commentator': commentatorNames[self.get_author()],
                'chapter': chapter_number,
                'verse': verse,
                'order': str(position + 1),
            })
    return result.array()
def parse_linked(self):
    """Place each linked phrase's comment at the position recorded for it
    in commentStore (chapter / verse / order, all 1-based), with newlines
    stripped from the comment text.

    :return: nested-list form of the populated JaggedArray.
    """
    result = JaggedArray([[[]]])
    for phrase in self.get_phrase():
        record = commentStore[phrase.id]
        address = [record['chapter'] - 1,
                   record['verse'] - 1,
                   record['order'] - 1]
        comment_text = phrase.get_comment().valueOf_.replace(u'\n', u'')
        result.set_element(address, comment_text)
    return result.array()
def parse_linked(self): parsed = JaggedArray([[[]]]) for phrase in self.get_phrase(): indices = (commentStore[phrase.id]['chapter'], commentStore[phrase.id]['verse'], commentStore[phrase.id]['order']) text = phrase.get_comment().valueOf_.replace(u'\n', u' ') text = re.sub(u' +', u' ', text) text = re.sub(ur' (:|\.)', ur'\1', text) parsed.set_element([i - 1 for i in indices], text) return parsed.array()
def parse(filename):
    """Extract <ftnote> elements from *filename* and arrange their markup
    into a 2-deep jagged array at the chapter/verse recorded for each
    footnote id in the comment store built from the same file.

    Footnotes whose id has no comment-store entry are skipped.

    :return: nested-list form of the populated JaggedArray.
    """
    locations = populate_comment_store(filename)
    result = JaggedArray([[]])
    with open(filename) as fp:
        soup = BeautifulSoup(fp, 'xml')
    for note in soup.find_all('ftnote'):
        where = locations.get(note.attrs['id'])
        if where is None:
            continue  # no recorded location for this footnote
        markup = u''.join([unicode(child) for child in note.children])
        result.set_element(
            [where['chapter'] - 1, where['verse'] - 1],
            structure_comments(markup), pad=[])
    return result.array()
def parse(filename):
    """Build a jagged array from a utf-8 file of alternating line pairs:
    odd lines (1st, 3rd, ...) are references resolved via ref_to_indices,
    even lines carry the text, minus its first space-delimited token.

    :return: nested-list form of the populated JaggedArray.
    """
    with codecs.open(filename, 'r', 'utf-8') as fp:
        all_lines = fp.readlines()
    result = JaggedArray([[[]]])
    position = None
    for index, raw_line in enumerate(all_lines):
        if index % 2 == 0:  # reference line
            position = ref_to_indices(raw_line)
        else:  # text line: drop the leading token
            result.set_element(position, u' '.join(raw_line.split(u' ')[1:]))
    return result.array()
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot): ja_smk = JaggedArray(ja_smk) ja_raph = JaggedArray(ja_raph) ja_hagahot = JaggedArray(ja_hagahot) # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())): dict_lst = [] dict = {u'siman':[], u'smk':[], u'raph':[]} for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())): # print numToHeb(i+1) dict['siman'] = numToHeb(i+1) for i, smk_line in enumerate(seg[0]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line) if hag_lett: dict['smk'].extend([(hag_l, i+1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] # print 'RAPH' for i, raph_line in enumerate(seg[1]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line) if hag_lett: dict['raph'].extend([(hag_l, i+1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] dict_lst.append(dict) dict = {u'siman': [], u'smk': [], u'raph': []} return dict_lst
def parse_file(filename):
    """Parse a utf-8 text file line by line through a RambamSegment and
    collect the results into a 3-deep jagged array.

    :param filename: path to the utf-8 input file.
    :return: dict with 'parsed text' (nested-list form of the jagged
        array) and 'links' (whatever RambamSegment.extract_links returns).
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()
    jagged_array = JaggedArray([[[]]])
    segment = RambamSegment()
    for line in lines:
        # classify each line; lines that are neither quote nor text are ignored
        if segment.is_quote(line):
            segment.add_raw_quote(line)
        elif segment.is_text(line):
            segment.add_text(line)
        # NOTE(review): add_segment appears to run once per line, letting the
        # segment object decide when a completed segment is flushed into the
        # array -- confirm placement against the original formatting
        segment.add_segment(jagged_array)
    return {'parsed text': jagged_array.array(), 'links': segment.extract_links()}
def restructure_text():
    """Reshape the flat Derech Chaim chapter text into a
    chapter -> mishnah -> comment jagged array.

    A line of the form 'mishnah <1-2 Hebrew letters>' marks the start of a
    new mishnah (the marker line itself is dropped); every other line is a
    comment appended under the current mishnah.

    :return: nested-list form of the populated JaggedArray.
    """
    with open('Derech Chaim text.json') as fp:
        data = json.load(fp)
    chapters = data['text'][u'']
    marker = re.compile(u'^\u05de\u05e9\u05e0\u05d4 ([\u05d0-\u05ea]{1,2})$')
    result = JaggedArray([[[]]])
    for chap_index, chapter in enumerate(chapters):
        mishnah_index, comment_index = 0, 0
        for line in chapter:
            found = marker.search(line)
            if found is None:
                # ordinary comment line
                result.set_element(
                    [chap_index, mishnah_index, comment_index], line, pad=[])
                comment_index += 1
            else:
                marked = getGematria(found.group(1)) - 1
                # advance only forward: lets intro text precede the first
                # mishnah mark
                if marked > mishnah_index:
                    mishnah_index = marked
                    comment_index = 0
    return result.array()
def parse_unlinked(self):
    """Parse comments lacking explicit links into a 3-deep jagged array.

    Places each phrase at [chapter-1][verse-1][order], where order counts
    prior comments on the same (chapter, verse), and logs every placement
    in the module-level unlinkedCommentStore.

    :raises AttributeError: if a phrase carries no subchap label.
    :return: nested-list form of the populated JaggedArray.
    """
    ja = JaggedArray([[[]]])
    tally = Counter()
    for chapter in self.get_chapter():
        chap = chapter.num
        for phrase in chapter.get_phrase():
            verse = phrase.subchap
            if verse is None:
                raise AttributeError(u'Unlabeled phrase in {} chapter {}'.format(self.get_author(), chap))
            key = (chap, verse)
            order = tally[key]
            ja.set_element([int(chap) - 1, int(verse) - 1, order], phrase.as_string())
            tally[key] += 1
            unlinkedCommentStore.append({
                'commentator': commentatorNames[self.get_author()],
                'chapter': chap,
                'verse': verse,
                'order': str(order + 1)
            })
    return ja.array()
def parse_shokets(): with open('chesed_le-avraham.htm') as infile: soup = BeautifulSoup(infile, 'html.parser') raw_shokets = soup.find('div', class_='shokets').text.splitlines() raw_shokets = filter(lambda x: x if len(x) > 0 else None, raw_shokets) pattern = ur'(\u05d4\u05e9\u05d5?\u05e7\u05ea [\u05d0-\u05ea]{1,2})( - (.*))?:' parsed = JaggedArray([[]]) shoket, paragraph = -1, -1 for line in raw_shokets: new_section = re.search(pattern, line) if new_section is None: if shoket >= 0: paragraph += 1 parsed.set_element([shoket, paragraph], line) else: shoket += 1 paragraph = -1 if new_section.group(3) is not None: paragraph += 1 parsed.set_element([shoket, paragraph], u'<b>{}</b>'.format(new_section.group(3))) return parsed.array()
def restructure_text():
    """Reshape the flat Derech Chaim chapter text into a
    chapter -> mishnah -> comment jagged array; mishnah-marker lines
    select the target mishnah and are themselves dropped.

    :return: nested-list form of the populated JaggedArray.
    """
    with open('Derech Chaim text.json') as infile:
        version = json.load(infile)
    text_chapters = version['text'][u'']
    mishnah_re = re.compile(u'^\u05de\u05e9\u05e0\u05d4 ([\u05d0-\u05ea]{1,2})$')
    ja = JaggedArray([[[]]])
    for c, chapter_lines in enumerate(text_chapters):
        m = 0
        k = 0
        for segment in chapter_lines:
            hit = mishnah_re.search(segment)
            if hit:
                new_m = getGematria(hit.group(1)) - 1
                # forward-only advance allows intro text to appear before
                # the first mishnah mark
                if new_m > m:
                    m, k = new_m, 0
            else:
                ja.set_element([c, m, k], segment, pad=[])
                k += 1
    return ja.array()
def jaggedarray_from_files(input_file, footnote_file):
    """Build a chapter/verse/comment jagged array of Philo text from the
    main text file and its footnote file, collecting
    (Bereshit ref, Philo ref) link pairs along the way.

    :param input_file: Main text file to parse (utf-8, @-tagged lines).
    :param footnote_file: Footnote text file to parse (utf-8).
    :return: (3D nested list of text, list of (bereshit_ref, philo_ref)
        string tuples).
    """
    ja = JaggedArray([[]])
    # module-level state, presumably read by footnotify()/
    # repeat_footnotify() -- TODO confirm which helpers use these globals
    global footnotes
    global footnotes_parasha
    global link_refs
    link_refs = []
    current = []            # comments accumulated for the current verse
    list_of_currents = []   # one entry per verse seen since the last @00
    footnotes = []
    footnotes_parasha = {}
    links = []
    # read all footnote lines up front; consumed lazily via the iterator
    text = codecs.open(footnote_file, 'r', 'utf-8')
    for line in text:
        footnotes.append(cleanup(line))
    text.close()
    footnotes = iter(footnotes)
    main_text = codecs.open(input_file, 'r', 'utf-8')
    for line in main_text:
        if line.startswith('@22'):  # verse header carrying a citation
            # flush the previous verse's comments (the "while" runs at
            # most once and acts as an "if")
            while current:
                list_of_currents.append(current)
                current = []
            # capture "<chapter>, <verse>" written in Hebrew numerals
            m = re.search(u'([\u05d0-\u05ea]{1,2}-?[\u05d0-\u05ea]{0,2}), ([\u05d0-\u05ea]{1,2}-?[\u05d0-\u05ea]{0,2})', line)
            # if with semicolon, choose first pasuk ignore second
            location = Ref(u"".join([u"בראשית ", m.group(1), u": ", m.group(2)]))
            link_refs.append(location)
            current.append(footnotify(u''.join([u"<strong>", cleanup(line), u"</strong>"])))
        elif line.startswith('@88'):  # footnote body: append to previous comment
            current[-1] += u''.join([u"<sup>*</sup><i class='footnote'>", cleanup(line), u"</i>", "<br>___________<br>"])
        elif line.startswith('@11') or line.startswith('@33'):  # plain comment text
            current.append(cleanup(footnotify(line)))
        elif line.startswith('@00'):  # parasha boundary: emit everything collected
            # TODO: move "line is None" handling to its own condition
            while current:  # flush the final verse ("while" acts as "if")
                list_of_currents.append(current)
                current = []
            # NOTE(review): this while iterates exactly once --
            # list_of_currents is emptied at the bottom of the body
            while list_of_currents:
                for x in list_of_currents:
                    # NOTE(review): index() is an O(n) lookup and would
                    # misbehave on duplicate entries; enumerate is safer
                    i = list_of_currents.index(x)
                    location = [link_refs[i].sections[0] - 1, link_refs[i].sections[1] - 1]
                    # if they start on same verse, append array to previous array
                    if link_refs[i].sections[0] == link_refs[i - 1].sections[0] and link_refs[i].sections[1] == link_refs[i - 1].sections[1]:
                        bereshit_ref = link_refs[i].normal()
                        # the comment range continues after whatever is
                        # already stored at this location:
                        # first = len(existing) + 1, last = len(existing) + len(x)
                        philo_ref = "".join(["The Midrash of Philo ", str(location[0] + 1), ":", str(location[1] + 1), ":", str(len(ja.get_element([location[0], location[1]]))+1), "-", str(len(ja.get_element([location[0], location[1]]))+len(x))])
                        links.append((bereshit_ref, philo_ref))
                        ja.get_element([location[0], location[1]]).extend(repeat_footnotify(x))
                    else:
                        bereshit_ref = link_refs[i].normal()
                        philo_ref = "".join(["The Midrash of Philo ", str(location[0] + 1), ":", str(location[1] + 1), ":1-", str(len(x))])
                        links.append((bereshit_ref, philo_ref))
                        ja.set_element([location[0], location[1]], repeat_footnotify(x), pad = [])
                # reset per-parasha state before the next section
                footnotes_parasha.clear()
                current = []
                link_refs = []
                list_of_currents = []
    main_text.close()
    #util.ja_to_xml(ja.array(), ['Chapter', 'Verse','Comment'])
    return ja.array(), links