def parse_unlinked(self):
    """Build a chapter/verse/comment JaggedArray from self-labeled phrases.

    Each phrase is placed at [chapter-1, verse-1, n], where n is the
    count of comments already placed at that (chapter, verse); every
    placement is also recorded in the module-level unlinkedCommentStore.

    :raises AttributeError: if a phrase carries no verse (subchap) label.
    :return: nested-list form of the parsed text.
    """
    result = JaggedArray([[[]]])
    seen = Counter()  # (chapter, verse) -> comments placed so far
    for chap in self.get_chapter():
        chap_id = chap.num
        for phrase in chap.get_phrase():
            verse_id = phrase.subchap
            if verse_id is None:
                raise AttributeError(
                    u'Unlabeled phrase in {} chapter {}'.format(
                        self.get_author(), chap_id))
            key = (chap_id, verse_id)
            position = seen[key]
            result.set_element(
                [int(chap_id) - 1, int(verse_id) - 1, position],
                phrase.as_string())
            seen[key] += 1
            unlinkedCommentStore.append({
                'commentator': commentatorNames[self.get_author()],
                'chapter': chap_id,
                'verse': verse_id,
                'order': str(position + 1)
            })
    return result.array()
def parse_en(filename):
    """Parse an English translation file (Rabbi Monk's format) into a 4-deep JaggedArray.

    Lines opening with a "C,V." citation start a new chapter/verse; lines
    containing an '@'-marked Hebrew dibbur-hamatchil start a new comment
    within the current verse. Accumulated text is joined and stored when
    either marker is met.

    NOTE(review): no return statement is visible in this view -- the
    function appears truncated (presumably it should return ja.array()).
    """
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    ja = JaggedArray([[[[]]]])
    placing = u'(\s*[0-9]{1,2}),([0-9]{1,2})-?[0-9]*\.'  # the regex to find the indexing on Monk
    # q1, q2 = ur'“', ur'”'  # Rabbi Monk uses these to enclose translation of a pasuk
    # dh_reg = ur'([\u05d0 - \u05ea]*), *({}.*?{})'.format(q1, q2)
    replace_dict = {placing: u'', u'@': ''}  # markers are stripped from stored text
    temp = []
    indices = [0] * 3  # [chapter, verse, comment], all zero-based
    for line in lines:
        pasuk_dh = re.match(placing, line)   # chapter,verse citation at line start?
        reg_dh = re.search(ur'@([\u05d0-\u05ea|\\s]*)', line)  # @-marked Hebrew heading?
        # reg_dh = re.search(ur'([\u05d0-\u05ea]+, *“.*?”)',line)
        line = multiple_replace(line, replace_dict, using_regex=True)
        if pasuk_dh or reg_dh:
            # a new marker: commit whatever text accumulated for the previous slot
            temp = ' '.join(temp)
            ja.set_element(indices, temp, [])
            temp = []
            if pasuk_dh:
                # jump to the cited chapter/verse; comment counter restarts
                indices = [
                    int(pasuk_dh.group(1)) - 1,
                    int(pasuk_dh.group(2)) - 1, indices[2]
                ]
                indices[2] = 0
            elif reg_dh:
                # same verse, next comment
                indices[2] += 1
        if not line.isspace() and not re.match(
                ur' *Parshat *(\S+) *(\S+)? *', line):
            # don't put into array names of Parasha or empty lines
            temp.append(line)
def parse():
    """Parse pardes_rimonim.html into a gate x chapter JaggedArray.

    Bold "sha'ar X perek Y" headers (Hebrew numerals) open a new
    gate/chapter; the text accumulated for the previous one is committed
    when the next header appears.

    NOTE(review): the body appears truncated in this view -- the branch
    that accumulates ordinary text lines and the final commit/return are
    not visible.
    """
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    # header: "<b><sha'ar> <letters> <perek> <letters>"; both groups are Hebrew numerals
    beginning = re.compile(ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})')
    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                if re.search(ur'^\u05e4\u05e8\u05e7', whole_text[0]):
                    # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            new_gate, new_chapter = getGematria(match.group(1))-1, getGematria(match.group(2))-1
            # warn when the numbering jumps by more than one position
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(new_gate+1, new_chapter+1)
            gate, chapter = new_gate, new_chapter
        elif found_beginning:
            if re.search(ur'<img', line):
                # fold an inline image into the previous text line
                whole_text[-1] = add_image(line, whole_text[-1])
                continue
def parse_linked(self):
    """Place each linked phrase's comment at its stored location.

    Looks up the (chapter, verse, order) recorded for every phrase in
    the module-level commentStore and drops the newline-stripped comment
    text at the corresponding zero-based position.

    :return: nested-list form of the parsed comments.
    """
    result = JaggedArray([[[]]])
    for phrase in self.get_phrase():
        location = commentStore[phrase.id]
        address = [location['chapter'] - 1,
                   location['verse'] - 1,
                   location['order'] - 1]
        comment_text = phrase.get_comment().valueOf_.replace(u'\n', u'')
        result.set_element(address, comment_text)
    return result.array()
def file_to_ja_g(depth, infile, expressions, cleaner, grab_all=False):
    """
    Designed to be the first stage of a reusable parsing tool. Adds lines
    of text to the Jagged Array in the desired structure (Chapter, verse, etc.)

    This function is a modulation of the original file_to_ja: it handles
    gematria letters so that chapters and segments land in the correct
    positions according to the Hebrew letter numbering, padding skipped
    positions as needed (_g stands for Gematria).

    :param depth: depth of the JaggedArray.
    :param infile: Text file to read from.
    :param expressions: A list of regular expressions with which to identify
      section (chapter) level. Do not include an expression with which to
      break up the segment levels. Each must expose a named group 'gim'
      holding the Hebrew numeral.
    :param cleaner: A function that takes a list of strings and returns an
      array with the text parsed correctly. Should also break up and remove
      unnecessary tagging data.
    :param grab_all: If set to true, will grab the lines indicating new sections.
    :return: A jagged_array with the text properly structured.
    :raises AttributeError: if len(expressions) != depth - 1.
    """
    ja = JaggedArray([])
    # ensure there is a regex for every level except the lowest
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth - 1, len(expressions)))
    # compile regexes, instantiate index list (-1 = level not yet seen)
    regexes = [re.compile(ex) for ex in expressions]
    indices = [-1] * len(expressions)
    temp = []
    # loop through file
    for line in infile:
        # check for matches to the regexes
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if found:
                # commit the accumulated segment once a full address
                # (no -1 left at any level) has been established
                if indices.count(-1) == 0:
                    ja.set_element(indices, cleaner(temp), [])
                    temp = []
                if grab_all:
                    temp.append(line)
                gimt = getGematria(found.group('gim'))
                if gimt != 0:
                    indices[i] = gimt - 1   # jump to the labeled position
                else:
                    indices[i] += 1         # unlabeled: advance sequentially
                # reset every deeper level that was already in use
                indices[i+1:] = [-1 if x >= 0 else x for x in indices[i+1:]]
                break
        else:
            # no section marker on this line: accumulate text, but only
            # once the first complete address has been seen
            if indices.count(-1) == 0:
                temp.append(line)
    # FIX: flush the final segment only if at least one complete address
    # was ever established; previously this ran unconditionally and could
    # index the array with -1 on a file containing no section markers.
    if indices.count(-1) == 0:
        ja.set_element(indices, cleaner(temp), [])
    return ja
def parse_linked(self): parsed = JaggedArray([[[]]]) for phrase in self.get_phrase(): indices = (commentStore[phrase.id]['chapter'], commentStore[phrase.id]['verse'], commentStore[phrase.id]['order']) text = phrase.get_comment().valueOf_.replace(u'\n', u' ') text = re.sub(u' +', u' ', text) text = re.sub(ur' (:|\.)', ur'\1', text) parsed.set_element([i - 1 for i in indices], text) return parsed.array()
def parse(filename):
    """Collect <ftnote> elements of an XML file into a 2-D jagged array.

    Each footnote whose id appears in the comment store is serialized,
    structured, and placed at [chapter-1, verse-1]; footnotes with no
    recorded location are skipped.

    :param filename: path of the XML file to parse.
    :return: nested-list form of the structured footnotes.
    """
    locations = populate_comment_store(filename)
    result = JaggedArray([[]])
    with open(filename) as fp:
        soup = BeautifulSoup(fp, 'xml')
    for note in soup.find_all('ftnote'):
        place = locations.get(note.attrs['id'])
        if place is not None:
            raw = u''.join(unicode(kid) for kid in note.children)
            result.set_element([place['chapter'] - 1, place['verse'] - 1],
                               structure_comments(raw), pad=[])
    return result.array()
def parse_file(filename):
    """Feed a file through a RambamSegment and return text plus links.

    Quote lines and text lines are routed to the segment's respective
    handlers; the segment is responsible for committing itself into the
    jagged array.

    NOTE(review): original formatting was lost -- add_segment is read
    here as running once per input line (with the segment deciding when
    a boundary occurred); confirm against RambamSegment's contract, as
    a single post-loop flush is also a plausible reading.
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()
    jagged_array = JaggedArray([[[]]])
    segment = RambamSegment()
    for line in lines:
        if segment.is_quote(line):
            segment.add_raw_quote(line)
        elif segment.is_text(line):
            segment.add_text(line)
        segment.add_segment(jagged_array)
    return {'parsed text': jagged_array.array(),
            'links': segment.extract_links()}
def parse(filename):
    """Parse a file of alternating reference / text lines into a jagged array.

    Odd-numbered lines (1st, 3rd, ...) carry a reference converted to
    array indices; the following even-numbered line carries the text,
    whose first whitespace-separated token is dropped before storage.

    :param filename: path of the UTF-8 source file.
    :return: 3-deep nested list of the parsed text.
    """
    with codecs.open(filename, 'r', 'utf-8') as fp:
        all_lines = fp.readlines()
    result = JaggedArray([[[]]])
    position = None
    for offset, current in enumerate(all_lines):
        if offset % 2 == 0:
            # reference line: remember where the next text line belongs
            position = ref_to_indices(current)
        else:
            # text line: drop the leading token, store the remainder
            body = u' '.join(current.split(u' ')[1:])
            result.set_element(position, body)
    return result.array()
def parse(): with codecs.open("hebrew_or_neerav.html", "r", "windows-1255") as infile: lines = infile.readlines() gate, chapter, whole_text = -1, -1, [] root = JaggedArray([[]]) found_beginning = False next_line_subject = False subjects = [] main_pattern = re.compile(ur"^<b>חלק ([\u05d0-\u05ea]{1,2}) פרק ([\u05d0-\u05ea]{1,2})") for index, line in enumerate(lines): line = line.replace("(", "(<sub>") line = line.replace(")", "</sub>)") if next_line_subject == True: subjects.append(line) next_line_subject = False continue if line.find(u"חלק שביעי חלק הכינויים א") >= 0: return dealWithEnd(lines[index + 1], lines[index + 2 :], root, subjects) main_match = main_pattern.search(line) if main_match: if found_beginning: root.set_element([gate, chapter], whole_text, pad=[]) whole_text = [] else: found_beginning = True new_gate, new_chapter = getGematria(main_match.group(1)) - 1, getGematria(main_match.group(2)) - 1 if new_gate - gate > 1 or new_chapter - chapter > 1: print "skip found at Gate {} Chapter {}".format(new_gate + 1, new_chapter + 1) gate, chapter = new_gate, new_chapter elif found_beginning: if len(line.split(" ")) == 2 and line.find(u"חלק") >= 0: next_line_subject = True continue if len(line.split(" ")) == 2 and line.find(u"פרק") >= 0: continue line = bleach.clean(line, tags=[], strip=True) if line.isspace(): continue line = re.sub(u"(\n|\r)", u"", line) whole_text.append(line) else: continue else: root.set_element([gate, chapter], whole_text)
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot): ja_smk = JaggedArray(ja_smk) ja_raph = JaggedArray(ja_raph) ja_hagahot = JaggedArray(ja_hagahot) # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())): dict_lst = [] dict = {u'siman': [], u'smk': [], u'raph': []} for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())): # print numToHeb(i+1) dict['siman'] = numToHeb(i + 1) for i, smk_line in enumerate(seg[0]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line) if hag_lett: dict['smk'].extend([(hag_l, i + 1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] # print 'RAPH' for i, raph_line in enumerate(seg[1]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line) if hag_lett: dict['raph'].extend([(hag_l, i + 1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] dict_lst.append(dict) dict = {u'siman': [], u'smk': [], u'raph': []} return dict_lst
def hagahot_parse(ja_hagahot, hagahot_dict_lst):
    """Slice the flattened hagahot text into per-siman groups.

    Uses the per-siman marker counts gathered by hagahot_alignment to
    carve consecutive runs out of the flattened hagahot array; whatever
    remains after the last siman is appended as a final group. The
    result is also dumped to 'haghot_by_smk_simanim.xml'.

    :param ja_hagahot: nested list of hagahot text.
    :param hagahot_dict_lst: list of per-siman dicts with 'smk'/'raph'
      marker lists (see hagahot_alignment).
    :return: list of lists of hagahot entries, one inner list per siman.
    """
    def num_haghot_in_siman(siman_dict):
        # total markers recorded for this siman across both sources
        return len(siman_dict['smk']) + len(siman_dict['raph'])

    flat = JaggedArray(ja_hagahot).flatten_to_array()
    hg_ja = []
    cursor = 0
    # FIX: renamed the loop variable `dict` -> `siman_record`; it was
    # shadowing the builtin.
    for siman_record in hagahot_dict_lst:
        # skip a structural marker line (starts with '@' but not '@1')
        if re.search(u"^@[^1]", flat[cursor]):
            cursor += 1
        end = cursor + num_haghot_in_siman(siman_record)
        hg_ja.append(flat[cursor:end])
        cursor = end
    hg_ja.append(flat[cursor:])  # remainder after the last siman
    ja_to_xml(hg_ja, ['siman', 'letter'], 'haghot_by_smk_simanim.xml')
    return hg_ja
def restructure_text():
    """Regroup the flat Derech Chaim chapters into chapter/mishnah/comment.

    Lines matching the "mishnah <letter>" header advance the mishnah
    pointer (never backwards, so introductory text may sit before the
    first marker); every other line becomes the next comment of the
    current mishnah.

    :return: 3-deep nested list of the restructured text.
    """
    with open('Derech Chaim text.json') as fp:
        version = json.load(fp)
    chapters = version['text'][u'']
    header = re.compile(u'^\u05de\u05e9\u05e0\u05d4 ([\u05d0-\u05ea]{1,2})$')
    result = JaggedArray([[[]]])
    for chap_num, chap_lines in enumerate(chapters):
        mishnah_idx, comment_idx = 0, 0
        for text_line in chap_lines:
            header_match = header.search(text_line)
            if header_match is not None:
                marker = getGematria(header_match.group(1)) - 1
                # only move forward: intro text may precede the first header
                if marker > mishnah_idx:
                    mishnah_idx, comment_idx = marker, 0
            else:
                # ordinary comment line
                result.set_element([chap_num, mishnah_idx, comment_idx],
                                   text_line, pad=[])
                comment_idx += 1
    return result.array()
def parse():
    """Parse pardes_rimonim.html into a gate x chapter JaggedArray.

    Bold "sha'ar X perek Y" headers (Hebrew numerals) open a new
    gate/chapter; the previously accumulated text is committed when the
    next header appears.

    NOTE(review): the body appears truncated in this view -- the branch
    accumulating ordinary text lines and the final commit/return are
    not visible.
    """
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    # header pattern; both groups are Hebrew numerals
    beginning = re.compile(
        ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})'
    )
    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                if re.search(ur'^\u05e4\u05e8\u05e7', whole_text[0]):
                    # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            new_gate, new_chapter = getGematria(
                match.group(1)) - 1, getGematria(match.group(2)) - 1
            # warn when numbering skips more than one position
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(
                    new_gate + 1, new_chapter + 1)
            gate, chapter = new_gate, new_chapter
        elif found_beginning:
            if re.search(ur'<img', line):
                # fold an inline image into the previous text line
                whole_text[-1] = add_image(line, whole_text[-1])
                continue
def parse_unlinked(self):
    """Parse phrases that carry their own chapter/verse labels.

    Each phrase is stored at [chapter-1, verse-1, n], where n counts how
    many comments already landed on that (chapter, verse); the placement
    is also recorded in the module-level unlinkedCommentStore.

    :raises AttributeError: if a phrase has no subchap (verse) label.
    :return: nested-list form of the parsed text.
    """
    parsed = JaggedArray([[[]]])
    comment_counter = Counter()  # (chapter, verse) -> comments placed so far
    for chapter in self.get_chapter():
        chap_num = chapter.num
        for phrase in chapter.get_phrase():
            phrase_num = phrase.subchap
            if phrase_num is None:
                raise AttributeError(u'Unlabeled phrase in {} chapter {}'.format(self.get_author(), chap_num))
            comment_number = comment_counter[(chap_num, phrase_num)]
            parsed.set_element([int(chap_num)-1, int(phrase_num)-1, comment_number], phrase.as_string())
            comment_counter[(chap_num, phrase_num)] += 1
            # record the placement so links can be built later
            unlinkedCommentStore.append({
                'commentator': commentatorNames[self.get_author()],
                'chapter': chap_num,
                'verse': phrase_num,
                'order': str(comment_number+1)
            })
    return parsed.array()
def restructure_text():
    """Regroup the flat Derech Chaim text into chapter/mishnah/comment depth.

    A line of the form "mishnah <hebrew letter>" advances the mishnah
    pointer; all other lines are stored as successive comments under the
    current mishnah.

    :return: 3-deep nested list of the restructured text.
    """
    with open('Derech Chaim text.json') as infile:
        version = json.load(infile)
    my_text = version['text'][u'']  # the version stores its text under an empty-string key
    pattern = re.compile(u'^\u05de\u05e9\u05e0\u05d4 ([\u05d0-\u05ea]{1,2})$')  # "mishnah <letter>"
    parsed = JaggedArray([[[]]])
    for chap_index, chapter in enumerate(my_text):
        current_mishnah, current_comment = 0, 0
        for line in chapter:
            match = pattern.search(line)
            if match is None:
                # This is a regular comment
                parsed.set_element(
                    [chap_index, current_mishnah, current_comment], line,
                    pad=[])
                current_comment += 1
            else:
                m_value = getGematria(match.group(1)) - 1
                if m_value > current_mishnah:
                    # This condition allows for intro text to appear before first mishnah mark
                    current_mishnah = m_value
                    current_comment = 0
    return parsed.array()
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot): ja_smk = JaggedArray(ja_smk) ja_raph = JaggedArray(ja_raph) ja_hagahot = JaggedArray(ja_hagahot) # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())): dict_lst = [] dict = {u'siman':[], u'smk':[], u'raph':[]} for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())): # print numToHeb(i+1) dict['siman'] = numToHeb(i+1) for i, smk_line in enumerate(seg[0]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line) if hag_lett: dict['smk'].extend([(hag_l, i+1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] # print 'RAPH' for i, raph_line in enumerate(seg[1]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line) if hag_lett: dict['raph'].extend([(hag_l, i+1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] dict_lst.append(dict) dict = {u'siman': [], u'smk': [], u'raph': []} return dict_lst
def parser(name):
    """Parse {name}.txt, tracking chapter/mishnah/comment positions.

    '@00<perek>' lines open a new chapter; '@22<letters>' lines set the
    mishnah from the gematria of the letters; both reset the comment
    counter.

    NOTE(review): the body appears truncated in this view -- the branch
    storing ordinary comment lines (and the return of parsed_text /
    links) is not visible.
    """
    with codecs.open('{}.txt'.format(name), 'r', 'utf-8') as infile:
        lines = infile.readlines()
    parsed_text = JaggedArray([[[]]])
    links = []
    chapter, mishnah, comment = -1, -1, -1
    for line in lines:
        if re.match(ur'@00\u05e4\u05e8\u05e7', line) is not None:
            # new chapter marker ("@00<perek>")
            chapter += 1
            comment = -1
            continue
        elif re.match(ur'@22', line) is not None:
            # mishnah marker: the Hebrew numeral after '@22' gives the index
            mishnah = getGematria(
                re.match(ur'@22([\u05d0-\u05ea]{1,2})', line).group(1)) - 1
            comment = -1
            continue
def parse(): with codecs.open('hebrew_or_neerav.html', 'r', 'windows-1255') as infile: lines = infile.readlines() gate, chapter, whole_text = -1, -1, [] root = JaggedArray([[]]) found_beginning = False next_line_subject = False subjects = [] main_pattern = re.compile(ur'^<b>חלק ([\u05d0-\u05ea]{1,2}) פרק ([\u05d0-\u05ea]{1,2})') for index, line in enumerate(lines): line = line.replace("(", "(<sub>") line = line.replace(")", "</sub>)") if next_line_subject == True: subjects.append(line) next_line_subject = False continue if line.find(u"חלק שביעי חלק הכינויים א") >= 0: return dealWithEnd(lines[index+1], lines[index+2:], root, subjects) main_match = main_pattern.search(line) if main_match: if found_beginning: root.set_element([gate, chapter], whole_text, pad=[]) whole_text = [] else: found_beginning = True new_gate, new_chapter = getGematria(main_match.group(1))-1, getGematria(main_match.group(2))-1 if new_gate - gate > 1 or new_chapter - chapter > 1: print 'skip found at Gate {} Chapter {}'.format(new_gate+1, new_chapter+1) gate, chapter = new_gate, new_chapter elif found_beginning: if len(line.split(" ")) == 2 and line.find(u"חלק") >= 0: next_line_subject = True continue if len(line.split(" ")) == 2 and line.find(u"פרק") >= 0: continue line = bleach.clean(line, tags=[], strip=True) if line.isspace(): continue line = re.sub(u'(\n|\r)', u'', line) whole_text.append(line) else: continue else: root.set_element([gate, chapter], whole_text)
def parse_shokets(): with open('chesed_le-avraham.htm') as infile: soup = BeautifulSoup(infile, 'html.parser') raw_shokets = soup.find('div', class_='shokets').text.splitlines() raw_shokets = filter(lambda x: x if len(x) > 0 else None, raw_shokets) pattern = ur'(\u05d4\u05e9\u05d5?\u05e7\u05ea [\u05d0-\u05ea]{1,2})( - (.*))?:' parsed = JaggedArray([[]]) shoket, paragraph = -1, -1 for line in raw_shokets: new_section = re.search(pattern, line) if new_section is None: if shoket >= 0: paragraph += 1 parsed.set_element([shoket, paragraph], line) else: shoket += 1 paragraph = -1 if new_section.group(3) is not None: paragraph += 1 parsed.set_element([shoket, paragraph], u'<b>{}</b>'.format(new_section.group(3))) return parsed.array()
def jaggedarray_from_files(input_file, footnote_file):
    """
    :param input_file: Main text file to parse
    :param footnote_file: Footnote text file to parse
    :return: A 3D jaggedArray of text from files, plus a list of
      (bereshit_ref, philo_ref) link tuples.

    NOTE(review): original formatting was lost; the nesting of the
    flush block under '@00' (the `while list_of_currents:` section) is
    reconstructed here on a best-effort basis -- confirm against the
    upstream repository.
    """
    ja = JaggedArray([[]])
    # module-level state shared with the cleanup/footnotify helpers
    global footnotes
    global footnotes_parasha
    global link_refs
    link_refs = []
    current = []
    list_of_currents = []
    footnotes = []
    footnotes_parasha = {}
    links = []
    # load every footnote line, cleaned, then expose them as an iterator
    text = codecs.open(footnote_file, 'r', 'utf-8')
    for line in text:
        footnotes.append(cleanup(line))
    text.close()
    footnotes = iter(footnotes)
    main_text = codecs.open(input_file, 'r', 'utf-8')
    for line in main_text:
        if line.startswith('@22'):
            # new verse header: flush the segment in progress (the while
            # runs at most once -- it empties `current` immediately)
            while current:
                list_of_currents.append(current)
                current = []
            m = re.search(u'([\u05d0-\u05ea]{1,2}-?[\u05d0-\u05ea]{0,2}), ([\u05d0-\u05ea]{1,2}-?[\u05d0-\u05ea]{0,2})', line)
            # if with semicolon, choose first pasuk ignore second
            location = Ref(u"".join([u"בראשית ", m.group(1), u": ", m.group(2)]))
            link_refs.append(location)
            current.append(footnotify(u''.join([u"<strong>", cleanup(line), u"</strong>"])))
        elif line.startswith('@88'):
            # footnote body: attach to the most recent text line
            current[-1] += u''.join([u"<sup>*</sup><i class='footnote'>", cleanup(line), u"</i>", "<br>___________<br>"])
        elif line.startswith('@11') or line.startswith('@33'):
            # ordinary text line
            current.append(cleanup(footnotify(line)))
        elif line.startswith('@00'):
            # move line is None to own condition
            # parasha boundary: flush everything collected so far
            while current:
                list_of_currents.append(current)
                current = []
            while list_of_currents:
                for x in list_of_currents:
                    i = list_of_currents.index(x)
                    location = [link_refs[i].sections[0] - 1, link_refs[i].sections[1] - 1]
                    # if they start on same verse, append array to previous array
                    if link_refs[i].sections[0] == link_refs[i - 1].sections[0] and link_refs[i].sections[1] == link_refs[i - 1].sections[1]:
                        bereshit_ref = link_refs[i].normal()
                        philo_ref = "".join(["The Midrash of Philo ", str(location[0] + 1), ":", str(location[1] + 1), ":", str(len(ja.get_element([location[0], location[1]]))+1), "-", str(len(ja.get_element([location[0], location[1]]))+len(x))])
                        # above line: base first on last number of element len(ja.get_element([location[0], location[1]]))
                        links.append((bereshit_ref, philo_ref))
                        ja.get_element([location[0], location[1]]).extend(repeat_footnotify(x))
                    else:
                        bereshit_ref = link_refs[i].normal()
                        philo_ref = "".join(["The Midrash of Philo ", str(location[0] + 1), ":", str(location[1] + 1), ":1-", str(len(x))])
                        links.append((bereshit_ref, philo_ref))
                        ja.set_element([location[0], location[1]], repeat_footnotify(x), pad = [])
                # reset per-parasha state; clearing list_of_currents also
                # terminates the enclosing while after one pass
                footnotes_parasha.clear()
                current = []
                link_refs = []
                list_of_currents = []
    main_text.close()
    # util.ja_to_xml(ja.array(), ['Chapter', 'Verse','Comment'])
    return ja.array(), links