Ejemplo n.º 1
0
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot):
    ja_smk = JaggedArray(ja_smk)
    ja_raph = JaggedArray(ja_raph)
    ja_hagahot = JaggedArray(ja_hagahot)
    # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())):
    dict_lst = []
    dict = {u'siman': [], u'smk': [], u'raph': []}
    for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())):
        # print numToHeb(i+1)
        dict['siman'] = numToHeb(i + 1)
        for i, smk_line in enumerate(seg[0]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)',
                                  smk_line)
            if hag_lett:
                dict['smk'].extend([(hag_l, i + 1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        # print 'RAPH'
        for i, raph_line in enumerate(seg[1]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)',
                                  raph_line)
            if hag_lett:
                dict['raph'].extend([(hag_l, i + 1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        dict_lst.append(dict)
        dict = {u'siman': [], u'smk': [], u'raph': []}
    return dict_lst
Ejemplo n.º 2
0
    def parse_unlinked(self):
        """Collect every phrase of every chapter into a 3-level jagged array.

        Each phrase is stored at ``[chapter - 1, subchap - 1, k]`` where ``k``
        counts the phrases already seen for that (chapter, subchap) pair.  A
        record describing the phrase is also appended to the module-level
        ``unlinkedCommentStore``.

        :return: the plain nested-list form of the jagged array
        :raises AttributeError: if a phrase has no ``subchap`` value
        """
        result = JaggedArray([[[]]])
        seen = Counter()

        for chapter in self.get_chapter():
            chapter_label = chapter.num
            for phrase in chapter.get_phrase():
                verse_label = phrase.subchap
                if verse_label is None:
                    message = u'Unlabeled phrase in {} chapter {}'.format(
                        self.get_author(), chapter_label)
                    raise AttributeError(message)

                # Counter returns 0 for a pair that has not been seen yet.
                slot = (chapter_label, verse_label)
                position = seen[slot]
                address = [int(chapter_label) - 1, int(verse_label) - 1, position]
                result.set_element(address, phrase.as_string())
                seen[slot] += 1

                record = {
                    'commentator': commentatorNames[self.get_author()],
                    'chapter': chapter_label,
                    'verse': verse_label,
                    # Stored 1-based and as a string.
                    'order': str(position + 1),
                }
                unlinkedCommentStore.append(record)
        return result.array()
Ejemplo n.º 3
0
 def parse_linked(self):
     """Place each phrase's comment text at the position recorded in ``commentStore``.

     For every phrase, ``commentStore[phrase.id]`` supplies ``'chapter'``,
     ``'verse'`` and ``'order'``; each is reduced by one to form the indices.
     Newlines are removed from the comment text.

     :return: the plain nested-list form of the jagged array
     """
     result = JaggedArray([[[]]])
     for phrase in self.get_phrase():
         entry = commentStore[phrase.id]
         position = (entry['chapter'], entry['verse'], entry['order'])
         comment_text = phrase.get_comment().valueOf_.replace(u'\n', u'')
         # assumes the stored values are 1-based numbers -- TODO confirm
         zero_based = [component - 1 for component in position]
         result.set_element(zero_based, comment_text)
     return result.array()
Ejemplo n.º 4
0
 def parse_linked(self):
     parsed = JaggedArray([[[]]])
     for phrase in self.get_phrase():
         indices = (commentStore[phrase.id]['chapter'],
                    commentStore[phrase.id]['verse'],
                    commentStore[phrase.id]['order'])
         text = phrase.get_comment().valueOf_.replace(u'\n', u' ')
         text = re.sub(u' +', u' ', text)
         text = re.sub(ur' (:|\.)', ur'\1', text)
         parsed.set_element([i - 1 for i in indices], text)
     return parsed.array()
Ejemplo n.º 5
0
def parse(filename):
    """Build a 2-level jagged array of structured footnotes from an XML file.

    ``populate_comment_store(filename)`` provides a mapping from footnote id
    to a location with ``'chapter'`` and ``'verse'`` entries.  Every ``ftnote``
    element whose id is in that mapping has its children serialised, passed
    through ``structure_comments`` and stored at ``[chapter - 1, verse - 1]``.
    Footnotes with an unknown id are skipped.

    :param filename: path of the XML file to read
    :return: the plain nested-list form of the jagged array
    """
    locations = populate_comment_store(filename)
    result = JaggedArray([[]])

    with open(filename) as xml_file:
        document = BeautifulSoup(xml_file, 'xml')

    for note in document.find_all('ftnote'):
        position = locations.get(note.attrs['id'])
        if position is not None:
            markup = u''.join([unicode(node) for node in note.children])
            address = [position['chapter'] - 1, position['verse'] - 1]
            result.set_element(address, structure_comments(markup), pad=[])
    return result.array()
Ejemplo n.º 6
0
def parse(filename):
    """Read a UTF-8 file of alternating reference / text lines into a jagged array.

    Lines are taken in pairs: the first line of a pair is converted to indices
    with ``ref_to_indices``; the second line, minus everything up to and
    including its first space, is stored at those indices.

    :param filename: path of the UTF-8 text file to read
    :return: the plain nested-list form of the jagged array
    """
    with codecs.open(filename, 'r', 'utf-8') as source:
        rows = source.readlines()

    result = JaggedArray([[[]]])
    address = None

    for position, row in enumerate(rows):
        if position % 2 == 0:
            # 1st, 3rd, 5th ... line: a reference line.
            address = ref_to_indices(row)
            continue
        # Keep what follows the first space (empty if the line has no space).
        result.set_element(address, row.partition(u' ')[2])
    return result.array()
Ejemplo n.º 7
0
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot):
    ja_smk = JaggedArray(ja_smk)
    ja_raph = JaggedArray(ja_raph)
    ja_hagahot = JaggedArray(ja_hagahot)
    # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())):
    dict_lst = []
    dict = {u'siman':[], u'smk':[], u'raph':[]}
    for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())):
        # print numToHeb(i+1)
        dict['siman'] = numToHeb(i+1)
        for i, smk_line in enumerate(seg[0]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line)
            if hag_lett:
                dict['smk'].extend([(hag_l, i+1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        # print 'RAPH'
        for i, raph_line in enumerate(seg[1]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line)
            if hag_lett:
                dict['raph'].extend([(hag_l, i+1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        dict_lst.append(dict)
        dict = {u'siman': [], u'smk': [], u'raph': []}
    return dict_lst
Ejemplo n.º 8
0
def parse_file(filename):
    """Feed the lines of a UTF-8 file through a ``RambamSegment`` accumulator.

    A line the segment recognises as a quote is added as a raw quote.
    Otherwise, a line it recognises as text is added as text and the segment
    is then written into the jagged array.  Any other line is ignored.

    :param filename: path of the UTF-8 text file to read
    :return: dict with ``'parsed text'`` (nested lists) and ``'links'``
             (whatever ``segment.extract_links()`` returns)
    """
    with codecs.open(filename, 'r', 'utf-8') as source:
        rows = source.readlines()

    output = JaggedArray([[[]]])
    builder = RambamSegment()

    for row in rows:
        if builder.is_quote(row):
            builder.add_raw_quote(row)
            continue
        if builder.is_text(row):
            builder.add_text(row)
            builder.add_segment(output)

    parsed_text = output.array()
    links = builder.extract_links()
    return {'parsed text': parsed_text, 'links': links}
def restructure_text():
    """Regroup the chapters of 'Derech Chaim text.json' by mishnah.

    The chapters are read from ``version['text'][u'']``.  Within a chapter, a
    line consisting of the mishnah heading word plus a one- or two-letter
    Hebrew numeral is a marker; every other line is a comment stored at
    ``[chapter, mishnah, comment]``.  A marker only starts a new mishnah when
    its value is higher than the current one.

    :return: the plain nested-list form of the jagged array
    """
    with open('Derech Chaim text.json') as json_file:
        chapters = json.load(json_file)['text'][u'']

    mishnah_header = re.compile(u'^\u05de\u05e9\u05e0\u05d4 ([\u05d0-\u05ea]{1,2})$')
    result = JaggedArray([[[]]])

    for chapter_number, chapter_lines in enumerate(chapters):
        mishnah = comment = 0

        for text_line in chapter_lines:
            header = mishnah_header.search(text_line)
            if header is not None:
                candidate = getGematria(header.group(1)) - 1
                # Only move forward: this lets intro text that precedes the
                # first mishnah mark stay in slot 0.
                if candidate > mishnah:
                    mishnah, comment = candidate, 0
                continue

            # A regular comment line.
            result.set_element([chapter_number, mishnah, comment], text_line, pad=[])
            comment += 1
    return result.array()
Ejemplo n.º 10
0
    def parse_unlinked(self):
        """Gather all phrases into a jagged array keyed by chapter, subchap and order.

        A phrase lands at ``[chapter - 1, subchap - 1, k]``, ``k`` being how
        many phrases with the same (chapter, subchap) came before it.  Each
        phrase is also logged as a dict in the module-level
        ``unlinkedCommentStore``.

        :return: the plain nested-list form of the jagged array
        :raises AttributeError: if a phrase has no ``subchap`` value
        """
        output = JaggedArray([[[]]])
        tally = Counter()

        for chapter in self.get_chapter():
            chapter_id = chapter.num
            for phrase in chapter.get_phrase():
                verse_id = phrase.subchap
                if verse_id is None:
                    raise AttributeError(
                        u'Unlabeled phrase in {} chapter {}'.format(self.get_author(), chapter_id))

                # A pair not seen before reads as 0 from the Counter.
                ordinal = tally[(chapter_id, verse_id)]
                target = [int(chapter_id) - 1, int(verse_id) - 1, ordinal]
                output.set_element(target, phrase.as_string())
                tally[(chapter_id, verse_id)] = ordinal + 1

                unlinkedCommentStore.append(dict(
                    commentator=commentatorNames[self.get_author()],
                    chapter=chapter_id,
                    verse=verse_id,
                    order=str(ordinal + 1),  # 1-based, stored as a string
                ))
        return output.array()
Ejemplo n.º 11
0
def parse_shokets():
    with open('chesed_le-avraham.htm') as infile:
        soup = BeautifulSoup(infile, 'html.parser')
    raw_shokets = soup.find('div', class_='shokets').text.splitlines()
    raw_shokets = filter(lambda x: x if len(x) > 0 else None, raw_shokets)

    pattern = ur'(\u05d4\u05e9\u05d5?\u05e7\u05ea [\u05d0-\u05ea]{1,2})( - (.*))?:'
    parsed = JaggedArray([[]])
    shoket, paragraph = -1, -1

    for line in raw_shokets:
        new_section = re.search(pattern, line)
        if new_section is None:
            if shoket >= 0:
                paragraph += 1
                parsed.set_element([shoket, paragraph], line)
        else:
            shoket += 1
            paragraph = -1
            if new_section.group(3) is not None:
                paragraph += 1
                parsed.set_element([shoket, paragraph], u'<b>{}</b>'.format(new_section.group(3)))

    return parsed.array()
def restructure_text():
    """Reshape 'Derech Chaim text.json' into chapter / mishnah / comment form.

    Chapters come from ``version['text'][u'']``.  A line that is exactly the
    mishnah heading word followed by a one- or two-letter Hebrew numeral is
    treated as a marker and not stored; all other lines are stored as comments
    at ``[chapter, mishnah, comment]``.  The mishnah index only ever increases
    within a chapter.

    :return: the plain nested-list form of the jagged array
    """
    with open('Derech Chaim text.json') as source:
        document = json.load(source)
    chapters = document['text'][u'']

    header_re = re.compile(u'^\u05de\u05e9\u05e0\u05d4 ([\u05d0-\u05ea]{1,2})$')
    restructured = JaggedArray([[[]]])

    for chapter_pos, chapter_lines in enumerate(chapters):
        mishnah_pos = 0
        comment_pos = 0

        for entry in chapter_lines:
            found = header_re.search(entry)
            if found is None:
                # A regular comment.
                restructured.set_element(
                    [chapter_pos, mishnah_pos, comment_pos], entry, pad=[])
                comment_pos += 1
                continue

            header_index = getGematria(found.group(1)) - 1
            # A marker that does not advance is ignored, which allows intro
            # text to appear before the first mishnah mark.
            if header_index <= mishnah_pos:
                continue
            mishnah_pos = header_index
            comment_pos = 0
    return restructured.array()
Ejemplo n.º 13
0
def jaggedarray_from_files(input_file, footnote_file):
    """
    Parse a marked-up main text and its footnote file into a jagged array and links.

    Lines of the main text are dispatched on their leading tag:

    * ``@22`` -- starts a new comment group; the line's two Hebrew-numeral
      fields are turned into a ``Ref`` that is remembered for the group.
    * ``@88`` -- appended, as an inline footnote, to the group's last entry.
    * ``@11`` / ``@33`` -- added to the current group as a new entry.
    * ``@00`` -- flushes all groups collected so far into the jagged array and
      records one link per group.

    Side effects: rebinds the module globals ``footnotes`` (to an iterator over
    the cleaned footnote lines), ``footnotes_parasha`` and ``link_refs``.
    NOTE(review): these are presumably read by ``footnotify`` /
    ``repeat_footnotify`` -- confirm against those helpers.

    NOTE(review): groups collected after the last ``@00`` line are never
    written to the array; confirm the input always ends with ``@00``.

    :param input_file: Main text file to parse
    :param footnote_file: Footnote text file to parse
    :return: A tuple ``(array, links)``: the nested-list form of the jagged
             array, and a list of ``(bereshit_ref, philo_ref)`` pairs.
    """

    ja = JaggedArray([[]])
    global footnotes
    global footnotes_parasha
    global link_refs
    link_refs = []
    current = []
    list_of_currents = []
    footnotes = []
    footnotes_parasha = {}
    links = []

    # `with` guarantees the files are closed even if parsing raises.
    with codecs.open(footnote_file, 'r', 'utf-8') as text:
        for line in text:
            footnotes.append(cleanup(line))
    footnotes = iter(footnotes)

    with codecs.open(input_file, 'r', 'utf-8') as main_text:
        for line in main_text:
            if line.startswith('@22'):
                # Close the group in progress before starting a new one.
                if current:
                    list_of_currents.append(current)
                    current = []
                m = re.search(u'([\u05d0-\u05ea]{1,2}-?[\u05d0-\u05ea]{0,2}), ([\u05d0-\u05ea]{1,2}-?[\u05d0-\u05ea]{0,2})', line)
                # if with semicolon, choose first pasuk ignore second
                location = Ref(u"".join([u"בראשית ", m.group(1), u": ", m.group(2)]))
                link_refs.append(location)
                current.append(footnotify(u''.join([u"<strong>", cleanup(line), u"</strong>"])))
            elif line.startswith('@88'):
                current[-1] += u''.join([u"<sup>*</sup><i class='footnote'>", cleanup(line), u"</i>", "<br>___________<br>"])
            elif line.startswith(('@11', '@33')):
                current.append(cleanup(footnotify(line)))
            elif line.startswith('@00'):
                if current:
                    list_of_currents.append(current)
                    current = []
                if list_of_currents:
                    # enumerate pairs each group with its own ref; looking the
                    # group up with list.index() would pick the first *equal*
                    # group and so mis-index duplicated content.
                    for i, x in enumerate(list_of_currents):
                        ref = link_refs[i]
                        chapter = ref.sections[0] - 1
                        verse = ref.sections[1] - 1
                        # Only compare with a real predecessor: at i == 0,
                        # link_refs[i - 1] would wrap around to the last ref.
                        same_verse_as_previous = (
                            i > 0
                            and ref.sections[0] == link_refs[i - 1].sections[0]
                            and ref.sections[1] == link_refs[i - 1].sections[1])
                        bereshit_ref = ref.normal()
                        if same_verse_as_previous:
                            # Append to what the previous group stored for
                            # this verse; the link range continues after it.
                            existing = ja.get_element([chapter, verse])
                            philo_ref = "".join([
                                "The Midrash of Philo ", str(chapter + 1), ":", str(verse + 1),
                                ":", str(len(existing) + 1), "-", str(len(existing) + len(x))])
                            links.append((bereshit_ref, philo_ref))
                            existing.extend(repeat_footnotify(x))
                        else:
                            philo_ref = "".join([
                                "The Midrash of Philo ", str(chapter + 1), ":", str(verse + 1),
                                ":1-", str(len(x))])
                            links.append((bereshit_ref, philo_ref))
                            ja.set_element([chapter, verse], repeat_footnotify(x), pad=[])
                    # Reset the per-section state for the next section.
                    footnotes_parasha.clear()
                    current = []
                    link_refs = []
                    list_of_currents = []

    #util.ja_to_xml(ja.array(), ['Chapter', 'Verse','Comment'])
    return ja.array(), links