Beispiel #1
0
    def __init__(self, lexicon_name):
        """Populate the trie with every headword (and alternate headword)
        of the named lexicon, keyed by its nikkud-stripped form and mapping
        back to the original vocalized headword."""
        super(LexiconTrie, self).__init__(self.dict_letter_scope)

        entries = LexiconEntrySet({"parent_lexicon": lexicon_name}, sort=[("_id", -1)])
        for entry in entries:
            headword = entry.headword
            # The primary headword first, then any alternates, all mapping
            # to the same canonical (vocalized) headword.
            for variant in [headword] + list(getattr(entry, "alt_headwords", [])):
                self[hebrew.strip_nikkud(variant)] = headword
def clean_line(line):
    """Normalize a Hebrew line for citation matching.

    Strips nikkud and colons, normalizes the Tur/Shulchan Arukh
    abbreviation, truncates at an 'ayyen baTur' cross-reference, and
    unwraps parentheses/brackets (bracketed text is treated as correct;
    parenthesized text is dropped when brackets are also present).
    """
    line = strip_nikkud(line)
    line = re.sub(u':', '', line)
    paren_re = re.compile(u'\((.*?)\)')
    bracket_re = re.compile(u'\[(.*?)\]')
    has_paren = paren_re.search(line)
    has_bracket = bracket_re.search(line)
    ayyen_tur_re = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    cut = ayyen_tur_re.search(line)
    if cut:
        line = line[:cut.start()]

    if has_paren and has_bracket:
        result = bracket_re.sub(u'\\1', line)  # brackets are always correct
        result = paren_re.sub('', result)
    elif has_paren:
        result = paren_re.sub(u'\\1', line)
    elif has_bracket:
        result = bracket_re.sub(u'\\1', line)  # brackets are always correct
    else:
        result = line
    return result
Beispiel #3
0
def clean_line(line):
    """Normalize a Hebrew line for citation matching.

    Strips nikkud and punctuation, normalizes quote characters and the
    Tur/Shulchan Arukh abbreviation, truncates at an 'ayyen baTur'
    cross-reference, removes a 'lo manu' clause, and unwraps
    parentheses/brackets (bracketed text is treated as correct).

    :param line: raw line (unicode string)
    :return: cleaned line
    """
    line = strip_nikkud(line)
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"} #note put \. in the file/ how can i check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)

    if f_ayyen:
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        # BUGFIX: the matched text is plain text, not a pattern — escape it so
        # regex metacharacters in the match (e.g. parentheses) can't break the sub.
        line = re.sub(re.escape(f_lo_manu.group('a')), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, u'\\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, u'\\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, u'\\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
Beispiel #4
0
def clean_line(line):
    """Normalize a Hebrew line for citation matching.

    Strips nikkud and punctuation, normalizes quote characters and the
    Tur/Shulchan Arukh abbreviation, truncates at an 'ayyen baTur'
    cross-reference, removes a 'lo manu' clause, and unwraps
    parentheses/brackets (bracketed text is treated as correct).

    :param line: raw line (unicode string)
    :return: cleaned line
    """
    line = strip_nikkud(line)
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"} #note put \. in the file/ how can i check if it is right?
    line = multiple_replace(line, replace_dict, using_regex=True)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)

    if f_ayyen:
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        # BUGFIX: the matched text is plain text, not a pattern — escape it so
        # regex metacharacters in the match (e.g. parentheses) can't break the sub.
        line = re.sub(re.escape(f_lo_manu.group('a')), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, u'\\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, u'\\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, u'\\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
Beispiel #5
0
def clean_line(line):
    """Strip nikkud/colons from a line and unwrap bracketed text.

    Bracketed text is kept (brackets removed); parenthesized text is
    kept only when no brackets appear on the line, otherwise it is
    dropped entirely. The line is truncated at an 'ayyen baTur'
    cross-reference.
    """
    line = strip_nikkud(line)
    line = re.sub(u':', '', line)
    parens = re.compile(u'\((.*?)\)')
    brackets = re.compile(u'\[(.*?)\]')
    found_parens = parens.search(line)
    found_brackets = brackets.search(line)
    ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    truncate_at = ayyen_tur.search(line)
    if truncate_at:
        line = line[:truncate_at.start()]

    if found_parens and found_brackets:
        # Brackets are always correct; drop parenthesized asides entirely.
        return parens.sub('', brackets.sub(u'\\1', line))
    if found_parens:
        return parens.sub(u'\\1', line)
    if found_brackets:
        return brackets.sub(u'\\1', line)  # brackets are always correct
    return line
 def body(self):
     """UI test scenario: browse the TOC to Midrash -> Ein Yaakov,
     open masechet 2 chapter 3, and verify the first section's text."""
     self.load_toc()
     self.click_toc_category("Midrash")
     self.click_toc_text("Ein Yaakov")
     self.click_source_title()
     self.click_masechet_and_chapter('2', '3')
     section = self.get_section_txt('1')
     # Compare against nikkud-stripped text so vowel marks don't affect the match.
     assert 'רבי זירא הוה קא משתמיט' in strip_nikkud(section)
Beispiel #7
0
    def find_in_segment(self, st, lang='he', citing_only=False, replace=True):
        # TODO: implement replace=True
        """
        Scan a segment of text and resolve every citation found in it.

        :param string st: the input string
        :param lang: "he" note: "en" is not yet supported in ibid
        :param citing_only: boolean whether to use only records explicitly marked as being referenced in text.
        :return: list of :class:`Ref` objects, list of locations and list of ref types (either REF or SHAM, defined in CitationFinder)
        """
        refs, locations, types = [], [], []
        failed_refs = []
        failed_shams = []
        assert lang == 'he'  # todo: support english

        st = strip_nikkud(st)
        candidates = self._citationFinder.get_potential_refs(
            st, lang, citing_only=citing_only)
        for item, location, ref_type in candidates:
            if ref_type == CitationFinder.REF_INT:
                try:
                    resolved = self._tr.resolve(item.index_node.full_title(),
                                                item.sections)
                except (IbidRefException, IbidKeyNotFoundException):
                    failed_refs.append(item.normal())
                else:
                    refs.append(resolved)
                    locations.append(location)
                    types.append(ref_type)
            elif ref_type in (CitationFinder.NON_REF_INT, CitationFinder.IGNORE_INT):
                # An explicit non-reference resets the ibid book-name state.
                self._tr.ignore_book_name_keys()
            elif ref_type == CitationFinder.SHAM_INT:
                try:
                    if isinstance(item, str):
                        resolved = self._tr.resolve(None, match_str=item)
                    else:
                        resolved = self._tr.resolve(item[0], sections=item[1])
                except (IbidRefException, IbidKeyNotFoundException, InputError):
                    failed_shams.append(item)
                else:
                    refs.append(resolved)
                    locations.append(location)
                    types.append(ref_type)

        return refs, locations, types
Beispiel #8
0
    def find_in_segment(self, st, lang='he', citing_only=False, replace=True):
        # TODO: implement replace=True
        """
        Scan a segment of text and resolve every citation found in it.

        :param string st: the input string
        :param lang: "he" note: "en" is not yet supported in ibid
        :param citing_only: boolean whether to use only records explicitly marked as being referenced in text.
        :return: list of :class:`Ref` objects, list of locations and list of ref types (either REF or SHAM, defined in CitationFinder)
        """
        refs, locations, types = [], [], []
        failed_refs = []
        failed_shams = []
        assert lang == 'he'  # todo: support english

        st = strip_nikkud(st)
        candidates = self._citationFinder.get_potential_refs(st, lang)
        for item, location, ref_type in candidates:
            if ref_type == CitationFinder.REF_INT:
                try:
                    resolved = self._tr.resolve(item.index_node.full_title(), item.sections)
                except (IbidRefException, IbidKeyNotFoundException):
                    failed_refs.append(item.normal())
                else:
                    refs.append(resolved)
                    locations.append(location)
                    types.append(ref_type)
            elif ref_type in (CitationFinder.NON_REF_INT, CitationFinder.IGNORE_INT):
                # An explicit non-reference resets the ibid book-name state.
                self._tr.ignore_book_name_keys()
            elif ref_type == CitationFinder.SHAM_INT:
                try:
                    if isinstance(item, unicode):
                        resolved = self._tr.resolve(None, match_str=item)
                    else:
                        resolved = self._tr.resolve(item[0], sections=item[1])
                except (IbidRefException, IbidKeyNotFoundException, InputError):
                    failed_shams.append(item)
                else:
                    refs.append(resolved)
                    locations.append(location)
                    types.append(ref_type)

        return refs, locations, types
Beispiel #9
0
    def body(self):
        """UI test scenario: verify section text in two Midrash works
        (Ein Yaakov 2:3 and Midrash Mishlei 4) reached via the TOC."""
        self.load_toc()
        self.click_toc_category("Midrash")
        self.click_toc_text("Ein Yaakov")
        self.click_source_title()
        self.click_masechet_and_chapter('2', '3')
        section = self.get_section_txt('1')
        # Strip nikkud so the assertion is insensitive to vowel marks.
        assert 'רבי זירא הוה' in strip_nikkud(section)

        self.load_toc()
        self.click_toc_category("Midrash").click_toc_text("Midrash Mishlei")
        self.click_source_title()
        self.click_chapter('4')
        section = self.get_section_txt('1')
        assert 'מכל משמר נצור ליבך' in section
Beispiel #10
0
def create_section(oref, dicta_text, dicta_vtitle):
    """Align a vocalized Dicta text against the Sefaria segmentation of
    ``oref`` and save each aligned word-span into version ``dicta_vtitle``."""
    vocalized_words = dicta_text.split()
    bare_words = strip_nikkud(dicta_text).split()
    sefaria_text = prepare_sefaria_text(oref)

    alignment = match_text(bare_words, sefaria_text)
    matches = alignment['matches']
    segments = oref.all_segment_refs()
    assert len(segments) == len(matches)

    for segment, match in zip(segments, matches):
        chunk = segment.text('he', dicta_vtitle)
        # match[0]/match[1] are inclusive word indices into the Dicta text.
        segment_text = u' '.join(vocalized_words[match[0]:match[1] + 1])
        if not segment_text:
            # No aligned words: fall back to the existing text.
            # NOTE(review): davidson_vtitle is a module-level name, not a
            # parameter — presumably the default source version; confirm.
            segment_text = segment.text('he', davidson_vtitle).text

        chunk.text = segment_text
        chunk.save()
Beispiel #11
0
def count_terms(query=None, lang=None):
    #todo: move to object model.  Maybe.  What's this doing?
    """
    Counts all terms in texts matching query, lang.
    Saves results to terms collection in db.

    :param query: mongo query dict selecting index records; defaults to {} (all).
    :param lang: language of the text variant to count ("ar" is looked up as "he").
    """
    if query is None:  # avoid the shared-mutable-default-argument pitfall
        query = {}
    terms = {}
    bavli_names = db.index.find(query).distinct("title")
    query = {"title": {"$in": bavli_names}}
    refs = counts.generate_refs_list(
        query)  #library.ref_list() needs query argument
    lookup_lang = "he" if lang == "ar" else lang

    for ref in refs:
        print(ref)
        text = TextFamily(Ref(ref), commentary=False).contents()
        for i, line in enumerate(text.get(lookup_lang, [])):
            # strip punctuation
            for c in string.punctuation:
                line = line.replace(c, "")
            line_ref = "%s:%d" % (ref, i + 1)  # hoisted: same for every term on the line
            for term in line.split(" "):
                term = hebrew.strip_nikkud(term)
                if term in terms:
                    terms[term]["occurrences"] += 1
                    terms[term]["refs"].add(line_ref)
                else:
                    terms[term] = {
                        "term": term,
                        "occurrences": 1,
                        "language": lang,
                        "refs": {line_ref}
                    }

    for term in terms:
        print(term)
        # only include up to 20 random ref samples; sample() requires a
        # sequence (sampling directly from a set was removed in Python 3.11)
        ref_list = list(terms[term]["refs"])
        terms[term]["refs"] = sample(ref_list, min(len(ref_list), 20))
        db.terms.save(terms[term])
Beispiel #12
0
def count_terms(query=None, lang=None):
    #todo: move to object model.  Maybe.  What's this doing?
    """
    Counts all terms in texts matching query, lang.
    Saves results to terms collection in db.

    :param query: mongo query dict selecting index records; defaults to {} (all).
    :param lang: language of the text variant to count ("ar" is looked up as "he").
    """
    if query is None:  # avoid the shared-mutable-default-argument pitfall
        query = {}
    terms = {}
    bavli_names = db.index.find(query).distinct("title")
    query = {"title": {"$in": bavli_names}}
    refs = counts.generate_refs_list(query)  #library.ref_list() needs query argument
    lookup_lang = "he" if lang == "ar" else lang

    for ref in refs:
        print(ref)  # parenthesized print works under both Python 2 and 3
        text = TextFamily(Ref(ref), commentary=False).contents()
        for i, line in enumerate(text.get(lookup_lang, [])):
            # strip punctuation
            for c in string.punctuation:
                line = line.replace(c, "")
            line_ref = "%s:%d" % (ref, i + 1)  # hoisted: same for every term on the line
            for term in line.split(" "):
                term = hebrew.strip_nikkud(term)
                if term in terms:
                    terms[term]["occurrences"] += 1
                    terms[term]["refs"].add(line_ref)
                else:
                    terms[term] = {
                        "term": term,
                        "occurrences": 1,
                        "language": lang,
                        "refs": set([line_ref])
                    }

    for term in terms:
        print(term)
        # only include up to 20 random ref samples; sample() requires a
        # sequence (sampling directly from a set was removed in Python 3.11)
        ref_list = list(terms[term]["refs"])
        terms[term]["refs"] = sample(ref_list, min(len(ref_list), 20))
        db.terms.save(terms[term])
Beispiel #13
0
    def find_all_shams_in_st(self, st, lang='he'):
        '''
        Locate every parenthesized "sham" (ibid) citation in a source text.

        :param st: source text
        :param lang:
        :return: a list of tuples (Refs objects that originally were Shams, location)
        '''
        from sefaria.utils.hebrew import strip_nikkud
        st = strip_nikkud(st)
        results = []
        pattern = u'(\(|\([^)]* )שם(\)| [^(]*\))'  # finds shams in parenthesis without רבשם
        for candidate in re.finditer(pattern, st):
            matched_text = candidate.group()
            # todo: find statistics for the cutoff size of a ref-citation; 6 is a guess
            if len(re.split('\s+', matched_text)) > 6:
                continue
            try:
                results.append((self.parse_sham(matched_text), candidate.span()))
            except (IbidKeyNotFoundException, IbidRefException):
                pass  # maybe want to call ignore here?
        return results
Beispiel #14
0
    def find_all_shams_in_st(self, st, lang='he'):
        '''
        Locate every parenthesized "sham" (ibid) citation in a source text.

        :param st: source text
        :param lang:
        :return: a list of tuples (Refs objects that originally were Shams, location)
        '''
        from sefaria.utils.hebrew import strip_nikkud
        st = strip_nikkud(st)
        found = []
        sham_pattern = u'(\(|\([^)]* )שם(\)| [^(]*\))'  # finds shams in parenthesis without רבשם
        for hit in re.finditer(sham_pattern, st):
            snippet = hit.group()
            # todo: find statistics for the cutoff size of a ref-citation; 6 is a guess
            if len(re.split('\s+', snippet)) > 6:
                continue
            try:
                found.append((self.parse_sham(snippet), hit.span()))
            except (IbidKeyNotFoundException, IbidRefException):
                pass  # maybe want to call ignore here?
        return found
Beispiel #15
0
def run_shaminator(titles=None, with_real_refs=False, SEG_DIST = 5, create_ref_dict = True):
    """For each title, build an HTML report of 'sham' (ibid) citations found
    by IndexIbidFinder, showing each sham next to the last explicit citation
    of the same book, and write it to ibid_output/ibid_<title>.html.

    :param titles: list of index titles to process (see NOTE below).
    :param with_real_refs: also emit rows for explicit (non-sham) citations.
    :param SEG_DIST: maximum segment distance between a sham and its anchor.
    :param create_ref_dict: whether to (re)build the citation dict per index.
    """
    base_url = u"https://www.sefaria.org/"

    title_list = []
    cats = ["Midrash", "Halakha", "Philosophy"]
    collective_titles = ["Rashi", "Kessef Mishneh"]
    for cat in cats:
        title_list += library.get_indexes_in_category(cat)
    for cTitle in collective_titles:
        title_list += library.get_indices_by_collective_title(cTitle)

    # NOTE(review): this unconditionally discards the list built above and
    # crashes (enumerate(None)) when titles is None — confirm intent.
    title_list = titles
    for ititle, title in enumerate(title_list):
        print u"-"*50
        print title, ititle+1, '/', len(title_list)
        print u"-"*50

        # Report skeleton; rows are appended below and the table closed at the end.
        html = u"""
        <!DOCTYPE html>
        <html>
            <head>
                <link rel='stylesheet' type='text/css' href='styles.css'>
                <meta charset='utf-8'>
            </head>
            <body>
                <table>
                    <tr><td>Row Id</td><td>Book Ref</td><td>Ref Found</td><td>Sham Found</td><td>Sham Text</td></tr>
        """

        index = library.get_index(title)
        inst = IndexIbidFinder(index)
        # NOTE(review): if create_ref_dict is False, ref_dict below is either
        # unbound (NameError) or stale from the previous loop iteration.
        if create_ref_dict:
            try:
                ref_dict = inst.find_in_index()
                # ref_dict - OrderedDict. keys: segments. values: dict {'refs': [Refs obj found in this seg], 'locations': [], 'types': []}
            except AssertionError:
                print "Skipping {}".format(title)
                continue # problem with Ein Ayah

        # Per-book state: last (segment ref, location, resolved ref) that
        # explicitly cited the book, used to anchor subsequent shams.
        last_index_ref_seen = {}
        row_num = 1
        char_padding = 20
        # Both halves of split Tanakh books share one ibid key.
        double_tanakh_books = {"I Samuel": "Samuel", "II Samuel": "Samuel", "I Kings": "Kings", "II Kings": "Kings",
                               "I Chronicles": "Chronicles", "II Chronicles": "Chronicles"}
        for k, v in ref_dict.items():
            curr_ref = Ref(k)
            for i, (r, l, t) in enumerate(izip(v['refs'], v['locations'], v['types'])):
                sham_ref_key = r.index.title if r.index.title not in double_tanakh_books else double_tanakh_books[
                    r.index.title]
                # NOTE(review): this raises KeyError (not the intended None
                # check) when the book has not been seen yet — dict.get would
                # match the apparent intent.
                if t == CitationFinder.SHAM_INT and last_index_ref_seen[sham_ref_key] is not None:
                    last_ref_with_citation, last_location_with_citation, last_ref_seen = last_index_ref_seen[sham_ref_key]
                else:  # if t == CitationFinder.REF_INT:
                    last_index_ref_seen[sham_ref_key] = (curr_ref, l, r)
                    if not with_real_refs:
                        continue
                    # NOTE(review): last_ref_with_citation may be unbound here
                    # on the first explicit citation of a run — confirm.
                    dist = curr_ref.distance(last_ref_with_citation)
                    last_ref_with_citation = curr_ref
                    last_location_with_citation = l
                    last_ref_seen = r
                    r = u"N/A"


                # dist = curr_ref.distance(last_ref_with_citation)
                # NOTE(review): on the SHAM path, dist still holds the value
                # from a previous iteration (or is unbound on the first) —
                # compare with the variant that computes dist right here.
                print dist
                if dist == 0:
                    # Anchor and sham are in the same segment: highlight both
                    # within one padded excerpt of that segment's text.
                    text = strip_nikkud(curr_ref.text('he').text)

                    start_ind = 0 if last_location_with_citation[0] - char_padding < 0 else last_location_with_citation[
                                                                                                0] - char_padding
                    end_ind = l[1] + char_padding

                    before = text[start_ind:last_location_with_citation[0]]
                    real_ref = text[last_location_with_citation[0]:last_location_with_citation[1]]
                    middle = text[last_location_with_citation[1]:l[0]] if last_location_with_citation[1] <= l[0] else u""
                    sham_ref = text[l[0]:l[1]] if t == CitationFinder.SHAM_INT else u""
                    after = text[l[1]:end_ind]
                    text = u"{}<span class='r'>{}</span>{}<span class='s'>{}</span>{}".format(before, real_ref, middle,
                                                                                              sham_ref, after)

                else:
                    # Anchor and sham are in different segments: stitch
                    # together anchor tail + intermediate segments + sham head.
                    start_text = strip_nikkud(last_ref_with_citation.text('he').text)
                    # start_text = strip_nikkud(start_text)[last_location_with_citation[0]:]
                    end_text = strip_nikkud(curr_ref.text('he').text)
                    # end_text = strip_nikkud(end_text)[:l[1]+1]
                    if dist > SEG_DIST:
                        continue
                    elif dist > 1 and  dist <= SEG_DIST:
                        print u"{} {} {}".format(curr_ref, last_ref_with_citation.next_segment_ref(),
                                                 curr_ref.prev_segment_ref())
                        mid_text = last_ref_with_citation.next_segment_ref().to(curr_ref.prev_segment_ref()).text(
                            'he').text
                        # Ranged text may come back as nested lists; flatten fully.
                        while isinstance(mid_text, list):
                            mid_text = reduce(lambda a, b: a + b, mid_text)
                    else:
                        mid_text = u""

                    start_ind = 0 if last_location_with_citation[0] - char_padding < 0 else last_location_with_citation[
                                                                                                0] - char_padding
                    end_ind = l[1] + char_padding

                    start_before = start_text[start_ind:last_location_with_citation[0]]
                    start_real_ref = start_text[last_location_with_citation[0]:last_location_with_citation[1]]
                    start_after = start_text[last_location_with_citation[1]:]

                    end_before = end_text[:l[0]]
                    end_sham_ref = end_text[l[0]:l[1]]
                    end_after = end_text[l[1]:end_ind]
                    text = u"{}<span class='r'>{}</span>{} {} {}<span class='s'>{}</span>{}".format(start_before,
                                                                                                    start_real_ref,
                                                                                                    start_after,
                                                                                                    mid_text,
                                                                                                    end_before,
                                                                                                    end_sham_ref,
                                                                                                    end_after)

                # Drop any markup from the source text except our highlight spans.
                text = bleach.clean(text, strip=True, tags=[u'span'], attributes=[u'class'])
                # surround all non interesting parens with spans
                text = re.sub(ur"(?<!>)(\([^)]+\))(?!<)", ur"<span class='p'>\1</span>", text)

                rowclass = u"realrefrow" if t == CitationFinder.REF_INT else u"shamrefrow"
                row = u"<tr class='{}' ><td>{}</td><td><a href='{}' target='_blank'>{}</a></td><td>{}</td><td>{}</td><td class='he'>{}</td></tr>"\
                    .format(rowclass, row_num, base_url + curr_ref.url(), k, last_ref_seen, r, text)
                html += row
                row_num += 1

        html += u"""
                </table>
            </body>
        </html>
        """

        with codecs.open('ibid_output/ibid_{}.html'.format(title), 'wb', encoding='utf8') as f:
            f.write(html)
import sefaria.tracker as tracker

# Script: create reference links between Rashi comments and the corresponding
# Onkelos verses wherever Rashi's text contains one of the phrases below
# (phrases that introduce a quote of the Targum).
patterns = [u"כתרגומו", u"ותרגומו", u"תרגומו", u"וזהו שתרגם אונקלוס", u"אונקלוס", u"לכך מתרגם", u"מתרגם"]

books = ["Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy"]
total = 0  # number of links created
for book in books:
    rashi_book = "Rashi on " + book
    onkelos_book = "Onkelos " + book
    i = library.get_index(rashi_book)
    assert isinstance(i, CommentaryIndex)
    all_rashis = i.all_segment_refs()

    # Loop through all of the Rashis
    for rashi_ref in all_rashis:
        # Strip nikkud so the pattern match is insensitive to vowel marks.
        rashi = strip_nikkud(TextChunk(rashi_ref, "he", "On Your Way").text)

        # If it matches the pattern
        for pat in patterns:
            if pat in rashi:
                # Map the Rashi segment to the parallel Onkelos verse by
                # swapping the book title in the section-level reference.
                onkelos_ref = Ref(rashi_ref.section_ref().normal().replace(rashi_book, onkelos_book))
                d = {
                    "refs": [rashi_ref.normal(), onkelos_ref.normal()],
                    "type": "reference",
                    "auto": True,
                    "generated_by": "Rashi - Onkelos Linker",
                }
                # 28 is presumably the acting user id — confirm against tracker.add.
                tracker.add(28, Link, d)
                print u"{}\t{}\t{}".format(rashi_ref.normal(), pat, rashi.strip())
                total += 1
                break
Beispiel #17
0
 def test_strip_nikkud(self):
     """strip_nikkud removes vowel points, leaving only the consonants."""
     cases = [
         ('הַדְּבָרִים אֲשֶׁר', 'הדברים אשר'),
         ("הַמּוֹצִיא בְמִסְפָּר צְבָאָם לְכֻלָּם בְּשֵׁם יִקְרָא",
          "המוציא במספר צבאם לכלם בשם יקרא"),
     ]
     for vocalized, expected in cases:
         assert h.strip_nikkud(vocalized) == expected
Beispiel #18
0
def run_shaminator(titles=None, with_real_refs=False):
    """For each title, build an HTML report of 'sham' (ibid) citations found
    by IndexIbidFinder, showing each sham next to the last explicit citation
    of the same book, and write it to ibid_output/ibid_<title>.html.

    :param titles: list of index titles to process (see NOTE below).
    :param with_real_refs: also emit rows for explicit (non-sham) citations.
    """
    base_url = u"https://www.sefaria.org/"

    title_list = []
    cats = ["Midrash", "Halakha", "Philosophy"]
    collective_titles = ["Rashi", "Kessef Mishneh"]
    for cat in cats:
        title_list += library.get_indexes_in_category(cat)
    for cTitle in collective_titles:
        title_list += library.get_indices_by_collective_title(cTitle)

    # NOTE(review): this unconditionally discards the list built above and
    # crashes (enumerate(None)) when titles is None — confirm intent.
    title_list = titles
    for ititle, title in enumerate(title_list):
        print u"-"*50
        print title, ititle+1, '/', len(title_list)
        print u"-"*50

        # Report skeleton; rows are appended below and the table closed at the end.
        html = u"""
        <!DOCTYPE html>
        <html>
            <head>
                <link rel='stylesheet' type='text/css' href='styles.css'>
                <meta charset='utf-8'>
            </head>
            <body>
                <table>
                    <tr><td>Row Id</td><td>Book Ref</td><td>Ref Found</td><td>Sham Found</td><td>Sham Text</td></tr>
        """

        index = library.get_index(title)
        inst = IndexIbidFinder(index)
        try:
            # keys: segment refs; values: {'refs': [...], 'locations': [...], 'types': [...]}
            ref_dict = inst.find_in_index()
        except AssertionError:
            print "Skipping {}".format(title)
            continue # problem with Ein Ayah

        # Per-book state: last (segment ref, location, resolved ref) that
        # explicitly cited the book, used to anchor subsequent shams.
        last_index_ref_seen = {}
        row_num = 1
        char_padding = 20
        # Both halves of split Tanakh books share one ibid key.
        double_tanakh_books = {"I Samuel": "Samuel", "II Samuel": "Samuel", "I Kings": "Kings", "II Kings": "Kings",
                               "I Chronicles": "Chronicles", "II Chronicles": "Chronicles"}
        for k, v in ref_dict.items():
            curr_ref = Ref(k)
            for r, l, t in izip(v['refs'], v['locations'], v['types']):
                sham_ref_key = r.index.title if r.index.title not in double_tanakh_books else double_tanakh_books[
                    r.index.title]
                # NOTE(review): this raises KeyError (not the intended None
                # check) when the book has not been seen yet — dict.get would
                # match the apparent intent.
                if t == CitationFinder.SHAM_INT and last_index_ref_seen[sham_ref_key] is not None:
                    last_ref_with_citation, last_location_with_citation, last_ref_seen = last_index_ref_seen[sham_ref_key]
                else:  # if t == CitationFinder.REF_INT:
                    last_index_ref_seen[sham_ref_key] = (curr_ref, l, r)
                    if not with_real_refs:
                        continue
                    last_ref_with_citation = curr_ref
                    last_location_with_citation = l
                    last_ref_seen = r
                    r = u"N/A"


                dist = curr_ref.distance(last_ref_with_citation)
                if dist == 0:
                    # Anchor and sham are in the same segment: highlight both
                    # within one padded excerpt of that segment's text.
                    text = strip_nikkud(curr_ref.text('he').text)

                    start_ind = 0 if last_location_with_citation[0] - char_padding < 0 else last_location_with_citation[
                                                                                                0] - char_padding
                    end_ind = l[1] + char_padding

                    before = text[start_ind:last_location_with_citation[0]]
                    real_ref = text[last_location_with_citation[0]:last_location_with_citation[1]]
                    middle = text[last_location_with_citation[1]:l[0]] if last_location_with_citation[1] <= l[0] else u""
                    sham_ref = text[l[0]:l[1]] if t == CitationFinder.SHAM_INT else u""
                    after = text[l[1]:end_ind]
                    text = u"{}<span class='r'>{}</span>{}<span class='s'>{}</span>{}".format(before, real_ref, middle,
                                                                                              sham_ref, after)

                else:
                    # Anchor and sham are in different segments: stitch
                    # together anchor tail + intermediate segments + sham head.
                    start_text = strip_nikkud(last_ref_with_citation.text('he').text)
                    # start_text = strip_nikkud(start_text)[last_location_with_citation[0]:]
                    end_text = strip_nikkud(curr_ref.text('he').text)
                    # end_text = strip_nikkud(end_text)[:l[1]+1]
                    if dist > 1:
                        print u"{} {} {}".format(curr_ref, last_ref_with_citation.next_segment_ref(),
                                                 curr_ref.prev_segment_ref())
                        mid_text = last_ref_with_citation.next_segment_ref().to(curr_ref.prev_segment_ref()).text(
                            'he').text
                        # Ranged text may come back as nested lists; flatten fully.
                        while isinstance(mid_text, list):
                            mid_text = reduce(lambda a, b: a + b, mid_text)
                    else:
                        mid_text = u""

                    start_ind = 0 if last_location_with_citation[0] - char_padding < 0 else last_location_with_citation[
                                                                                                0] - char_padding
                    end_ind = l[1] + char_padding

                    start_before = start_text[start_ind:last_location_with_citation[0]]
                    start_real_ref = start_text[last_location_with_citation[0]:last_location_with_citation[1]]
                    start_after = start_text[last_location_with_citation[1]:]

                    end_before = end_text[:l[0]]
                    end_sham_ref = end_text[l[0]:l[1]]
                    end_after = end_text[l[1]:end_ind]
                    text = u"{}<span class='r'>{}</span>{} {} {}<span class='s'>{}</span>{}".format(start_before,
                                                                                                    start_real_ref,
                                                                                                    start_after,
                                                                                                    mid_text,
                                                                                                    end_before,
                                                                                                    end_sham_ref,
                                                                                                    end_after)

                # Drop any markup from the source text except our highlight spans.
                text = bleach.clean(text, strip=True, tags=[u'span'], attributes=[u'class'])
                # surround all non interesting parens with spans
                text = re.sub(ur"(?<!>)(\([^)]+\))(?!<)", ur"<span class='p'>\1</span>", text)

                rowclass = u"realrefrow" if t == CitationFinder.REF_INT else u"shamrefrow"
                row = u"<tr class='{}' ><td>{}</td><td><a href='{}' target='_blank'>{}</a></td><td>{}</td><td>{}</td><td class='he'>{}</td></tr>"\
                    .format(rowclass, row_num, base_url + curr_ref.url(), k, last_ref_seen, r, text)
                html += row
                row_num += 1

        html += u"""
                </table>
            </body>
        </html>
        """

        with codecs.open('ibid_output/ibid_{}.html'.format(title), 'wb',encoding='utf8') as f:
            f.write(html)
# Count how often each tag appears across all sheets.
# (sheet_list / original_tag_counter / translate_client / untranslated_en_tags
# are defined earlier in the full script — not visible here.)
for sheet in sheet_list:
    for tag in sheet.get("tags", []):
        original_tag_counter[tag] += 1

# Most frequent tags first, split into English and Hebrew groups.
sorted_tags = sorted(original_tag_counter,
                     key=original_tag_counter.get,
                     reverse=True)
sorted_en_tags = [t for t in sorted_tags if not is_hebrew(t)]
sorted_he_tags = [t for t in sorted_tags if is_hebrew(t)]

# Machine-translate English tags to Hebrew ('iw' is the legacy Google code
# for Hebrew); group English tags under the resulting Hebrew tag.
translated_hebrew_tags = defaultdict(list)
for en_tag in sorted_en_tags:
    translation = translate_client.translate(en_tag,
                                             target_language='iw',
                                             source_language='en')
    he_tag = strip_nikkud(translation['translatedText'])
    if en_tag == he_tag:
        # The API returned the input unchanged — treat as untranslatable.
        print("Couldn't translate {}".format(en_tag))
        untranslated_en_tags += [en_tag]
        continue
    print("{}:{}".format(he_tag, en_tag))
    translated_hebrew_tags[he_tag] += [en_tag]

# Total usage count of each Hebrew tag = sum of its English tags' counts.
overall_counts = {
    he_tag: sum([
        original_tag_counter[en_tag]
        for en_tag in translated_hebrew_tags[he_tag]
    ])
    for he_tag in translated_hebrew_tags
}
ordered_translated_he_terms = sorted(overall_counts,
    u"כתרגומו", u"ותרגומו", u"תרגומו", u"וזהו שתרגם אונקלוס", u"אונקלוס",
    u"לכך מתרגם", u"מתרגם"
]

books = ["Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy"]
total = 0
for book in books:
    rashi_book = "Rashi on " + book
    onkelos_book = "Onkelos " + book
    i = library.get_index(rashi_book)
    assert isinstance(i, CommentaryIndex)
    all_rashis = i.all_segment_refs()

    # Loop through all of the Rashis
    for rashi_ref in all_rashis:
        rashi = strip_nikkud(TextChunk(rashi_ref, "he", "On Your Way").text)

        # If it matches the pattern
        for pat in patterns:
            if pat in rashi:
                onkelos_ref = Ref(rashi_ref.section_ref().normal().replace(
                    rashi_book, onkelos_book))
                d = {
                    "refs": [rashi_ref.normal(),
                             onkelos_ref.normal()],
                    "type": "reference",
                    "auto": True,
                    "generated_by": "Rashi - Onkelos Linker"
                }
                tracker.add(28, Link, d)
                print u"{}\t{}\t{}".format(rashi_ref.normal(), pat,
Beispiel #21
0
 def text_strip_nikkud(self):
     """Sanity check: nikkud marks are stripped from a vocalized phrase."""
     stripped = h.strip_nikkud(u'הַדְּבָרִים אֲשֶׁר')
     assert stripped == u'הדברים אשר'
 def test_strip_nikkud(self):
     """strip_nikkud removes vowel points, leaving only the consonants."""
     cases = [
         (u'הַדְּבָרִים אֲשֶׁר', u'הדברים אשר'),
         (u"הַמּוֹצִיא בְמִסְפָּר צְבָאָם לְכֻלָּם בְּשֵׁם יִקְרָא",
          u"המוציא במספר צבאם לכלם בשם יקרא"),
     ]
     for vocalized, expected in cases:
         assert h.strip_nikkud(vocalized) == expected
Beispiel #23
0
 def text_strip_nikkud(self):
     """Sanity check: nikkud marks are stripped from a vocalized phrase."""
     result = h.strip_nikkud(u'הַדְּבָרִים אֲשֶׁר')
     assert result == u'הדברים אשר'