def export_stats_by_ref(refs, filename):
    """Write one CSV row of Hebrew-text statistics for each ref.

    The file is created at ``<STATICFILES_DIRS[0]>/files/<filename>``. For
    every ref the "Tanach with Text Only" Hebrew version is loaded and a row
    is written with: the ref, the length of the text with spaces removed, the
    word count, the verse count, the gematria of the text, and two
    hexadecimal renderings (gematria, and gematria times 18). Each rendering
    is zero-padded to six hex digits and prefixed with ``#``.

    Args:
        refs: iterable of ref strings accepted by ``Ref``.
        filename: name of the CSV file to create.
    """
    path = "%s/files/%s" % (STATICFILES_DIRS[0], filename)
    # NOTE(review): binary mode is what the Python 2 csv module expects. On
    # Python 3, csv.writer needs a text-mode file opened with newline="" --
    # confirm which interpreter runs this before changing the mode.
    with open(path, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "ref",
            "letters",
            "words",
            "verses",
            "gematria",
            "gematriaHex",
            "gematria18Hex",
        ])

        for ref in refs:
            print(ref)
            text = Ref(ref).text(lang="he", vtitle="Tanach with Text Only")
            # Flatten the text and compute its gematria once per ref; the
            # original recomputed both for every column that needed them.
            flat = text.as_string()
            value = gematria(flat)
            writer.writerow([
                ref,
                len(flat.replace(" ", "")),
                text.word_count(),
                text.verse_count(),
                value,
                # Width 8 includes the "0x" prefix, which is then swapped
                # for "#".
                "{0:#0{1}x}".format(value, 8).replace("0x", "#"),
                "{0:#0{1}x}".format(value * 18, 8).replace("0x", "#"),
            ])
def export_stats_by_ref(refs, filename):
    """Export per-ref statistics of the Hebrew text to a CSV file.

    Output goes to ``<STATICFILES_DIRS[0]>/files/<filename>``. Each row holds
    the ref, the character count with spaces removed, the word count, the
    verse count, the gematria, and the gematria and gematria * 18 rendered
    as ``#``-prefixed, zero-padded six-digit hexadecimal strings.

    Args:
        refs: iterable of ref strings accepted by ``Ref``.
        filename: name of the CSV file to create.
    """
    header = [
        "ref",
        "letters",
        "words",
        "verses",
        "gematria",
        "gematriaHex",
        "gematria18Hex",
    ]
    # NOTE(review): 'wb' matches the Python 2 csv module; Python 3 would need
    # text mode with newline="" -- confirm the target interpreter.
    with open("%s/files/%s" % (STATICFILES_DIRS[0], filename), 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for ref in refs:
            # Parenthesised single-argument form prints identically under
            # Python 2 and also parses under Python 3.
            print(ref)
            text = Ref(ref).text(lang="he", vtitle="Tanach with Text Only")
            # Compute the flattened string and its gematria once instead of
            # once per column.
            as_string = text.as_string()
            total = gematria(as_string)
            writer.writerow([
                ref,
                len(as_string.replace(" ", "")),
                text.word_count(),
                text.verse_count(),
                total,
                "{0:#0{1}x}".format(total, 8).replace("0x", "#"),
                "{0:#0{1}x}".format(total * 18, 8).replace("0x", "#"),
            ])
def export_stats_by_ref(refs, filename):
    """Export per-ref statistics of the Hebrew text to a CSV file.

    Output goes to ``<SEFARIA_EXPORT_PATH>/misc/<filename>``. Columns are
    "Ref", "Letters" (length of the text with spaces removed), "Words",
    "Verses", "Gematria", "Gematria Hex" and "Gematria x 18 Hex". The two hex
    columns are zero-padded to six hex digits and prefixed with ``#``.

    Args:
        refs: iterable of ref strings accepted by ``Ref``.
        filename: name of the CSV file to create.
    """
    def as_hex(number):
        # Width 8 counts the "0x" prefix, which is then replaced by "#".
        return "{0:#0{1}x}".format(number, 8).replace("0x", "#")

    # NOTE(review): 'wb' matches the Python 2 csv module; Python 3 would need
    # text mode with newline="" -- confirm the target interpreter.
    with open("%s/misc/%s" % (SEFARIA_EXPORT_PATH, filename), 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Ref",
            "Letters",
            "Words",
            "Verses",
            "Gematria",
            "Gematria Hex",
            "Gematria x 18 Hex",
        ])

        for ref in refs:
            # Parenthesised single-argument form prints identically under
            # Python 2 and also parses under Python 3.
            print(ref)
            text = Ref(ref).text(lang="he", vtitle="Tanach with Text Only")
            # One flattening and one gematria computation per ref; the
            # original repeated them for each derived column.
            flat_text = text.as_string()
            gematria_value = gematria(flat_text)
            writer.writerow([
                ref,
                len(flat_text.replace(" ", "")),
                text.word_count(),
                text.verse_count(),
                gematria_value,
                as_hex(gematria_value),
                as_hex(gematria_value * 18),
            ])
 def get_location(html_fragment):
     """Return the gematria of the location numeral found in *html_fragment*.

     The fragment is used as a string. A brace-wrapped marker (one or two
     Hebrew letters, optionally followed by a hyphen or en dash and up to two
     more letters) is searched for anywhere in the string; its first numeral
     wins. Otherwise the string must start with the "@01" chapter heading,
     whose following one or two characters are used.

     Raises:
         AttributeError: if neither pattern matches (the heading match is
             None), as in the original code.
     """
     # Search once and reuse the match; the original ran the same search
     # twice, once as the test and once to get the groups.
     braced = re.search(u'\{([\u05d0-\u05ea]{1,2})-?–?[\u05d0-\u05ea]{0,2}\}', html_fragment)
     if braced:
         return gematria(braced.group(1))
     heading = re.match(u'@01פרק\s(.{1,2})', html_fragment)
     return gematria(heading.group(1))
 def get_location(html_fragment):
     """Extract the location number encoded in *html_fragment*.

     *html_fragment* is treated as a string. If it contains a brace-wrapped
     Hebrew numeral (optionally the start of a range), the gematria of that
     numeral is returned. Otherwise the string is expected to begin with the
     "@01" chapter heading, and the gematria of the one or two characters
     after it is returned.

     Raises:
         AttributeError: if the fallback heading pattern does not match
             either, as in the original code.
     """
     # A single search replaces the original's duplicate evaluation of the
     # same pattern.
     m = re.search(
         u'\{([\u05d0-\u05ea]{1,2})-?–?[\u05d0-\u05ea]{0,2}\}', html_fragment)
     if m is None:
         m = re.match(u'@01פרק\s(.{1,2})', html_fragment)
     return gematria(m.group(1))
Exemple #6
0
def update_topic(row):
    """Parse the "topic" cell of *row* into ``(topic_name, topic_num)``.

    The cell is expected to look like "<Hebrew numeral>. <name>", where the
    numeral is one to four Hebrew letters. The numeral is converted with
    ``gematria``.

    Returns:
        A ``(topic_name, topic_num)`` tuple, or None when the cell is empty
        or does not have the expected shape (the row is printed in the
        latter case).
    """
    raw_topic = row["topic"]
    if len(raw_topic) == 0:
        return None

    parsed = re.match(r"([\u05d0-\u05ea]{1,4})\. (.+)$", raw_topic)
    if parsed is None:
        # Surface the offending row for manual inspection.
        print(row)
        return None

    numeral, topic_name = parsed.groups()
    return topic_name, gematria(numeral)
Exemple #7
0
def update_chapter(row):
    """Parse the "chapter" cell of *row* into ``(chapter_name, chapter_num)``.

    The cell is expected to consist of the three-letter Hebrew word encoded
    in the pattern, a space, a numeral of one to three Hebrew letters,
    another space and the chapter name. The numeral is converted with
    ``gematria``.

    Returns:
        A ``(chapter_name, chapter_num)`` tuple, or None when the cell is
        empty or does not have the expected shape (the row is printed in the
        latter case).
    """
    raw_chapter = row["chapter"]
    if len(raw_chapter) == 0:
        return None

    parsed = re.match(r"\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,3}) (.+)$",
                      raw_chapter)
    if parsed is None:
        # Surface the offending row for manual inspection.
        print(row)
        return None

    numeral, chapter_name = parsed.groups()
    return chapter_name, gematria(numeral)
 def test_final_letters(self):
     """Final-form letters are assumed to carry their simple values."""
     final_forms = u"םןףךץ"
     expected_total = 280
     assert h.gematria(final_forms) == expected_total
Exemple #9
0
 def get_upper_range(html_fragment):
     """Return the gematria of the upper bound of a parenthesised range.

     The text must start with "(", one or two Hebrew letters, an optional
     hyphen or en dash (with optional surrounding whitespace), up to two
     Hebrew letters and ")". The second group of letters, which may be
     empty, is the value passed to ``gematria``.

     Raises:
         AttributeError: if the text does not start with such a marker.
     """
     # The original matched against an undefined name (``chunk``) and never
     # used its argument, so it could only raise NameError.
     # NOTE(review): sibling helpers receive either a plain string or an
     # object exposing ``.text``; both are accepted here, with the ``.text``
     # case stripped the same way get_verse does -- confirm against the
     # caller.
     if hasattr(html_fragment, 'text'):
         candidate = html_fragment.text.strip()
     else:
         candidate = html_fragment
     m = re.match(
         u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?([\u05d0-\u05ea]{0,2})\)', candidate)
     return gematria(m.group(1))
Exemple #10
0
 def test_with_nikkud(self):
     """A vocalised two-word phrase must equal the sum of its word values."""
     vocalised = 'הַדְּבָרִים אֲשֶׁר'
     expected_total = 501 + 261
     assert h.gematria(vocalised) == expected_total
Exemple #11
0
 def test_punctuation(self):
     """Interspersed brackets, punctuation, spaces and hyphens change nothing."""
     plain = "אבגדהוזחטיכלמנסעפקרשת"
     noisy = "אב[]גדהוז{}()?!ח..,,טיכלמנס    - -עפקרשת"
     assert h.gematria(plain) == h.gematria(noisy)
Exemple #12
0
 def test_simple_gematria(self):
     """Check a single word, a two-word phrase and the whole alphabet."""
     cases = [
         ("צדיק", 204),
         ("צדיק גמור", 204 + 249),
         ("אבגדהוזחטיכלמנסעפצקרשת", 1000 + 450 + 45),
     ]
     for phrase, expected in cases:
         assert h.gematria(phrase) == expected
Exemple #13
0
 def test_final_letters(self):
     """Final-form letters are assumed to be counted with their simple values."""
     final_forms = "םןףךץ"
     total = h.gematria(final_forms)
     assert total == 280
Exemple #14
0
 def get_mishna(html_fragment):
     """Return the mishna number found in the fragment's text, as unicode.

     When ``contains_range`` reports a range, the numeral is taken from
     before the hyphen of the plural heading; otherwise it is whatever
     follows the singular heading. The numeral is converted with
     ``gematria`` and the result is passed through ``unicode``.

     Raises:
         AttributeError: if the selected heading pattern is not found.
     """
     if contains_range(html_fragment):
         pattern = u'משניות (.*)-.*'
     else:
         pattern = u'משנה (.*)'
     numeral = re.search(pattern, html_fragment.text).group(1)
     return unicode(gematria(numeral))
 def test_punctuation(self):
     """Brackets, punctuation, spaces and hyphens must not affect the value."""
     letters_only = u"אבגדהוזחטיכלמנסעפקרשת"
     with_punctuation = u"אב[]גדהוז{}()?!ח..,,טיכלמנס    - -עפקרשת"
     expected = h.gematria(letters_only)
     assert expected == h.gematria(with_punctuation)
Exemple #16
0
def open_csv():
    """Parse "sefer_haagada.csv" into "parsed.csv" and "topics.csv".

    Rows are read with ``unicodecsv.DictReader`` and grouped into sources: a
    row with a non-empty "sourceNum" cell starts a new source, and the rows
    collected so far are passed to ``make_parsed_source`` together with the
    chapter, topic and source numbering in effect for them. Chapter and
    topic are tracked through ``update_chapter`` and ``update_topic``; a new
    chapter resets the topic and source counters. A diagnostic line is
    printed whenever a chapter, topic or source number is not exactly one
    more than the previous one.

    Falsy parse results are dropped and the rest are passed through
    ``disambiguate_all``. "parsed.csv" receives one row per source, with the
    three ref lists flattened to comma-separated ``normal()`` strings.
    "topics.csv" receives one row per run of consecutive sources that share
    a topic name.

    Returns:
        None. The results are written to the two CSV files in the current
        working directory.
    """
    sources = []
    curr_chapter_name, curr_chapter_num = None, 0
    curr_topic_name, curr_topic_num = None, 0
    curr_source_num = 0
    prev_rows = []

    with open("sefer_haagada.csv", "rb") as fin:
        for row in unicodecsv.DictReader(fin):
            has_source_num = len(row["sourceNum"]) > 0

            # A new source number closes the group collected so far. The
            # counters still describe that previous group at this point.
            if has_source_num and prev_rows:
                sources.append(
                    make_parsed_source(curr_chapter_name, curr_chapter_num,
                                       curr_topic_name, curr_topic_num,
                                       curr_source_num, prev_rows))
                prev_rows = []

            # update chapter
            new_chapter = update_chapter(row)
            if new_chapter:
                if new_chapter[1] != curr_chapter_num + 1:
                    print("{} <= {} {}".format(new_chapter[1],
                                               curr_chapter_num,
                                               curr_chapter_name))
                curr_chapter_name, curr_chapter_num = new_chapter
                # Topic and source numbering restart within each chapter.
                curr_topic_num = 0
                curr_source_num = 0

            # update topic
            new_topic = update_topic(row)
            if new_topic:
                if new_topic[1] != curr_topic_num + 1:
                    print("{} <= {} {}".format(new_topic[1], curr_topic_num,
                                               curr_topic_name))
                curr_topic_name, curr_topic_num = new_topic

            # update source num
            if has_source_num:
                new_source_num = gematria(row["sourceNum"])
                if new_source_num != curr_source_num + 1:
                    print("yoyoyo {} <= {} {} -- {}".format(
                        new_source_num, curr_source_num, curr_topic_name,
                        curr_topic_num))
                curr_source_num = new_source_num

            prev_rows.append(row)

        # Flush the final group, which no later "sourceNum" row closes.
        if prev_rows:
            sources.append(
                make_parsed_source(curr_chapter_name, curr_chapter_num,
                                   curr_topic_name, curr_topic_num,
                                   curr_source_num, prev_rows))

    sources = [parsed for parsed in sources if parsed]
    sources = disambiguate_all(sources)

    with open("parsed.csv", "wb") as fout:
        # Named ``writer`` so the local does not shadow the ``csv`` module
        # name.
        writer = unicodecsv.DictWriter(fout, [
            "chapter_name", "chapter_num", "topic_name", "topic_num",
            "source_num", "source", "commentary", "good_ref_list",
            "bad_ref_list", "ref_list"
        ])
        writer.writeheader()
        for s in sources:
            for key in ("ref_list", "good_ref_list", "bad_ref_list"):
                s[key] = ", ".join([r.normal() for r in s.get(key, [])])
        writer.writerows(sources)

    with open("topics.csv", "wb") as fout:
        # Keep the first source of every run of equal consecutive topic
        # names. This is a linear pass; the original used reduce() with list
        # concatenation, which is quadratic and relies on ``reduce`` being
        # in scope.
        unique_topics = []
        for s in sources:
            if (not unique_topics
                    or unique_topics[-1]["topic_name"] != s["topic_name"]):
                unique_topics.append({
                    "chapter_name": s["chapter_name"],
                    "topic_name": s["topic_name"]
                })
        writer = unicodecsv.DictWriter(fout, ["chapter_name", "topic_name"])
        writer.writeheader()
        writer.writerows(unique_topics)
 def test_simple_gematria(self):
     """Check a single word, a two-word phrase and the whole alphabet."""
     expectations = (
         (u"צדיק", 204),
         (u"צדיק גמור", 204 + 249),
         (u"אבגדהוזחטיכלמנסעפצקרשת", 1000 + 450 + 45),
     )
     for phrase, expected in expectations:
         assert h.gematria(phrase) == expected
Exemple #18
0
    def _extract_important_data(self):
        """Build a ``{chapter: {verse: [html chunks]}}`` mapping from HTML pages.

        Two directories are read (``onePath`` and ``path``, both names
        defined outside this method). Files in ``onePath`` supply chapter 1:
        each file name carries the verse numeral. Files in ``path`` supply
        the remaining chapters, with chapter and verse taken from marker
        paragraphs inside each page.

        Every ``<p>`` element is classified through helper methods on
        ``self`` (``useless``, ``contains_chapter``, ``contains_verse``,
        ``is_question``, ``is_footnote``, ``contains_DM``) and turned into a
        text chunk; duplicate chunks within the current list are skipped.

        NOTE(review): the ``ur'...'`` literals and ``print page`` statement
        make this method Python 2 only.

        Returns:
            dict mapping chapter number to a dict mapping verse number to a
            list of chunk strings.
        """

        chapters, verses, text = {}, {}, []
        # NOTE(review): ``chapter`` is reassigned to 2 below before it is
        # ever read, so the initial value 1 is unused.
        chapter, verse = 1, None

        # ---- Pass 1: pages in onePath, one verse of chapter 1 per file ----
        for page in os.listdir(onePath):
            page = unicode(page)
            # Skip hidden files.
            if page.startswith('.'):
                continue
            # Store what the previous file produced before starting a new verse.
            if verse and text:
                verses[verse] = text
                text = []
            # The verse numeral is the one or two Hebrew letters that follow
            # the fixed two-word prefix in the file name.
            # NOTE(review): raises AttributeError if a file name lacks it.
            verse = gematria(
                re.search(
                    u'\u05d1\u05e8\u05d0\u05e9\u05d9\u05ea \u05d0 ([\u05d0-\u05ea]{1,2})',
                    page).group(1))
            infile = io.open(onePath + page, 'r')
            soup = BeautifulSoup(infile, 'html5lib')
            infile.close()

            for p in soup.find_all('p'):
                if self.useless(p) or self.contains_chapter(p):
                    continue

                if self.is_question(p):
                    # Drop double non-breaking spaces, then replace the
                    # parenthesised Hebrew numeral (or range) with a bold label.
                    chunk = p.text.replace(u'\xa0\xa0', '').strip()
                    chunk = re.sub(
                        u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?[\u05d0-\u05ea]{0,2}\)',
                        u'<b>שאלות: </b>', chunk)
                    if chunk not in text:
                        text.append(chunk)
                elif self.is_footnote(p):
                    chunk = p.text.strip()
                    # Group 1: the single-letter footnote key in brackets;
                    # group 2: everything after it (the footnote body).
                    # NOTE(review): ``m`` is None when the paragraph has no
                    # "[letter]" key, and the loop below would then raise.
                    m = re.search(u'\[([\u05d0-\u05ea])\](.*)', chunk)
                    # Inline the footnote into every collected chunk that
                    # references the same key.
                    for segment in text:
                        if re.search(u'(\[' + m.group(1) + u'\])', segment):
                            i = text.index(segment)
                            text[i] = re.sub(
                                u'(\[' + m.group(1) + u'\])',
                                u'<sup>*</sup><i class="footnote">' +
                                m.group(2) + u'</i>', text[i])
                else:
                    # Ordinary paragraph: strip the parenthesised numeral marker.
                    chunk = re.sub(
                        u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?[\u05d0-\u05ea]{0,2}\)',
                        u'', p.text.strip())
                    if self.contains_DM(p):
                        # Wrap each quoted <span> text in <strong>.
                        # NOTE(review): psuq.text is interpolated into the
                        # pattern unescaped, so regex metacharacters in it
                        # would be interpreted -- confirm inputs are plain.
                        psuqim = p.find_all('span')
                        for psuq in psuqim:
                            chunk = re.sub(
                                u'\"\s?' + psuq.text + u'\s?\"',
                                ur'<strong>' + psuq.text + u'</strong>', chunk)
                    if chunk not in text:
                        text.append(chunk)

        # Store the last verse of pass 1 and file everything under chapter 1.
        verses[verse] = text
        text = []
        chapters[1] = verses
        verses = {}
        chapter = 2

        # ---- Pass 2: pages in path, chapter/verse read from the page body ----
        for page in os.listdir(path):
            print page
            # Verse numbering restarts at 1 for every page.
            cur_verse = 1
            # Skip hidden files.
            if page.startswith('.'):
                continue
            infile = io.open(path + page, 'r')
            soup = BeautifulSoup(infile, 'html5lib')
            infile.close()

            for p in soup.find_all('p'):
                if self.useless(p):
                    continue

                # A chapter marker only updates the current chapter number.
                if self.contains_chapter(p):
                    chapter = self.get_chapter(p)
                    continue

                if self.contains_verse(p):
                    new_verse = self.get_verse(p)
                    # Flush collected chunks when the verse changes.
                    # NOTE(review): cur_verse is only advanced when ``text``
                    # is non-empty -- confirm this is intended.
                    if cur_verse != new_verse and text:
                        verses[cur_verse] = text
                        text = []
                        cur_verse = new_verse

                # Same three-way handling as in pass 1.
                if self.is_question(p):
                    chunk = p.text.replace(u'\xa0\xa0', '').strip()
                    chunk = re.sub(
                        u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?[\u05d0-\u05ea]{0,2}\)',
                        u'<b>שאלות: </b>', chunk)
                    if chunk not in text:
                        text.append(chunk)
                elif self.is_footnote(p):
                    chunk = p.text.strip()
                    # NOTE(review): ``m`` may be None, as in pass 1.
                    m = re.search(u'\[([\u05d0-\u05ea])\](.*)', chunk)
                    for segment in text:
                        if re.search(u'(\[' + m.group(1) + u'\])', segment):
                            i = text.index(segment)
                            text[i] = re.sub(
                                u'(\[' + m.group(1) + u'\])',
                                u'<sup>*</sup><i class="footnote">' +
                                m.group(2) + u'</i>', text[i])
                else:
                    chunk = re.sub(
                        u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?[\u05d0-\u05ea]{0,2}\)',
                        u'', p.text.strip())
                    if self.contains_DM(p):
                        psuqim = p.find_all('span')
                        for psuq in psuqim:
                            chunk = re.sub(
                                u'\"\s?' + psuq.text + u'\s?\"',
                                ur'<strong>' + psuq.text + u'</strong>', chunk)
                    if chunk not in text:
                        text.append(chunk)
            else:
                # for/else: the loop has no ``break``, so this runs once the
                # paragraph loop finishes. The page's verses are stored under
                # whatever ``chapter`` holds at that point.
                verses[cur_verse] = text
                text = []
                chapters[chapter] = verses
                verses = {}

        return chapters
Exemple #19
0
 def get_verse(html_fragment):
     """Return the gematria of the verse numeral that opens the fragment.

     The stripped text of *html_fragment* must begin with a parenthesised
     marker: one or two Hebrew letters, optionally followed by a hyphen or
     en dash and up to two more letters. Only the first numeral is used.

     Raises:
         AttributeError: if the text does not start with such a marker.
     """
     stripped = html_fragment.text.strip()
     marker = re.match(
         u'\(([\u05d0-\u05ea]{1,2})(\s?-?–?\s?)[\u05d0-\u05ea]{0,2}\)',
         stripped)
     numeral = marker.group(1)
     return gematria(numeral)
Exemple #20
0
 def get_chapter(html_fragment):
     """Return the gematria of the chapter numeral in the fragment's text.

     The numeral is the one or two Hebrew letters found between two middle
     dots, each separated from the letters by a single space.

     Raises:
         AttributeError: if no such marker is present.
     """
     marker = re.search(u'· ([\u05d0-\u05ea]{1,2}) ·', html_fragment.text)
     numeral = marker.group(1)
     return gematria(numeral)
 def test_with_nikkud(self):
     """A vocalised phrase must be valued as the sum of its two words."""
     vocalised = u'הַדְּבָרִים אֲשֶׁר'
     total = h.gematria(vocalised)
     assert total == 501 + 261
Exemple #22
0
 def get_chapter(html_fragment):
     """Return the chapter number from the fragment's text, as unicode.

     The single character that follows the chapter heading word and a space
     is converted with ``gematria`` and the result is passed through
     ``unicode``.

     Raises:
         AttributeError: if the heading is not found in the text.
     """
     heading = re.search(u'פרק (.)', html_fragment.text)
     chapter_number = gematria(heading.group(1))
     return unicode(chapter_number)