def export_stats_by_ref(refs, filename):
    """Write a CSV of letter/word/verse counts and gematria values for each ref.

    :param refs: iterable of textual references resolvable by ``Ref``.
    :param filename: name of the CSV created under STATICFILES_DIRS[0]/files/.
    """
    with open("%s/files/%s" % (STATICFILES_DIRS[0], filename), 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "ref",
            "letters",
            "words",
            "verses",
            "gematria",
            "gematriaHex",
            "gematria18Hex",
        ])
        for ref in refs:
            print(ref)
            oRef = Ref(ref)
            text = oRef.text(lang="he", vtitle="Tanach with Text Only")
            # Hoist: the original recomputed as_string()/gematria() three
            # times per row; compute each once.
            s = text.as_string()
            g = gematria(s)
            writer.writerow([
                ref,
                len(s.replace(" ", "")),
                text.word_count(),
                text.verse_count(),
                g,
                "{0:#0{1}x}".format(g, 8).replace("0x", "#"),
                "{0:#0{1}x}".format(g * 18, 8).replace("0x", "#"),
            ])
def export_stats_by_ref(refs, filename):
    """Write a CSV of letter/word/verse counts and gematria values for each ref.

    :param refs: iterable of textual references resolvable by ``Ref``.
    :param filename: name of the CSV created under STATICFILES_DIRS[0]/files/.
    """
    with open("%s/files/%s" % (STATICFILES_DIRS[0], filename), 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "ref",
            "letters",
            "words",
            "verses",
            "gematria",
            "gematriaHex",
            "gematria18Hex",
        ])
        for ref in refs:
            # print(x) is valid in both py2 and py3, unlike `print ref`.
            print(ref)
            oRef = Ref(ref)
            text = oRef.text(lang="he", vtitle="Tanach with Text Only")
            # Hoist: the original recomputed as_string()/gematria() three
            # times per row; compute each once.
            s = text.as_string()
            g = gematria(s)
            writer.writerow([
                ref,
                len(s.replace(" ", "")),
                text.word_count(),
                text.verse_count(),
                g,
                "{0:#0{1}x}".format(g, 8).replace("0x", "#"),
                "{0:#0{1}x}".format(g * 18, 8).replace("0x", "#"),
            ])
def export_stats_by_ref(refs, filename):
    """Write a CSV of letter/word/verse counts and gematria values for each ref.

    :param refs: iterable of textual references resolvable by ``Ref``.
    :param filename: name of the CSV created under SEFARIA_EXPORT_PATH/misc/.
    """
    with open("%s/misc/%s" % (SEFARIA_EXPORT_PATH, filename), 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Ref",
            "Letters",
            "Words",
            "Verses",
            "Gematria",
            "Gematria Hex",
            "Gematria x 18 Hex",
        ])
        for ref in refs:
            # print(x) is valid in both py2 and py3, unlike `print ref`.
            print(ref)
            oRef = Ref(ref)
            text = oRef.text(lang="he", vtitle="Tanach with Text Only")
            # Hoist: the original recomputed as_string()/gematria() three
            # times per row; compute each once.
            s = text.as_string()
            g = gematria(s)
            writer.writerow([
                ref,
                len(s.replace(" ", "")),
                text.word_count(),
                text.verse_count(),
                g,
                "{0:#0{1}x}".format(g, 8).replace("0x", "#"),
                "{0:#0{1}x}".format(g * 18, 8).replace("0x", "#"),
            ])
def get_location(html_fragment):
    """Return the location number (gematria) parsed from an HTML fragment.

    Prefers a "{letters}" / "{letters-letters}" marker anywhere in the
    fragment; falls back to a "@01פרק X" heading at the start.
    """
    soup = html_fragment
    # Fix: the original ran the identical re.search twice (once in the
    # condition, once for the match object); run it once and reuse.
    m = re.search(u'\{([\u05d0-\u05ea]{1,2})-?–?[\u05d0-\u05ea]{0,2}\}', soup)
    if m:
        return gematria(m.group(1))
    m = re.match(u'@01פרק\s(.{1,2})', soup)
    return gematria(m.group(1))
def get_location(html_fragment):
    """Return the location number (gematria) parsed from an HTML fragment.

    Prefers a "{letters}" / "{letters-letters}" marker anywhere in the
    fragment; falls back to a "@01פרק X" heading at the start.
    """
    soup = html_fragment
    # Fix: the original ran the identical re.search twice (once in the
    # condition, once for the match object); run it once and reuse.
    m = re.search(u'\{([\u05d0-\u05ea]{1,2})-?–?[\u05d0-\u05ea]{0,2}\}', soup)
    if m:
        return gematria(m.group(1))
    m = re.match(u'@01פרק\s(.{1,2})', soup)
    return gematria(m.group(1))
def update_topic(row):
    """Parse the "topic" cell into (topic_name, topic_number) or return None.

    The cell is expected to look like "<hebrew-letters>. <name>"; the letters
    are converted to a number via gematria. Unparseable non-empty cells are
    printed and yield None.
    """
    cell = row["topic"]
    if not cell:
        return None
    parsed = re.match(r"([\u05d0-\u05ea]{1,4})\. (.+)$", cell)
    if parsed is None:
        print(row)
        return None
    return parsed.group(2), gematria(parsed.group(1))
def update_chapter(row):
    """Parse the "chapter" cell into (chapter_name, chapter_number) or None.

    The cell is expected to look like "פרק <hebrew-letters> <name>"; the
    letters are converted to a number via gematria. Unparseable non-empty
    cells are printed and yield None.
    """
    cell = row["chapter"]
    if not cell:
        return None
    parsed = re.match(r"\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,3}) (.+)$", cell)
    if parsed is None:
        print(row)
        return None
    return parsed.group(2), gematria(parsed.group(1))
def test_final_letters(self):
    # Final forms (ם ן ף ך ץ) must be valued like their simple counterparts,
    # not with the extended 500-900 values.
    expected = 280
    assert h.gematria(u"םןףךץ") == expected
def get_upper_range(html_fragment):
    """Return the upper bound of a "(X-Y)" Hebrew-letter range as a number.

    Fix: the regex was matched against the undefined name ``chunk`` (a
    NameError at runtime); it now matches against the fragment actually
    passed in.
    """
    soup = html_fragment
    m = re.match(
        u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?([\u05d0-\u05ea]{0,2})\)', soup)
    return gematria(m.group(1))
def test_with_nikkud(self):
    # Vowel points (nikkud) must be ignored; only consonants contribute.
    expected = 501 + 261
    assert h.gematria('הַדְּבָרִים אֲשֶׁר') == expected
def test_punctuation(self):
    # Brackets, punctuation and stray dashes must not change the value.
    clean = "אבגדהוזחטיכלמנסעפקרשת"
    noisy = "אב[]גדהוז{}()?!ח..,,טיכלמנס - -עפקרשת"
    assert h.gematria(clean) == h.gematria(noisy)
def test_simple_gematria(self):
    # A single word, two words, and the full alphabet.
    assert h.gematria("צדיק") == 204
    assert h.gematria("צדיק גמור") == 453
    assert h.gematria("אבגדהוזחטיכלמנסעפצקרשת") == 1495
def test_final_letters(self):
    # Final forms (ם ן ף ך ץ) must be valued like their simple counterparts,
    # not with the extended 500-900 values.
    expected = 280
    assert h.gematria("םןףךץ") == expected
def get_mishna(html_fragment):
    """Return the (first) mishna number in a fragment as a unicode string.

    Range markers ("משניות X-Y") yield the start of the range; a single
    marker ("משנה X") yields that mishna.
    """
    soup = html_fragment
    if contains_range(soup):
        letters = re.search(u'משניות (.*)-.*', soup.text).group(1)
    else:
        letters = re.search(u'משנה (.*)', soup.text).group(1)
    return unicode(gematria(letters))
def test_punctuation(self):
    # Brackets, punctuation and stray dashes must not change the value.
    clean = u"אבגדהוזחטיכלמנסעפקרשת"
    noisy = u"אב[]גדהוז{}()?!ח..,,טיכלמנס - -עפקרשת"
    assert h.gematria(clean) == h.gematria(noisy)
def open_csv():
    """Parse sefer_haagada.csv into sources; write parsed.csv and topics.csv.

    Walks the raw spreadsheet row by row, tracking the current chapter,
    topic and source number (all encoded as Hebrew-letter gematria in the
    input), groups consecutive rows into one source, disambiguates the
    sources' citations, then dumps two CSVs: the parsed sources and the
    unique chapter/topic pairs.
    """
    sources = []
    # Running parse state: the chapter/topic/source currently being
    # accumulated, and the raw rows belonging to that source.
    curr_chapter_name, curr_chapter_num, curr_topic_name, curr_topic_num, curr_source_num, curr_source, prev_rows = None, 0, None, 0, 0, None, []
    with open("sefer_haagada.csv", "rb") as fin:
        csv = unicodecsv.DictReader(fin)
        for row in csv:
            # if len(sources) >= 30:
            #     break
            # A non-empty sourceNum marks the start of a new source: flush
            # the rows collected for the previous source first.
            if len(row["sourceNum"]) > 0:
                if len(prev_rows) > 0:
                    sources += [
                        make_parsed_source(curr_chapter_name,
                                           curr_chapter_num, curr_topic_name,
                                           curr_topic_num, curr_source_num,
                                           prev_rows)
                    ]
                prev_rows = []
            # update chapter
            new_chapter = update_chapter(row)
            if new_chapter:
                # Chapters should be sequential; log any gap or regression.
                if new_chapter[1] != curr_chapter_num + 1:
                    print("{} <= {} {}".format(new_chapter[1],
                                               curr_chapter_num,
                                               curr_chapter_name))
                curr_chapter_name, curr_chapter_num = new_chapter
                # Topic and source numbering restart with each chapter.
                curr_topic_num = 0
                curr_source_num = 0
            # update topic
            new_topic = update_topic(row)
            if new_topic:
                # Topics should be sequential within a chapter; log gaps.
                if new_topic[1] != curr_topic_num + 1:
                    print("{} <= {} {}".format(new_topic[1], curr_topic_num,
                                               curr_topic_name))
                curr_topic_name, curr_topic_num = new_topic
            # update source num
            if len(row["sourceNum"]) > 0:
                new_source_num = gematria(row["sourceNum"])
                # Source numbers should also be sequential; log gaps.
                if new_source_num != curr_source_num + 1:
                    print("yoyoyo {} <= {} {} -- {}".format(
                        new_source_num, curr_source_num, curr_topic_name,
                        curr_topic_num))
                curr_source_num = new_source_num
            prev_rows += [row]
    # Flush the final source left in the buffer when the file ends.
    if len(prev_rows) > 0:
        sources += [
            make_parsed_source(curr_chapter_name, curr_chapter_num,
                               curr_topic_name, curr_topic_num,
                               curr_source_num, prev_rows)
        ]
    # Drop falsy entries (sources the parser could not build).
    sources = [_f for _f in sources if _f]
    sources = disambiguate_all(sources)
    with open("parsed.csv", "wb") as fout:
        csv = unicodecsv.DictWriter(fout, [
            "chapter_name", "chapter_num", "topic_name", "topic_num",
            "source_num", "source", "commentary", "good_ref_list",
            "bad_ref_list", "ref_list"
        ])
        csv.writeheader()
        for s in sources:
            # Serialize Ref objects into comma-separated normal forms so
            # they can be written as plain CSV cells.
            s["ref_list"] = ", ".join(
                [r.normal() for r in s.get("ref_list", [])])
            s["good_ref_list"] = ", ".join(
                [r.normal() for r in s.get("good_ref_list", [])])
            s["bad_ref_list"] = ", ".join(
                [r.normal() for r in s.get("bad_ref_list", [])])
        csv.writerows(sources)
    with open("topics.csv", "wb") as fout:
        # Collapse runs of consecutive sources sharing a topic_name down to
        # one (chapter_name, topic_name) entry each.
        unique_topics = [{
            "chapter_name": x["chapter_name"],
            "topic_name": x["topic_name"]
        } for x in reduce(
            lambda a, b: a + ([b] if (len(a) == 0 or a[-1]['topic_name'] != b[
                'topic_name']) else []), sources, [])]
        csv = unicodecsv.DictWriter(fout, ["chapter_name", "topic_name"])
        csv.writeheader()
        csv.writerows(unique_topics)
def test_simple_gematria(self):
    # A single word, two words, and the full alphabet.
    assert h.gematria(u"צדיק") == 204
    assert h.gematria(u"צדיק גמור") == 453
    assert h.gematria(u"אבגדהוזחטיכלמנסעפצקרשת") == 1495
def _extract_important_data(self):
    """Build a {chapter: {verse: [html chunks]}} dict from scraped pages.

    Pass 1 reads chapter one from ``onePath`` (one file per verse, verse
    number encoded in the file name); pass 2 reads the remaining chapters
    from ``path``, where chapter/verse markers appear inside the pages.
    NOTE(review): depends on module-level ``onePath``/``path`` and uses
    py2-only constructs (``unicode``, ``print page``, ``ur''``) — this
    block is Python 2 code; confirm before running under Python 3.
    """
    chapters, verses, text = {}, {}, []
    chapter, verse = 1, None
    # --- Pass 1: chapter one, one file per verse under onePath ---
    for page in os.listdir(onePath):
        page = unicode(page)
        if page.startswith('.'):
            continue
        # Flush the previous verse's chunks before starting a new verse.
        if verse and text:
            verses[verse] = text
            text = []
        # Verse number is the gematria of the letters in the file name,
        # which matches "בראשית א <letters>".
        verse = gematria(
            re.search(
                u'\u05d1\u05e8\u05d0\u05e9\u05d9\u05ea \u05d0 ([\u05d0-\u05ea]{1,2})',
                page).group(1))
        infile = io.open(onePath + page, 'r')
        soup = BeautifulSoup(infile, 'html5lib')
        infile.close()
        for p in soup.find_all('p'):
            if self.useless(p) or self.contains_chapter(p):
                continue
            if self.is_question(p):
                # Question paragraph: replace the "(X-Y)" range marker
                # with a bold "questions" label.
                chunk = p.text.replace(u'\xa0\xa0', '').strip()
                chunk = re.sub(
                    u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?[\u05d0-\u05ea]{0,2}\)',
                    u'<b>שאלות: </b>', chunk)
                if chunk not in text:
                    text.append(chunk)
            elif self.is_footnote(p):
                # Footnote paragraph: inline its body into whichever
                # collected chunk carries the matching "[letter]" marker.
                chunk = p.text.strip()
                m = re.search(u'\[([\u05d0-\u05ea])\](.*)', chunk)
                for segment in text:
                    if re.search(u'(\[' + m.group(1) + u'\])', segment):
                        i = text.index(segment)
                        text[i] = re.sub(
                            u'(\[' + m.group(1) + u'\])',
                            u'<sup>*</sup><i class="footnote">' +
                            m.group(2) + u'</i>', text[i])
            else:
                # Ordinary paragraph: strip the range marker and wrap
                # quoted span text in <strong>.
                chunk = re.sub(
                    u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?[\u05d0-\u05ea]{0,2}\)',
                    u'', p.text.strip())
                if self.contains_DM(p):
                    psuqim = p.find_all('span')
                    for psuq in psuqim:
                        chunk = re.sub(
                            u'\"\s?' + psuq.text + u'\s?\"',
                            ur'<strong>' + psuq.text + u'</strong>', chunk)
                if chunk not in text:
                    text.append(chunk)
        verses[verse] = text
        text = []
    chapters[1] = verses
    verses = {}
    chapter = 2
    # --- Pass 2: remaining chapters; markers are embedded in the pages ---
    for page in os.listdir(path):
        print page
        cur_verse = 1
        if page.startswith('.'):
            continue
        infile = io.open(path + page, 'r')
        soup = BeautifulSoup(infile, 'html5lib')
        infile.close()
        for p in soup.find_all('p'):
            if self.useless(p):
                continue
            if self.contains_chapter(p):
                chapter = self.get_chapter(p)
                continue
            if self.contains_verse(p):
                # New verse marker: flush accumulated chunks under the
                # previous verse number.
                new_verse = self.get_verse(p)
                if cur_verse != new_verse and text:
                    verses[cur_verse] = text
                    text = []
                cur_verse = new_verse
            if self.is_question(p):
                chunk = p.text.replace(u'\xa0\xa0', '').strip()
                chunk = re.sub(
                    u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?[\u05d0-\u05ea]{0,2}\)',
                    u'<b>שאלות: </b>', chunk)
                if chunk not in text:
                    text.append(chunk)
            elif self.is_footnote(p):
                chunk = p.text.strip()
                m = re.search(u'\[([\u05d0-\u05ea])\](.*)', chunk)
                for segment in text:
                    if re.search(u'(\[' + m.group(1) + u'\])', segment):
                        i = text.index(segment)
                        text[i] = re.sub(
                            u'(\[' + m.group(1) + u'\])',
                            u'<sup>*</sup><i class="footnote">' +
                            m.group(2) + u'</i>', text[i])
            else:
                chunk = re.sub(
                    u'\([\u05d0-\u05ea]{1,2}\s?-?–?\s?[\u05d0-\u05ea]{0,2}\)',
                    u'', p.text.strip())
                if self.contains_DM(p):
                    psuqim = p.find_all('span')
                    for psuq in psuqim:
                        chunk = re.sub(
                            u'\"\s?' + psuq.text + u'\s?\"',
                            ur'<strong>' + psuq.text + u'</strong>', chunk)
                if chunk not in text:
                    text.append(chunk)
        else:
            # for-else: the paragraph loop has no break, so this always
            # runs once per page — commit the last verse and the chapter
            # built from this page.
            verses[cur_verse] = text
            text = []
            chapters[chapter] = verses
            verses = {}
    return chapters
def get_verse(html_fragment):
    """Return the verse number parsed from a leading "(X[-Y])" marker."""
    stripped = html_fragment.text.strip()
    match = re.match(
        u'\(([\u05d0-\u05ea]{1,2})(\s?-?–?\s?)[\u05d0-\u05ea]{0,2}\)',
        stripped)
    return gematria(match.group(1))
def get_chapter(html_fragment):
    """Return the chapter number parsed from a "· X ·" marker."""
    match = re.search(u'· ([\u05d0-\u05ea]{1,2}) ·', html_fragment.text)
    return gematria(match.group(1))
def test_with_nikkud(self):
    # Vowel points (nikkud) must be ignored; only consonants contribute.
    expected = 501 + 261
    assert h.gematria(u'הַדְּבָרִים אֲשֶׁר') == expected
def get_chapter(html_fragment):
    """Return the chapter number, as a unicode string, from a "פרק X" heading."""
    letter = re.search(u'פרק (.)', html_fragment.text).group(1)
    return unicode(gematria(letter))