def generate_refs_list(query=None):
    """
    Generate a list of refs to all available sections.

    'query' is an optional mongo query over the counts collection;
    defaults to matching everything.

    Side effect: count records that have no matching index record
    are removed from the counts collection.
    """
    # Avoid the mutable-default-argument pitfall; {} matches all docs.
    query = query if query is not None else {}
    refs = []
    counts = db.counts.find(query)
    for c in counts:
        if "title" not in c:
            continue  # this is a category count
        i = texts.get_index(c["title"])
        if "error" in i:
            # If there is no index record to match the count record,
            # the count should be removed.
            db.counts.remove(c)
            continue
        title = c["title"]
        he = list_from_counts(c["availableTexts"]["he"])
        en = list_from_counts(c["availableTexts"]["en"])
        sections = union(he, en)
        for n in sections:
            if i["categories"][0] == "Talmud":
                n = texts.section_to_daf(int(n))
            if "commentaryCategories" in i and i["commentaryCategories"][0] == "Talmud":
                split = n.split(":")
                # Bug fix: convert the whole first segment (split[0]) to a
                # daf, not just the first character (int(n[0])), which is
                # wrong for any multi-digit section number.
                n = ":".join([texts.section_to_daf(int(split[0]))] + split[1:])
            ref = "%s %s" % (title, n) if n else title
            refs.append(ref)
    return refs
def get_counts_doc(text):
    """
    Return the stored count doc for 'text', where 'text' is a text title,
    a text category, or a list of categories. Returns None when a
    category contains no texts at all.
    """
    if isinstance(text, list):
        # A list is taken as a complete category path.
        query = {"category": {"$all": text}}
    elif "error" not in sefaria.get_index(text):
        # 'text' names an actual text title.
        query = {"title": text}
    else:
        # Not a text title -- treat it as a category. Use the first text
        # filed under this category to recover its complete categories
        # list (e.g., "Prophets" -> ["Tanach", "Prophets"]).
        example = sefaria.db.index.find_one({"categories": text})
        if not example:
            # No text in this category means there is nothing to return.
            return None
        if example["categories"][0] == text:
            # Top level category: exclude subcategories from the match.
            query = {"$and": [{'category.0': {"$exists": False}}, {"category": text}]}
        else:
            query = {"category": {"$all": example["categories"]}}
    return sefaria.db.counts.find_one(query)
def update_table_of_contents():
    """
    Rebuild the table of contents from every index record plus the
    commentary text list, save it, and return it.
    """
    toc = []

    # File every known text under its category node.
    for record in sefaria.db.index.find():
        del record["_id"]
        if record["categories"][0] == "Commentary":
            # Commentary receives special handling below.
            continue
        if record["categories"][0] not in order:
            record["categories"].insert(0, "Other")
        parent = get_or_make_summary_node(toc, record["categories"])
        parent.append(add_counts_to_index(record))

    # Commentary texts are available without individual index records;
    # list them explicitly here.
    for title in sefaria.get_commentary_texts_list():
        record = sefaria.get_index(title)
        parent = get_or_make_summary_node(toc, record["categories"])
        parent.append(add_counts_to_index(record))

    # Annotate each top-level category node with counts.
    for cat in toc:
        add_counts_to_category(cat)

    # Sort the whole tree -- categories and texts -- recursively.
    toc = sort_toc_node(toc, recur=True)

    save_toc(toc)
    return toc
def export_text(text): """ Iterates through all text documents, writing a document to disk according to formats in export_formats """ print text["title"] index = get_index(text["title"]) if "error" in index: print "Skipping %s - %s" % (text["title"], index["error"]) return text.update(index) del text["_id"] text["text"] = text.pop("chapter") export_text_doc(text)
def update_table_of_contents():
    """
    Rebuild the table of contents from all index records and the
    commentary text list, save it (in memory and to the db), and
    return it.
    """
    toc = []

    # File every known text under its category node.
    for record in db.index.find():
        del record["_id"]
        if record["categories"][0] == "Commentary":
            # Commentary is handled separately below.
            continue
        if record["categories"][0] not in order:
            record["categories"].insert(0, "Other")
        parent = get_or_make_summary_node(toc, record["categories"])
        # get_or_make_summary_node returns the node's "contents" list, so
        # appending attaches this text's counts + index info to the toc.
        parent.append(add_counts_to_index(record))

    # Commentary texts which do not have individual index records.
    for title in texts.get_commentary_texts_list():
        record = texts.get_index(title)
        # TODO: duplicate index records, where one is a commentary and
        # another is not labeled as one, can make this crash; splicing
        # "Commentary" into the category path here works around it.
        if len(record["categories"]) >= 1 and record["categories"][0] == "Commentary":
            cats = record["categories"][1:2] + ["Commentary"] + record["categories"][2:]
        else:
            cats = record["categories"][0:1] + ["Commentary"] + record["categories"][1:]
        parent = get_or_make_summary_node(toc, cats)
        parent.append(add_counts_to_index(record))

    # Annotate each top-level category node with counts.
    for cat in toc:
        add_counts_to_category(cat)

    # Sort the whole tree -- categories and texts -- recursively.
    toc = sort_toc_node(toc, recur=True)

    save_toc(toc)
    save_toc_to_db()
    return toc
def get_counts_doc(text):
    """
    Return the stored count doc for 'text' -- a text title, a text
    category, or a list of categories. Returns None for a category
    that contains no texts.
    """
    if isinstance(text, list):
        # A list is a complete category path.
        return sefaria.db.counts.find_one({"category": {"$all": text}})

    i = sefaria.get_index(text)
    if "error" not in i:
        # 'text' is a real text title.
        return sefaria.db.counts.find_one({"title": text})

    # Otherwise treat 'text' as a category name: look up the first text
    # filed under it and use that text's complete categories list
    # (e.g., "Prophets" -> ["Tanach", "Prophets"]).
    example = sefaria.db.index.find_one({"categories": text})
    if not example:
        # An empty category has no counts doc.
        return None

    if example["categories"][0] == text:
        # Top level category: don't match subcategories.
        query = {
            "$and": [
                {'category.0': {"$exists": False}},
                {"category": text},
            ]
        }
    else:
        query = {"category": {"$all": example["categories"]}}
    return sefaria.db.counts.find_one(query)
def update_summaries_on_change(ref, old_ref=None, recount=True):
    """
    Update text summary docs to account for change or insertion of the
    text 'ref'.

    * old_ref - previous title to look for, when the text was renamed
    * recount - whether or not to perform a new count of available text
    """
    global toc
    toc = get_toc()

    index = sefaria.get_index(ref)
    if "error" in index:
        return index

    if recount:
        sefaria.update_text_count(ref)

    # Texts with an unknown top-level category get filed under "Other".
    resort_other = False
    if index["categories"][0] not in order:
        index["categories"].insert(0, "Other")
        resort_other = True

    node = get_or_make_summary_node(toc, index["categories"])
    text = add_counts_to_index(index)

    # Update the existing entry in place (looking it up by its old title
    # if one was given); otherwise add a new entry.
    lookup_title = old_ref or text["title"]
    current = next((entry for entry in node if entry.get("title") == lookup_title), None)
    if current is not None:
        current.update(text)
    else:
        node.append(text)
    node[:] = sort_toc_node(node)

    # A new category may have been added to "Other"; resort its contents.
    if resort_other:
        toc[-1]["contents"] = sort_toc_node(toc[-1]["contents"])

    save_toc(toc)
def update_links_count(text=None): """ Counts the links that point to a particular text, or all of them Results are stored them on the 'linksCount' field of the counts document """ if not text: counts = db.counts.find({"title": {"$exists": 1}}) for c in counts: if c["title"]: update_links_count(text=c["title"]) print "%s" % text index = texts.get_index(text) if "error" in index: return index c = { "title": text } c = db.counts.find_one(c) c["linksCount"] = db.links.find({"refs": {"$regex": texts.make_ref_re(text)}}).count() db.counts.save(c)
def update_text_count(ref, index=None):
    """
    Update the count records of the text specified by 'ref' (currently
    at book level only) by performing a count.

    Saves the updated index and counts docs; returns the counts doc, or
    an error dict when the index lookup or a count fails.
    """
    # NOTE(review): the 'index' argument is ignored -- it is always
    # replaced by a fresh lookup here.
    index = sefaria.get_index(ref)
    if "error" in index:
        return index

    # Start a fresh counts doc, discarding any stored one for this title.
    c = {"title": ref}
    sefaria.db.counts.remove(c)

    if index["categories"][0] in ("Tanach", "Mishna", "Talmud"):
        # For these texts, consider what is present in the db across
        # English and Hebrew to represent actual total counts
        counts = count_texts(ref)
        if "error" in counts:
            return counts
        index["lengths"] = counts["lengths"]
        c["sectionCounts"] = zero_jagged_array(counts["counts"])
    else:
        if "length" in index:
            index["lengths"] = [index["length"]]

    # Count each language separately.
    en = count_texts(ref, lang="en")
    if "error" in en:
        return en
    he = count_texts(ref, lang="he")
    if "error" in he:
        return he

    # Totals come from the combined count above when it ran; otherwise
    # from the union of what exists in either language.
    if "sectionCounts" in c:
        totals = c["sectionCounts"]
    else:
        totals = zero_jagged_array(sum_count_arrays(en["counts"], he["counts"]))

    enCount = sum_count_arrays(en["counts"], totals)
    heCount = sum_count_arrays(he["counts"], totals)

    c["availableTexts"] = {
        "en": enCount,
        "he": heCount,
    }

    c["availableCounts"] = {
        "en": en["lengths"],
        "he": he["lengths"],
    }

    if "length" in index:
        # Percent available per language, summed across all depth levels
        # of the recorded lengths.
        depth = len(index["lengths"])
        heTotal = enTotal = total = 0
        for i in range(depth):
            heTotal += he["lengths"][i]
            enTotal += en["lengths"][i]
            total += index["lengths"][i]
        if total == 0:
            hp = ep = 0
        else:
            hp = heTotal / float(total) * 100
            ep = enTotal / float(total) * 100
    else:
        hp = ep = 0

    c["percentAvailable"] = {
        "he": hp,
        "en": ep,
    }

    c["textComplete"] = {
        "he": hp > 99.9,
        "en": ep > 99.9,
    }

    # Persist both the (possibly updated) index and the new counts doc.
    sefaria.db.index.save(index)
    sefaria.db.counts.save(c)

    return c
def update_text_count(ref, index=None):
    """
    Update the count records of the text specified by 'ref' (currently
    at book level only) by performing a count.

    Saves the updated counts doc; returns it, or an error dict when the
    index lookup or a count fails.
    """
    # NOTE(review): the 'index' argument is ignored -- it is always
    # replaced by a fresh lookup here.
    index = sefaria.get_index(ref)
    if "error" in index:
        return index

    # Start a fresh counts doc, discarding any stored one for this title.
    c = { "title": ref }
    sefaria.db.counts.remove(c)

    if index["categories"][0] in ("Tanach", "Mishna", "Talmud"):
        # For these texts, consider what is present in the db across
        # English and Hebrew to represent actual total counts
        counts = count_texts(ref)
        if "error" in counts:
            return counts
        c["sectionCounts"] = zero_jagged_array(counts["counts"])

    # Count each language separately.
    en = count_texts(ref, lang="en")
    if "error" in en:
        return en
    he = count_texts(ref, lang="he")
    if "error" in he:
        return he

    # Totals come from the combined count above when it ran; otherwise
    # from the union of what exists in either language.
    if "sectionCounts" in c:
        totals = c["sectionCounts"]
    else:
        totals = zero_jagged_array(sum_count_arrays(en["counts"], he["counts"]))

    enCount = sum_count_arrays(en["counts"], totals)
    heCount = sum_count_arrays(he["counts"], totals)

    c["availableTexts"] = {
        "en": enCount,
        "he": heCount,
    }

    c["availableCounts"] = {
        "en": en["lengths"],
        "he": he["lengths"],
    }

    # Both keys required: unlike the sibling variant, this version never
    # writes index["lengths"] itself.
    if "length" in index and "lengths" in index:
        # Percent available per language, summed across all depth levels
        # of the recorded lengths.
        depth = len(index["lengths"])
        heTotal = enTotal = total = 0
        for i in range(depth):
            heTotal += he["lengths"][i]
            enTotal += en["lengths"][i]
            total += index["lengths"][i]
        if total == 0:
            hp = ep = 0
        else:
            hp = heTotal / float(total) * 100
            ep = enTotal / float(total) * 100
    else:
        hp = ep = 0

    c["percentAvailable"] = {
        "he": hp,
        "en": ep,
    }

    c["textComplete"] = {
        "he": hp > 99.9,
        "en": ep > 99.9,
    }

    sefaria.db.counts.save(c)

    return c
# Connect to the Sefaria mongo db, authenticating when credentials are set.
connection = pymongo.Connection()
db = connection[t.SEFARIA_DB]
if t.SEFARIA_DB_USER and t.SEFARIA_DB_PASSWORD:
    db.authenticate(t.SEFARIA_DB_USER, t.SEFARIA_DB_PASSWORD)

# Hard-coded acting user id for writes made by this script.
user = 28

# Walk every Hebrew text version, building per-title accumulators
# (text_total) and a stable first-seen title ordering (text_order).
texts = db.texts.find({"language": "he"})
text_total = {}
text_order = []
for text in texts:
    if text['title'] not in text_total:
        text_total[text["title"]] = 0
        text_order.append(text["title"])
    print text["title"]
    index = t.get_index(text["title"])
    if not index or not index.get("categories"):
        print "No index found for " + text["title"]
        continue
    if "Tanach" in index['categories']:
        # Tanach texts are skipped entirely.
        continue
    talmud = True if "Talmud" in index['categories'] else False
    for i in range(len(text['chapter'])):
        if talmud:
            if "Bavli" in index['categories'] and i < 2:
                # Skip the first two section slots for Bavli --
                # presumably because Bavli numbering starts at daf 2;
                # TODO confirm.
                continue
            # Talmud sections are addressed by daf, not chapter number.
            chap = t.section_to_daf(i + 1)
        else:
            chap = i + 1
        # NOTE(review): this chunk appears truncated -- 'ref' is built
        # but never used in the visible code.
        ref = text['title'] + " " + str(chap)
def update_text_count(ref, index=None):
    """
    Update the count records of the text specified by 'ref' (currently
    at book level only) by performing a count.

    Saves the updated counts doc; returns it, or an error dict when the
    index lookup or a count fails.
    """
    # NOTE(review): the 'index' argument is ignored -- it is always
    # replaced by a fresh lookup here.
    index = texts.get_index(ref)
    if "error" in index:
        return index

    # Reuse the stored counts doc when there is one, so fields not
    # recomputed below are preserved (unlike the sibling variants,
    # which remove the old doc).
    c = { "title": ref }
    existing = db.counts.find_one(c)
    if existing:
        c = existing

    # NOTE(review): this variant spells "Mishnah"; a sibling variant
    # uses "Mishna" -- confirm which matches the stored category.
    if index["categories"][0] in ("Tanach", "Mishnah", "Talmud"):
        # For these texts, consider what is present in the db across
        # English and Hebrew to represent actual total counts
        counts = count_texts(ref)
        if "error" in counts:
            return counts
        c["sectionCounts"] = zero_jagged_array(counts["counts"])

    # Count each language separately.
    en = count_texts(ref, lang="en")
    if "error" in en:
        return en
    he = count_texts(ref, lang="he")
    if "error" in he:
        return he

    # Totals come from the combined count above (or a previously stored
    # value); otherwise from the union of both languages' counts.
    if "sectionCounts" in c:
        totals = c["sectionCounts"]
    else:
        totals = zero_jagged_array(sum_count_arrays(en["counts"], he["counts"]))

    enCount = sum_count_arrays(en["counts"], totals)
    heCount = sum_count_arrays(he["counts"], totals)

    c["availableTexts"] = {
        "en": enCount,
        "he": heCount,
    }

    c["availableCounts"] = {
        "en": en["lengths"],
        "he": he["lengths"],
    }

    if "length" in index and "lengths" in index:
        # Percent available per language, summed across all depth levels
        # of the recorded lengths.
        depth = len(index["lengths"])
        heTotal = enTotal = total = 0
        for i in range(depth):
            heTotal += he["lengths"][i]
            enTotal += en["lengths"][i]
            total += index["lengths"][i]
        if total == 0:
            hp = ep = 0
        else:
            hp = heTotal / float(total) * 100
            ep = enTotal / float(total) * 100
        #temp check to see if text has wrong metadata leading to incorrect (to high) percentage
        """if hp > 100:
            print index["title"], " in hebrew has stats out of order: ", heTotal, "/", total, "=", hp
        if ep > 100:
            print index["title"], " in english has stats out of order: ", enTotal, "/", total, "=", ep"""
    elif "length" in index:
        # Fall back to the top-level counts when per-depth lengths are
        # not recorded on the index.
        hp = c["availableCounts"]["he"][0] / float(index["length"]) * 100
        ep = c["availableCounts"]["en"][0] / float(index["length"]) * 100
    else:
        hp = ep = 0

    c["percentAvailable"] = {
        "he": hp,
        "en": ep,
    }

    c["textComplete"] = {
        "he": hp > 99.9,
        "en": ep > 99.9,
    }

    # Estimate how much of the text we have, per language.
    c['estimatedCompleteness'] = {
        "he" : estimate_completeness('he', index, c),
        "en" : estimate_completeness('en', index, c)
    }

    db.counts.save(c)

    return c