def body(self):
    """UI test: Talmud text shows no cantillation and no text-setting toggles;
    Prophets show only the vocalization toggle; Torah shows both toggles."""

    def open_settings_and_check(expect_aliyot, expect_vocalization):
        # Open the text settings panel and verify toggle visibility,
        # preserving the original call/assert interleaving.
        self.toggle_on_text_settings()
        shown = self.is_aliyot_toggleSet_displayed()
        assert shown if expect_aliyot else not shown
        shown = self.is_vocalization_toggleSet_displayed()
        assert shown if expect_vocalization else not shown

    self.browse_to_ref("Shabbat 2b")
    assert not has_cantillation(self.get_nth_section_hebrew(1).text)
    assert not has_cantillation(self.get_nth_section_hebrew(1).text, False)
    open_settings_and_check(False, False)
    self.toggle_language_bilingual()
    self.browse_to_ref("Joshua 2")
    open_settings_and_check(False, True)
    self.browse_to_ref("Genesis 1")
    open_settings_and_check(True, True)
def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
    """
    Find word forms matching ``input_word`` and return a list of headword
    query dicts (``[{'headword': ...}, ...]``), or ``[]`` if nothing matches.

    :param input_word: word to look up; Hebrew input is stripped of cantillation.
    :param lookup_key: word-form field to query ('form' by default; switched to
        'c_form' when the stripped input is purely consonantal).
    :param kwargs: may contain ``lookup_ref``, a ref string used to restrict
        the search to forms attested in that ref.
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref
    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        # A fully consonantal word should be matched against the consonantal
        # form field rather than the vocalized one.
        if not has_cantillation(input_word, detect_vowels=True):
            wform_pkey = 'c_form'
    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        nref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(nref)}
    forms = WordFormSet(query_obj)
    # The ref restriction may have been too narrow; retry globally before
    # reporting no results.
    if lookup_ref and forms.count() == 0:
        del query_obj["refs"]
        forms = WordFormSet(query_obj)
    if forms.count() > 0:
        # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
        return [{'headword': lookup['headword']}
                for form in forms for lookup in form.lookups]
    else:
        return []
def body(self):
    """UI test: a modern commentary text has no cantillation and no toggles;
    Prophets expose only vocalization; Torah exposes aliyot and vocalization."""

    def open_settings_and_check(expect_aliyot, expect_vocalization):
        # Open the text settings panel and verify toggle visibility,
        # preserving the original call/assert interleaving.
        self.toggle_on_text_settings()
        shown = self.is_aliyot_toggleSet_displayed()
        assert shown if expect_aliyot else not shown
        shown = self.is_vocalization_toggleSet_displayed()
        assert shown if expect_vocalization else not shown

    # changed to a book that should NEVER get cantillation
    self.browse_to_ref("Introductions to the Babylonian Talmud, Berakhot, Introduction to Berakhot")
    assert not has_cantillation(self.get_nth_section_hebrew(1).text)
    assert not has_cantillation(self.get_nth_section_hebrew(1).text, False)
    open_settings_and_check(False, False)
    self.toggle_language_bilingual()
    self.browse_to_ref("Joshua 2")
    open_settings_and_check(False, True)
    self.browse_to_ref("Genesis 1")
    open_settings_and_check(True, True)
def body(self):
    """UI test across genres: Darashos HaRan shows no cantillation and no
    toggles; Talmud and Prophets show only vocalization; Torah shows both."""

    def open_settings_and_check(expect_aliyot, expect_vocalization):
        # Open the text settings panel and verify toggle visibility,
        # preserving the original call/assert interleaving.
        self.toggle_on_text_settings()
        shown = self.is_aliyot_toggleSet_displayed()
        assert shown if expect_aliyot else not shown
        shown = self.is_vocalization_toggleSet_displayed()
        assert shown if expect_vocalization else not shown

    self.browse_to_ref("Darashos HaRan 1")
    assert not has_cantillation(self.get_nth_section_hebrew(1).text)
    assert not has_cantillation(self.get_nth_section_hebrew(1).text, False)
    open_settings_and_check(False, False)
    self.browse_to_ref("Berakhot 2b")
    open_settings_and_check(False, True)
    self.browse_to_ref("Joshua 2")
    open_settings_and_check(False, True)
    self.browse_to_ref("Genesis 1")
    open_settings_and_check(True, True)
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    """
    Return a WordFormSet matching ``input_word``.

    :param input_word: word to look up; Hebrew input has cantillation stripped.
    :param lookup_key: field queried ('form' by default; 'c_form' when the
        stripped input turns out to be purely consonantal).
    :param kwargs: may contain ``lookup_ref`` to restrict results to forms
        attested within that ref; the restriction is dropped if it yields
        no results.
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref
    lookup_ref = kwargs.get("lookup_ref", None)
    form_field = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        # Consonantal-only input matches the consonantal form field.
        if not has_cantillation(input_word, detect_vowels=True):
            form_field = 'c_form'
    query_obj = {form_field: input_word}
    if lookup_ref:
        normal_ref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(normal_ref)}
    forms = WordFormSet(query_obj)
    if lookup_ref and len(forms) == 0:
        # Nothing found within the requested ref -- search globally instead.
        query_obj.pop("refs")
        forms = WordFormSet(query_obj)
    return forms
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    """
    Build and run a word-form query for ``input_word``, returning a WordFormSet.

    Hebrew input is stripped of cantillation up front; historically this
    stripping happened in the main ``lexicon_lookup`` method only after an
    empty first result, but doing it here saves a query when the caller
    already supplied a consonantal form.

    :param kwargs: may contain ``lookup_ref``, a ref string limiting matches
        to forms attested in that ref (dropped on an empty result).
    """
    from sefaria.model import Ref
    lookup_ref = kwargs.get("lookup_ref", None)
    key = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        # If nothing vowel-like remains, query the consonantal form field.
        if not has_cantillation(input_word, detect_vowels=True):
            key = 'c_form'
    query = {key: input_word}
    if lookup_ref:
        query["refs"] = {'$regex': '^{}'.format(Ref(lookup_ref).normal())}
    matches = WordFormSet(query)
    if lookup_ref and len(matches) == 0:
        # Ref-restricted search came up empty; fall back to an unrestricted one.
        del query["refs"]
        matches = WordFormSet(query)
    return matches
def get_midrashic_text(text):
    """
    Given text, removes commentary text and returns midrashic text.
    Returns a ``(midrashic_text, commentary_text)`` pair; if it determines the
    text is all commentary, the midrashic part is the empty string (and vice
    versa when no commentary spans are found).

    Heuristic: words with cantillation/vowels (or known midrashic filler
    words) are treated as midrash; runs of other Hebrew words are collected
    as candidate "non-midrash" (commentary) spans.

    :param text: raw text to split.
    :return: tuple of (midrashic text, commentary text).
    """
    dash = "—"
    words = text.split()
    if len(words) == 0:
        return "", ""
    midrash_words = 0.0
    filler_words = 0.0
    curr_non_midrash_start = None      # start index of the commentary run in progress
    curr_midrash_start = None          # start index of the midrash run in progress
    in_paren = False                   # inside a parenthesized aside
    potential_non_midrash_span = None  # short midrash run that may get reclassified (dash heuristic)
    non_midrash_spans = []             # finalized (start, end) commentary word spans
    for i, w in enumerate(words):
        if "(" in w:
            in_paren = True
        if in_paren or re.match(r"^[^\u05d0-\u05ea]+$", w):
            # Parenthetical or fully non-Hebrew token: counted as filler.
            # A dash right after a midrash run marks that run as possibly
            # being a quote lead-in rather than real midrash.
            if w.strip() == dash and curr_midrash_start is not None:
                potential_non_midrash_span = (curr_midrash_start, i + 1)
            filler_words += 1
        elif has_cantillation(w, detect_vowels=True) or is_filler_midrash_word(w):
            # Midrash-looking word: close any open commentary span.
            if curr_non_midrash_start is not None:
                non_midrash_spans += [(curr_non_midrash_start, i)]
                curr_non_midrash_start = None
                potential_non_midrash_span = None
            if curr_midrash_start is None:
                curr_midrash_start = i
            midrash_words += 1
        else:
            # Plain Hebrew word: commentary. May also absorb a short
            # preceding "potential" span (<= 8 words) flagged by the dash.
            curr_midrash_start = None
            if curr_non_midrash_start is None:
                if potential_non_midrash_span is not None:
                    potential_non_midrash_span_len = potential_non_midrash_span[
                        1] - potential_non_midrash_span[0]
                    if potential_non_midrash_span_len <= 8:
                        non_midrash_spans += [potential_non_midrash_span]
                    potential_non_midrash_span = None
                curr_non_midrash_start = i
        if ")" in w:
            in_paren = False
        if re.search(r"\.\s*$", w) and not re.search(r"\.\.\.\s*$", w):
            # period means end of potential non midrash span
            curr_midrash_start = None
    # Close a commentary span that runs to the end of the text.
    if curr_non_midrash_start is not None:
        non_midrash_spans += [(curr_non_midrash_start, len(words))]
    actual_len = len(words) - filler_words
    non_midrash_words = reduce(lambda a, b: a + (b[1] - b[0]), non_midrash_spans, 0)
    # Shorter texts get a more lenient commentary-ratio cutoff.
    cutoff = 0.7 if actual_len < 20 else 0.8
    if actual_len <= 0:
        return "", text
    if (non_midrash_words / actual_len) > cutoff:
        # Mostly commentary: treat the whole text as commentary.
        return "", text
    if len(non_midrash_spans) == 0:
        return text, ""
    # Stitch together the two output texts from the span boundaries.
    midrashic_text = ""
    commentary_text = ""
    last_end = 0
    for s, e in non_midrash_spans:
        midrashic_text += " ".join(words[last_end:s])
        commentary_text += " ".join(words[s:e])
        last_end = e
    midrashic_text += " ".join(words[last_end:])
    return midrashic_text, commentary_text
def create_node_set():
    """
    Build the full topic-graph NodeSet by merging many CSV/JSON sources:
    upper-level ontology nodes, Aspaklaria topics, Tanakh/Talmud people
    (matched and unmatched against Wikidata / person records), Rambam halachic
    hierarchy, Sefer HaAgada topics, source-sheet tags, and halachic edges.
    Mutates module-level ``edge_types_dict`` / ``inverse_edge_set`` as a side
    effect, then adds inverse edges, validates, and returns the NodeSet.
    """
    aspaklaria_nodes = read_csv("aspaklaria_nodes.csv")  # DONE
    final_topic_names = read_csv("final_topic_names.csv")  # DONE
    new_topics_edges = read_csv("new_topics_edges.csv")  # DONE
    upper_level_nodes = read_csv("upper_level_nodes.csv")  # DONE
    tanakh_matched = read_csv("tanakh_matched.csv")  # DONE
    tanakh_unmatched = read_csv("tanakh_unmatched.csv")  # DONE
    tanakh_edges = read_csv("tanakh_edges.csv")  # DONE
    edge_types = read_csv("edge_types.csv")  # DONE
    sefer_haagada = read_csv("sefer_haagada.csv")  # DONE
    talmud_matched = read_csv("talmud_matched.csv")  # DONE
    talmud_unmatched = read_csv("talmud_unmatched.csv")  # DONE
    talmud_edges = read_csv("talmud_edges.csv")  # DONE
    source_sheets = read_csv("source_sheets.csv")  # DONE
    source_sheets_dedup = read_csv("source_sheets_dedup.csv")  # DONE
    halachic_edges = read_csv("halachic_edges.csv")  # DONE
    node_set = NodeSet()
    Node.node_set = node_set
    print("START UPPER LEVEL")
    # UPPER LEVEL
    for row in upper_level_nodes:
        nid = row["Node"].lower()
        try:
            # already exists. just add isa edge
            n = node_set[nid]
        except KeyError:
            n = Node(nid, row["Node"], bfo_id=row["BFO ID"])
        if len(row["isa"]) > 0:
            n.add_edge("is a", row["isa"].lower())
        node_set[nid] = n
    print("START EDGE TYPES")
    # EDGE TYPES -- populate the module-level forward/inverse edge-name maps.
    for row in edge_types:
        if len(row["Edge Inverse"]) > 0:
            edge_types_dict[row["Edge"]] = row["Edge Inverse"]
            edge_types_dict[row["Edge Inverse"]] = row["Edge"]
            if row["Edge"] != row["Edge Inverse"]:
                inverse_edge_set.add(row["Edge Inverse"])
    print("START ASPAKLARIA")
    # ASPAKLARIA
    overwritten_cats = set()
    for row in aspaklaria_nodes:
        n = Node(row["Topic"], according_to=(row["According to"] if len(row["According to"]) else None))
        is_cat = row['Is Category']
        if len(is_cat) > 0:
            if is_cat in overwritten_cats:
                print("Already overwrote {} to {}. now overwriting to {}".format(
                    is_cat, node_set[is_cat].id, n.id))
            node_set[is_cat] = n  # reroute to n
            overwritten_cats.add(is_cat)
        else:
            # Map the is-a target through the upper-level rename table if present.
            isa = row["Is A Type Of"] if row[
                "Is A Type Of"] not in upper_level_mapping else upper_level_mapping[
                row["Is A Type Of"]]
            n.add_edge("is a", isa)
            if len(row["Is a Type Of (2)"]) > 0:
                isa2 = row["Is a Type Of (2)"] if row[
                    "Is a Type Of (2)"] not in upper_level_mapping else upper_level_mapping[
                    row["Is a Type Of (2)"]]
                n.add_edge("is a", isa2)
        node_set[row["Topic"]] = n
    print("START EDGES")
    # EDGES
    for row in new_topics_edges:
        if len(row["Topic"]) == 0 or len(row["Has Edge"]) == 0 or len(
                row["To Topic (Actual)"]) == 0:
            continue
        try:
            n = node_set[row["Topic"]]
        except KeyError:
            print("KeyError: {}".format(row["Topic"]))
            continue
        temp_edge = row["Has Edge"]
        if temp_edge == "alternate spelling of":
            # merge both nodes. usually `to topic` is the main topic
            try:
                m = node_set[row["To Topic (Actual)"]]
                m.alt_spell_id = n.id
                node_set[row["Topic"]] = m  # reroute to m from now on
            except KeyError:
                print("Alt Spelling Key Error", row["To Topic (Actual)"])
        else:
            n.add_edge(row["Has Edge"], row["To Topic (Actual)"])
    print("START TANAKH UNMATCHED")
    # TANAKH UNMATCHED -- biblical people keyed by trailing Wikidata Q-id in the URL.
    for row in tanakh_unmatched:
        wid = re.findall(r"Q\d+$", row["URL"])[0]
        n = Node(wid, row["English Name"], row["Hebrew Name"], wikidata_id=wid)
        n.add_edge("is a", "biblical person")
        node_set[wid] = n
    print("START TALMUD UNMATCHED")
    # TALMUD UNMATCHED
    for row in talmud_unmatched:
        jeLink = row["jeLink"] if len(row["jeLink"]) > 0 else None
        heWikiLink = row["heWikiLink"] if len(row["heWikiLink"]) > 0 else None
        enWikiLink = row["enWikiLink"] if len(row["enWikiLink"]) > 0 else None
        n = Node(row["English Name"], row["English Name"], row["Hebrew Name"],
                 generation=row["generation"], jeLink=jeLink,
                 heWikiLink=heWikiLink, enWikiLink=enWikiLink)
        n.add_edge("is a", "mishnaic person" if row["Time Period"] == 'mishnah' else "talmudic person")
        node_set[row["English Name"]] = n
        node_set.items_by_talmud_name[row["English Name"]] = n
    print("START RAMBAM")
    # RAMBAM -- halachic topic hierarchy; roots hang off the "halacha" node.
    with codecs.open(u"{}/../rambam/rambam_topic_hierarchy.json".format(ROOT), "rb", encoding="utf8") as fin:
        rambam = json.load(fin)
    for row in rambam:
        rid = "RAMBAM|{}".format(row["en"])
        n = Node(rid, row["en"])
        for p in row["parents"]:
            n.add_edge("is a", "RAMBAM|{}".format(p))
        if len(row["parents"]) == 0:
            n.add_edge("is a", "halacha")
        node_set[rid] = n
    print("START SEFER HAAGADA MATCHED")
    # SEFER HAAGADA MATCHED
    for row in sefer_haagada:
        if len(row["Aspaklaria Topic"].strip()) > 0:
            n = node_set[row["Aspaklaria Topic"]]
            if len(row['synonym']) > 0:
                n.alt_he.add(row["Topic Name"])
            n.sefer_haagada_name = row["Topic Name"]
            node_set.items_by_sefer_haagada_name[row["Topic Name"]] = n
    print("START TOPIC NAMES")
    # TOPIC NAMES -- rows past RAMBAM_ROW_INDEX are Rambam / Sefer HaAgada rows;
    # earlier rows refer to existing Aspaklaria topics.
    for irow, row in enumerate(final_topic_names):
        if irow >= RAMBAM_ROW_INDEX and len(
                row["English description"].strip()) > 0:
            n = node_set["RAMBAM|{}".format(row["English description"].strip())]
        elif irow >= RAMBAM_ROW_INDEX and has_cantillation(row["Topic"], detect_vowels=True):
            # Sefer Haagada
            n = Node(row["Topic"])
            n.sefer_haagada_name = row["Topic"]
            n.add_edge("is a", row["Is A Type Of"])
            if len(row["Is a Type Of (2)"].strip()) > 0:
                n.add_edge("is a", row["Is a Type Of (2)"])
            node_set.items_by_sefer_haagada_name[row["Topic"]] = n
            node_set[row["Topic"]] = n
        else:
            try:
                n = node_set[row["Topic"]]
            except KeyError:
                continue
        description = u""
        final_english = row["Final English Translation"]
        if len(row["Is Paren Good Description"]) > 0:
            # Trailing "(...)" holds the description; strip it off the name.
            match = re.search(r"^(.*)\(([^)]+)\)\s*$", final_english)
            final_english = match.group(1).strip()
            description = match.group(2)
        if len(row["According to:"]) > 0:
            if len(description) > 0:
                description += u". "
            description += u"Translated according to {}".format(
                row["According to:"])
        n.en_name = final_english
        n.description = description
        n.en_transliteration = row["Final English Transliteration"] if len(
            row["Final English Transliteration"]) else None
        temp_he = row["Final Topic Name"].strip()
        if len(n.he_name) == 0:
            n.he_name = temp_he
        elif temp_he != n.he_name and len(temp_he) > 0:
            # alt title
            print("Adding alt he {} to {}".format(temp_he, n.he_name))
            n.alt_he.add(temp_he)
    print("START TANAKH MATCHED")
    # TANAKH MATCHED -- attach Wikidata names/ids to existing nodes.
    for row in tanakh_matched:
        if len(row["Match Name"]) > 0:
            n = node_set[row["Name"]]
            alt_he = row["Match Name"]
            alt_en = row["Match En Name"]
            if len(n.en_name) == 0:
                n.en_name = alt_en
            else:
                n.alt_en.add(alt_en)
            if len(n.he_name) == 0:
                n.he_name = alt_he
            else:
                n.alt_he.add(alt_he)
            n.wikidata_id = row["Match ID"]
            node_set.items_by_wid[n.wikidata_id] = n
    print("START TALMUD MATCHED")
    # TALMUD MATCHED
    for row in talmud_matched:
        if len(row["Match Name En"]) > 0:
            n = node_set[row["Name"]]
            alt_he = row["Match Name 1"]
            alt_en = row["Match Name En"]
            if len(n.en_name) == 0 and alt_en != n.en_transliteration:
                n.en_name = alt_en
            elif alt_en != n.en_name and alt_en != n.en_transliteration:
                n.alt_en.add(alt_en)
            if len(n.he_name) == 0:
                n.he_name = alt_he
            elif alt_he != n.he_name:
                n.alt_he.add(alt_he)
            try:
                # NOTE(review): `yo` is unused -- this lookup only detects a
                # pre-existing node under the matched name (a collision).
                yo = node_set[row["Match Name En"]]
                print("{} EXISTS!!".format(row["Match Name En"]))
            except KeyError:
                pass
            node_set.items_by_talmud_name[row["Match Name En"]] = n
    print("START TANAKH EDGES")
    # TANAKH EDGES
    male_female_dict = {"female": u"נקבה", "male": u"זכר"}
    # manually add king of israel / judah which are relevant to tanakh edges
    n = Node(u"מלך יהודה", "King of Judah", u"מלך יהודה")
    n.add_edge("is a", u"מלך מלכות")
    node_set[u"מלך יהודה"] = n
    n = Node(u"מלך ישראל", "King of Israel", u"מלך ישראל")
    n.add_edge("is a", u"מלך מלכות")
    node_set[u"מלך ישראל"] = n
    # First two passes create any missing subject / object nodes so the third
    # pass can safely add edges between them.
    for row in tanakh_edges:
        try:
            n = node_set.get_by_wid(row["ID"])
        except KeyError:
            # for some reason doesn't exist yet. create it
            n = Node(row["ID"], row["Name"], row["He Name"], wikidata_id=row["ID"])
            node_set[row["ID"]] = n
    for row in tanakh_edges:
        try:
            n = node_set.get_by_wid(row["Value ID"])
        except KeyError:
            # for some reason doesn't exist yet. create it
            n = Node(row["Value ID"], row["Value"], wikidata_id=row["Value ID"])
            node_set[row["Value ID"]] = n
            print("Created Value {}".format(row["Value ID"]))
    for row in tanakh_edges:
        n = node_set.get_by_wid(row["ID"])
        value = row["Value"]
        if row["Edge"] == "alternate spelling of":
            # just add the alt title
            if len(n.he_name) == 0:
                n.he_name = value
            else:
                n.alt_he.add(value)
        elif row["Edge"] == "has transliteration":
            if len(n.en_name) == 0:
                n.en_name = value
            else:
                n.alt_en.add(value)
        else:
            if value in male_female_dict:
                to_node_id = male_female_dict[value]
            else:
                if len(row["Value ID"]) == 0:
                    to_node_id = value
                else:
                    try:
                        to_node_id = node_set.get_by_wid(row["Value ID"]).id
                    except KeyError:
                        print(row["Value ID"])
                        continue
            n.add_edge(row["Edge"], to_node_id)
    print("START TALMUD EDGES")
    # TALMUD EDGES -- first pass creates missing person nodes (from Person
    # records), second pass adds the edges.
    for row in talmud_edges:
        try:
            n = node_set.get_by_talmud_name(row["Name"])
        except KeyError:
            person = Person().load({"key": row["Name"]})
            n = Node(row["Name"], row["Name"], person.primary_name('he'),
                     jeLink=getattr(person, 'jeLink', None),
                     heWikiLink=getattr(person, 'heWikiLink', None),
                     enWikiLink=getattr(person, 'enWikiLink', None))
            node_set[row["Name"]] = n
            node_set.items_by_talmud_name[row["Name"]] = n
        try:
            n = node_set.get_by_talmud_name(row["Value"])
        except KeyError:
            person = Person().load({"key": row["Value"]})
            n = Node(row["Value"], row["Value"], person.primary_name('he'),
                     jeLink=getattr(person, 'jeLink', None),
                     heWikiLink=getattr(person, 'heWikiLink', None),
                     enWikiLink=getattr(person, 'enWikiLink', None))
            node_set[row["Value"]] = n
            node_set.items_by_talmud_name[row["Value"]] = n
    for row in talmud_edges:
        try:
            n = node_set.get_by_talmud_name(row["Name"])
        except KeyError:
            print(row["Name"])
            print("NAME")
            continue
        try:
            to_node = node_set.get_by_talmud_name(row["Value"])
        except KeyError:
            print(row["Value"])
            print("VALUE")
            continue
        n.add_edge(row["Edge"], to_node.id)
    print("START SOURCE SHEET DEDUP")
    # Collapse chains of "same as" links so every tag maps to its earliest
    # canonical tag, e.g.:
    """
    B -> A
    C -> B
    ===>
    C -> A
    A -> [B]
    B -> [C]
    """
    source_sheet_dedup_map = {}
    source_sheets_dedup_index_map = {}
    source_sheets_dedup_list = []
    for irow, row in enumerate(source_sheets_dedup):
        source_sheets_dedup_list += [row]
        source_sheets_dedup_index_map[row['tag']] = irow
    for row in reversed(source_sheets_dedup_list):
        if len(row['same as']) == 0:
            continue
        if row['same as'] in source_sheet_dedup_map:
            alt_same_as = source_sheet_dedup_map[row['same as']]
            if source_sheets_dedup_index_map[
                    alt_same_as] < source_sheets_dedup_index_map[
                    row['same as']]:
                same_as = alt_same_as
            else:
                same_as = row['same as']
                # rewrite
                # NOTE(review): deleting from source_sheet_dedup_map while
                # iterating .items() raises RuntimeError on Python 3 -- confirm
                # this script targets Python 2 or iterate over a list copy.
                for k, v in source_sheet_dedup_map.items():
                    if v == alt_same_as:
                        if k == alt_same_as:
                            del source_sheet_dedup_map[k]
                        else:
                            source_sheet_dedup_map[k] = row['same as']
                source_sheet_dedup_map[alt_same_as] = row['same as']
        else:
            same_as = row['same as']
        if same_as == row['tag']:
            continue
        source_sheet_dedup_map[row['tag']] = same_as
    print("START SOURCE SHEETS")
    # SOURCE SHEETS
    for row in source_sheets:
        # if aspak -> if not synon -> else -> match it
        # else, if not isCat, -> is a "is a type of" else
        he = row["hebrew tag"]
        en = row["tag"]
        if len(row["aspaklaria topic"]) > 0 and len(row["not synonym"]) == 0:
            n = node_set[row["aspaklaria topic"]]
            if len(n.en_name) == 0 and en != n.en_transliteration:
                n.en_name = en
            elif len(en) > 0 and en != n.en_name:
                n.alt_en.add(en)
            if len(n.he_name) == 0:
                n.he_name = he
            elif len(he) > 0 and he != n.he_name:
                n.alt_he.add(he)
            n.source_sheet_tags.add(en)
        elif len(row["is a type of"]) > 0 or len(row["is category"]) > 0:
            isa = row["is a type of"] if row[
                "is a type of"] not in upper_level_mapping else upper_level_mapping[
                row["is a type of"]]
            n = node_set[isa]
            if row["is category"]:
                # Tag names an existing category node directly.
                if len(n.en_name) == 0 and en != n.en_transliteration:
                    n.en_name = en
                elif len(en) > 0 and en != n.en_name:
                    n.alt_en.add(en)
                if len(n.he_name) == 0:
                    n.he_name = he
                elif len(he) > 0 and he != n.he_name:
                    n.alt_he.add(he)
                n.source_sheet_tags.add(en)
            else:
                if en in source_sheet_dedup_map:
                    # Duplicate tag: fold names into the canonical sheet node.
                    old_sheet_node = node_set[u"SHEET|{}".format(
                        source_sheet_dedup_map[en])]
                    if len(old_sheet_node.en_name
                           ) == 0 and en != old_sheet_node.en_transliteration:
                        old_sheet_node.en_name = en
                    elif len(en) > 0 and en != old_sheet_node.en_name:
                        old_sheet_node.alt_en.add(en)
                    if len(old_sheet_node.he_name) == 0:
                        old_sheet_node.he_name = he
                    elif len(he) > 0 and he != old_sheet_node.he_name:
                        old_sheet_node.alt_he.add(he)
                    old_sheet_node.source_sheet_tags.add(en)
                else:
                    # completely new topic
                    _id = u"SHEET|{}".format(en)
                    m = Node(_id, en, he)
                    m.source_sheet_tags.add(en)
                    m.add_edge("is a", n.id)
                    node_set[_id] = m
    print("START HALACHIC EDGES")
    # HALACHIC EDGES
    for row in halachic_edges:
        if len(row["rambam topic"]) > 0:
            n = node_set[row["topic"]]
            edge_type = "applies halacha" if "halachic process" in n.get_types(
            ) else "related to"
            n.add_edge(edge_type, "RAMBAM|{}".format(row["rambam topic"].strip()))
    # CLEAN UP
    node_set.add_edge_inverses()
    node_set.validate()
    return node_set