def bulktext_api(request, refs):
    """
    Bulk text lookup used by the linker.

    :param request: request object; only the GET method is handled. The query string may carry
        "callback" (handed on to jsonResponse) and "useTextFamily" (any truthy value switches the
        lookup from TextChunk to TextFamily).
    :param refs: string of textual refs separated by "|"; duplicates are collapsed
    :return: jsonResponse of a dict keyed by each requested ref string. A ref that fails with one
        of the caught errors maps to {"error": 1}. Nothing is returned for non-GET requests.
    """
    if request.method != "GET":
        return None

    callback = request.GET.get("callback", None)
    use_text_family = request.GET.get("useTextFamily", None)
    results = {}
    for tref in set(refs.split("|")):
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if use_text_family:
                family = model.TextFamily(oref, commentary=0, context=0, pad=False)
                he = family.he
                en = family.text
                entry = {
                    'he': he,
                    'en': en,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': family.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
            else:
                he = model.TextChunk(oref, "he").text
                en = model.TextChunk(oref, "en").text
                # Anything that is not already a string is flattened to one;
                # these could be flattened on the client, if need be.
                if not isinstance(he, basestring):
                    he = JaggedTextArray(he).flatten_to_string()
                if not isinstance(en, basestring):
                    en = JaggedTextArray(en).flatten_to_string()
                entry = {
                    'he': he,
                    'en': en,
                    'lang': lang,
                    'ref': oref.normal(),
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
            results[tref] = entry
        except (InputError, ValueError, AttributeError, KeyError):
            # Failures are intentionally not logged here: this chatter fills up the logs.
            # todo: put it in its own log file
            results[tref] = {"error": 1}

    return jsonResponse(results, callback)
def bulktext_api(request, refs):
    """
    Bulk text lookup used by the linker.

    :param request: request object; only the GET method is handled. The query string may carry
        "callback", which is handed on to jsonResponse.
    :param refs: string of textual refs separated by "|"; duplicates are collapsed
    :return: jsonResponse of a dict keyed by each requested ref string, with the
        Access-Control-Allow-Origin header set to "*". A ref that fails with one of the caught
        errors is logged and maps to {"error": 1}. Nothing is returned for non-GET requests.
    """
    if request.method != "GET":
        return None

    callback = request.GET.get("callback", None)
    results = {}
    for tref in set(refs.split("|")):
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            he = model.TextChunk(oref, "he").text
            en = model.TextChunk(oref, "en").text
            # Anything that is not already a string is flattened to one;
            # these could be flattened on the client, if need be.
            if not isinstance(he, basestring):
                he = JaggedTextArray(he).flatten_to_string()
            if not isinstance(en, basestring):
                en = JaggedTextArray(en).flatten_to_string()
            results[tref] = {
                'he': he,
                'en': en,
                'lang': lang,
                'ref': oref.normal(),
                'heRef': oref.he_normal(),
                'url': oref.url()
            }
        except (InputError, ValueError, AttributeError) as e:
            referer = request.META.get("HTTP_REFERER", "unknown page")
            logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            results[tref] = {"error": 1}

    response = jsonResponse(results, callback)
    response['Access-Control-Allow-Origin'] = '*'
    return response
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the client-facing dict describing one link, seen from its anchor side.

    :param link: Link object; link.refs holds the two textual refs
    :param with_text: when truthy, attach the linked text under "text" and "he"
    :param ref: not read by this function
    :param pos: position (0 or 1) of the anchor ref within link.refs. It defaults to None but is
        used directly as a list index, so callers need to pass it.
    :return: Dict
    """
    # The text we're asked to get links to
    anchor_oref = Ref(link.refs[pos])
    # The link we found to the anchor
    link_oref = Ref(link.refs[(pos + 1) % 2])

    com = {}
    com["_id"] = str(link._id)
    com['index_title'] = link_oref.index.title
    com["category"] = link_oref.primary_category  # usually the index's categories[0] or "Commentary".
    com["type"] = link.type
    com["ref"] = link_oref.tref
    com["anchorRef"] = anchor_oref.normal()
    com["sourceRef"] = link_oref.normal()
    com["sourceHeRef"] = link_oref.he_normal()
    com["anchorVerse"] = anchor_oref.sections[-1] if len(anchor_oref.sections) else 0
    com["anchorText"] = getattr(link, "anchorText", "")

    # Pad out the sections list to the node's depth, so that comparison between
    # comment numbers are apples-to-apples
    missing = link_oref.index_node.depth - len(link_oref.sections)
    padded = link_oref.sections[:] + [0] * missing

    # Build a decimal comment number based on the last two entries of the section array
    if len(padded) == 1:
        com["commentaryNum"] = padded[-1]
    elif len(padded) > 1:
        com["commentaryNum"] = float('{0}.{1:04d}'.format(*padded[-2:]))
    else:
        com["commentaryNum"] = 0

    if with_text:
        family = TextFamily(link_oref, context=0, commentary=False)
        if isinstance(family.text, basestring):
            com["text"] = family.text
        else:
            com["text"] = JaggedTextArray(family.text).flatten_to_array()
        if isinstance(family.he, basestring):
            com["he"] = family.he
        else:
            com["he"] = JaggedTextArray(family.he).flatten_to_array()

    # For commentary links the group title comes from the index's collective_title when it has
    # one (e.g. "Rashi on Genesis 4:2" -> "Rashi"); otherwise the index titles are used as is.
    index = link_oref.index
    if com["type"] == "commentary":
        en_title = getattr(index, 'collective_title', index.title)
        he_title = hebrew_term(getattr(index, 'collective_title', index.get_title("he")))
    else:
        en_title = index.title
        he_title = index.get_title("he")
    com["linkGroupTitle"] = {'en': en_title, 'he': he_title}
    com["commentator"] = en_title  # TODO: deprecate
    com["heCommentator"] = he_title  # TODO: deprecate

    # Fix for quoting commentary appearing together with commentary in s2 panels
    if com["type"] != "commentary" and com["category"] == "Commentary":
        com["category"] = "Quoting Commentary"

    he_node_title = link_oref.index_node.primary_title("he")
    if he_node_title:
        com["heTitle"] = link_oref.index_node.primary_title("he")

    return com
def __init__(self, **kwargs):
    """
    Store the commentary's identifying data and start with an empty JaggedTextArray.

    Required keyword arguments (a missing one raises KeyError):
    en_title, he_title, book, he_book, bid, HilchotId.
    """
    # (attribute name, keyword it is read from), in assignment order
    attribute_sources = (
        ('commentator', 'en_title'),
        ('he_commentator', 'he_title'),
        ('book', 'book'),
        ('he_book', 'he_book'),
        ('book_id', 'bid'),
        ('hilchot_id', 'HilchotId'),
    )
    for attribute, keyword in attribute_sources:
        setattr(self, attribute, kwargs[keyword])
    self.ja = JaggedTextArray()
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the client-facing dict describing one link, seen from its anchor side.

    :param link: Link object; link.refs holds the two textual refs
    :param with_text: when truthy, attach the linked text (flattened to arrays) under "text" and "he"
    :param ref: not read by this function
    :param pos: position (0 or 1) of the anchor ref within link.refs. It defaults to None but is
        used directly as a list index, so callers need to pass it.
    :return: Dict
    """
    com = {}

    # The text we're asked to get links to
    anchorRef = Ref(link.refs[pos])

    # The link we found to anchorRef
    linkRef = Ref(link.refs[(pos + 1) % 2])

    com["_id"] = str(link._id)
    com['index_title'] = linkRef.index.title
    com["category"] = linkRef.type
    com["type"] = link.type
    com["ref"] = linkRef.tref
    com["anchorRef"] = anchorRef.normal()
    com["sourceRef"] = linkRef.normal()
    com["sourceHeRef"] = linkRef.he_normal()
    com["anchorVerse"] = anchorRef.sections[-1] if anchorRef.sections else 0
    # Guarded like anchorVerse above: a commentary ref without sections would otherwise
    # raise IndexError on sections[-1].
    if linkRef.type == "Commentary" and linkRef.sections:
        com["commentaryNum"] = linkRef.sections[-1]
    else:
        com["commentaryNum"] = 0
    com["anchorText"] = getattr(link, "anchorText", "")

    if com["category"] in REORDER_RULES:
        com["category"] = REORDER_RULES[com["category"]][0]

    if with_text:
        text = TextFamily(linkRef, context=0, commentary=False)
        com["text"] = JaggedTextArray(text.text).flatten_to_array()
        com["he"] = JaggedTextArray(text.he).flatten_to_array()

    # if the link is commentary, strip redundant info (e.g. "Rashi on Genesis 4:2" -> "Rashi")
    if com["type"] == "commentary":
        com["commentator"] = linkRef.book.split(" on ")[0]
        com["heCommentator"] = linkRef.he_book().split(u" על ")[0]
    else:
        # A "Commentary" text reached through a non-commentary link is only quoting the anchor
        if com["category"] == "Commentary":
            com["category"] = "Quoting Commentary"
        com["commentator"] = linkRef.index.title
        # Fall back to the English title when the index has no Hebrew title
        com["heCommentator"] = linkRef.index.get_title("he") or com["commentator"]

    if link.type == "targum":
        com["category"] = "Targum"

    if linkRef.index_node.primary_title("he"):
        com["heTitle"] = linkRef.index_node.primary_title("he")

    return com
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the client-facing dict describing one link, seen from its anchor side.

    :param link: Link object; link.refs holds the two textual refs
    :param with_text: when truthy, attach the linked text (flattened to arrays) under "text" and "he"
    :param ref: value tested with `in` against the normalized link ref, to decide whether the
        short commentator name can be used
    :param pos: position (0 or 1) of the anchor ref within link.refs. It defaults to None but is
        used directly as a list index, so callers need to pass it.
    :return: Dict
    """
    com = {}

    # The text we're asked to get links to
    anchorRef = Ref(link.refs[pos])

    # The link we found to anchorRef
    linkRef = Ref(link.refs[(pos + 1) % 2])

    com["_id"] = str(link._id)
    com["category"] = linkRef.type
    com["type"] = link.type
    com["ref"] = linkRef.tref
    com["anchorRef"] = anchorRef.normal()
    com["sourceRef"] = linkRef.normal()
    com["sourceHeRef"] = linkRef.he_normal()
    # Refs without sections have no last section to report; use 0 instead of raising IndexError.
    com["anchorVerse"] = anchorRef.sections[-1] if anchorRef.sections else 0
    if linkRef.type == "Commentary" and linkRef.sections:
        com["commentaryNum"] = linkRef.sections[-1]
    else:
        com["commentaryNum"] = 0
    com["anchorText"] = getattr(link, "anchorText", "")

    if with_text:
        text = TextFamily(linkRef, context=0, commentary=False)
        com["text"] = JaggedTextArray(text.text).flatten_to_array()
        com["he"] = JaggedTextArray(text.he).flatten_to_array()

    # strip redundant verse ref for commentators
    # if the ref we're looking for appears exactly in the commentary ref, strip redundant info
    # todo: this comparison - ref in linkRef.normal() - seems brittle. Make it rigorous.
    if com["category"] == "Commentary" and ref in linkRef.normal():
        com["commentator"] = linkRef.index.commentator
        # Fall back to the English name when the index has no (or an empty) heCommentator
        if getattr(linkRef.index, "heCommentator", None):
            com["heCommentator"] = linkRef.index.heCommentator
        else:
            com["heCommentator"] = com["commentator"]
    else:
        com["commentator"] = linkRef.book
        # Fall back to the English name when the node has no Hebrew primary title
        com["heCommentator"] = linkRef.index_node.primary_title("he") or com["commentator"]

    if linkRef.index_node.primary_title("he"):
        com["heTitle"] = linkRef.index_node.primary_title("he")

    return com
def content_node_merger(snode, *contents, **kwargs):
    """
    Merge two pieces of content belonging to one node.

    :param snode: SchemaContentNode; only its full_title() is used, in the overlap error message
    :param contents: exactly two content values; the second is merged into the first
    :param kwargs: "sources": array of source names, passed through to merge_texts
    :return: the merged text (the sources returned by merge_texts are discarded)
    :raises Exception: when `warn` is truthy and the two contents overlap
    """
    assert len(contents) == 2
    first = contents[0]
    second = contents[1]

    # NOTE(review): `warn` is not defined in this function - presumably it comes from an
    # enclosing or module scope; confirm.
    if warn and JaggedTextArray(first).overlaps(JaggedTextArray(second)):
        message = "WARNING - overlapping content in {}".format(snode.full_title())
        raise Exception(message)

    merged_text, _merged_sources = merge_texts([first, second], kwargs.get("sources"))
    return merged_text
def bundle_many_texts(refs, useTextFamily=False, as_sized_string=False, min_char=None, max_char=None):
    """
    Look up Hebrew and English text for many refs at once.

    :param refs: iterable of textual refs
    :param useTextFamily: when truthy, read the text through TextFamily and include
        "primary_category" in each entry; the remaining options are then ignored
    :param as_sized_string: when truthy (and useTextFamily is falsy), take each language's text
        from TextChunk.as_sized_string instead of flattening the chunk's text
    :param min_char: passed to as_sized_string as min_char when truthy
    :param max_char: passed to as_sized_string as max_char when truthy
    :return: dict keyed by each ref string as given. Values hold "he", "en", "lang", "ref",
        "heRef" and "url" (plus "primary_category" with useTextFamily), or {"error": 1} for a ref
        that failed with InputError, ValueError, AttributeError or KeyError.
    """
    # The size limits are the same for every ref, so build them once rather than per iteration.
    size_kwargs = {}
    if min_char:
        size_kwargs['min_char'] = min_char
    if max_char:
        size_kwargs['max_char'] = max_char

    res = {}
    for tref in refs:
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if useTextFamily:
                text_fam = model.TextFamily(oref, commentary=0, context=0, pad=False)
                he = text_fam.he
                en = text_fam.text
                res[tref] = {
                    'he': he,
                    'en': en,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
            else:
                he_tc = model.TextChunk(oref, "he")
                en_tc = model.TextChunk(oref, "en")
                if as_sized_string:
                    he_text = he_tc.as_sized_string(**size_kwargs)
                    en_text = en_tc.as_sized_string(**size_kwargs)
                else:
                    he = he_tc.text
                    en = en_tc.text
                    # these could be flattened on the client, if need be.
                    he_text = he if isinstance(he, str) else JaggedTextArray(he).flatten_to_string()
                    en_text = en if isinstance(en, str) else JaggedTextArray(en).flatten_to_string()

                res[tref] = {
                    'he': he_text,
                    'en': en_text,
                    'lang': lang,
                    'ref': oref.normal(),
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
        except (InputError, ValueError, AttributeError, KeyError):
            # Failures are intentionally not logged here: this chatter fills up the logs.
            # todo: put it in its own log file
            res[tref] = {"error": 1}
    return res
def do_copy(self):
    """
    Post the loaded index, versions and links to the destination server.

    Steps, in order:
      * load_objects()
      * when _post_index is set, post the index contents (for a CommentaryIndex, those of its c_index)
      * for every version, post the text of each leaf node that has content; when no node had
        content, post dummy text to the last node and then clear it, so that a version object is
        still created on the server; then post the version's optional attributes, if any are set
      * when _post_links is set, post every link in _linkset that has no source_text_oid
    """
    self.load_objects()
    if self._post_index:
        if isinstance(self._index_obj, CommentaryIndex):
            idx_contents = self._index_obj.c_index.contents(raw=True)
            idx_title = self._index_obj.c_index.title
        elif isinstance(self._index_obj, Index):
            idx_contents = self._index_obj.contents(raw=True)
            idx_title = self._index_obj.title
        self._make_post_request_to_server(self._prepare_index_api_call(idx_title), idx_contents)

    content_nodes = self._index_obj.nodes.get_leaf_nodes()
    for ver in self._version_objs:
        found_non_empty_content = False
        print(ver.versionTitle.encode('utf-8'))

        # Only the optional attributes this version actually carries are copied
        flags = {}
        for flag in ver.optional_attrs:
            if hasattr(ver, flag):
                flags[flag] = getattr(ver, flag, None)

        # Reset per version so the fallback below can tell whether the node loop ran at all
        node = None
        version_payload = None
        for node in content_nodes:
            print(node.full_title(force_update=True))
            text = JaggedTextArray(ver.content_node(node)).array()
            version_payload = {
                "versionTitle": ver.versionTitle,
                "versionSource": ver.versionSource,
                "language": ver.language,
                "text": text
            }
            if len(text) > 0:
                # only bother posting nodes that have content.
                found_non_empty_content = True
                self._make_post_request_to_server(
                    self._prepare_text_api_call(node.full_title(force_update=True)),
                    version_payload)

        # With no leaf nodes there is no "last node" to post to; previously this path failed
        # on the unbound loop variables.
        if not found_non_empty_content and node is not None:
            # post the last node again with dummy text, to make sure an actual version db object is created
            # then post again to clear the dummy text
            dummy_text = "This is a dummy text"
            for _ in range(node.depth):
                dummy_text = [dummy_text]
            version_payload['text'] = dummy_text
            self._make_post_request_to_server(self._prepare_text_api_call(node.full_title()), version_payload)
            version_payload['text'] = []
            self._make_post_request_to_server(self._prepare_text_api_call(node.full_title()), version_payload)

        if flags:
            self._make_post_request_to_server(
                self._prepare_version_attrs_api_call(ver.title, ver.language, ver.versionTitle),
                flags)

    if self._post_links:
        links = [l.contents() for l in self._linkset if not getattr(l, 'source_text_oid', None)]
        self._make_post_request_to_server(self._prepare_links_api_call(), links)
def do_copy(self):
    """
    Post the loaded index, versions and links to the destination server.

    Steps, in order:
      * load_objects()
      * when _post_index is set, post the index contents (for a CommentaryIndex, those of its c_index)
      * for every version, post the text of every leaf node, then the version's optional
        attributes, if any are set
      * when _post_links is set, post the contents of every link in _linkset

    The interactive pdb breakpoints that used to sit in this method were debugging leftovers that
    halted every run; they have been removed.
    """
    self.load_objects()
    if self._post_index:
        if isinstance(self._index_obj, CommentaryIndex):
            idx_contents = self._index_obj.c_index.contents(raw=True)
            idx_title = self._index_obj.c_index.title
        elif isinstance(self._index_obj, Index):
            idx_contents = self._index_obj.contents(raw=True)
            idx_title = self._index_obj.title
        self._make_post_request_to_server(self._prepare_index_api_call(idx_title), idx_contents)

    content_nodes = self._index_obj.nodes.get_leaf_nodes()
    for ver in self._version_objs:
        print(ver.versionTitle.encode('utf-8'))

        # Only the optional attributes this version actually carries are copied
        flags = {}
        for flag in ver.optional_attrs:
            if hasattr(ver, flag):
                flags[flag] = getattr(ver, flag, None)

        for node in content_nodes:
            text = JaggedTextArray(ver.content_node(node)).array()
            version_payload = {
                "versionTitle": ver.versionTitle,
                "versionSource": ver.versionSource,
                "language": ver.language,
                "text": text
            }
            self._make_post_request_to_server(
                self._prepare_text_api_call(node.full_title(force_update=True)),
                version_payload)

        if flags:
            self._make_post_request_to_server(
                self._prepare_version_attrs_api_call(ver.title, ver.language, ver.versionTitle),
                flags)

    if self._post_links:
        links = [l.contents() for l in self._linkset]
        self._make_post_request_to_server(self._prepare_links_api_call(), links)
def _node_count(self, snode, lang="en"):
    """
    Add up, over every version in the given language, the mask of that version's content
    for one node.

    :param snode: node whose content is read from each version
    :param lang: language passed to self.versions()
    :return: the accumulated counts; starts as an empty JaggedIntArray and has each version's
        JaggedTextArray mask added to it
    """
    total = JaggedIntArray()
    for version in self.versions(lang):
        version_mask = JaggedTextArray(version.content_node(snode)).mask()
        total = total + version_mask
    return total
def split_text_section(oref, lang, old_version_title, new_version_title):
    """
    Move the content covered by `oref` out of version `old_version_title` and into
    `new_version_title`, and repoint the matching history records at the new version.

    NOTE: `oref` cannot be ranging (until we implement saving ranging refs on TextChunk).
    A spanning ref is split, and each part is handled by a recursive call.

    :param oref: Ref of the content to move
    :param lang: language of both versions
    :param old_version_title: version title the content is taken from
    :param new_version_title: version title the content is written to
    """
    if oref.is_spanning():
        for part in oref.split_spanning_ref():
            split_text_section(part, lang, old_version_title, new_version_title)
        return

    source_chunk = TextChunk(oref, lang=lang, vtitle=old_version_title)
    target_chunk = TextChunk(oref, lang=lang, vtitle=new_version_title)

    # Copy content (and the version source) to the new version
    target_chunk.versionSource = source_chunk.version().versionSource
    target_chunk.text = source_chunk.text
    target_chunk.save()

    # Rewrite history: every record of the old version whose ref matches one of the ref's regexes
    history_query = {
        "$or": [
            {"ref": {"$regex": pattern}, "version": old_version_title, "language": lang}
            for pattern in oref.regex(as_list=True)
        ]
    }
    new_version = {"$set": {"version": new_version_title}}
    db.history.update(history_query, new_version, upsert=False, multi=True)

    # Remove content from the old version by replacing it with an all-"" mask
    blanked = JaggedTextArray(source_chunk.text).constant_mask(constant="")
    source_chunk.text = blanked.array()
    source_chunk.save()
def test_modify_text_by_function():
    """
    modify_text_by_function should apply the given function to the whole text: after replacing
    every space in a copy of Job with "$", the copy holds as many "$" as the source had spaces.
    """
    original = TextChunk(Ref("Job"), vtitle="The Holy Scriptures: A New Translation (JPS 1917)")
    total_spaces = JaggedTextArray(original.text).flatten_to_string(joiner="|").count(" ")

    # Temporary version holding a copy of the original text
    v = Version({
        "language": "en",
        "title": "Job",
        "versionSource": "http://foobar.com",
        "versionTitle": "TextChangeTest",
        "chapter": original.text
    }).save()

    try:
        modify_text_by_function("Job", "TextChangeTest", "en", lambda x, sections: x.replace(" ", "$"), 23432)
        modified = TextChunk(Ref("Job"), vtitle="TextChangeTest")
        total_dollars = JaggedTextArray(modified.text).flatten_to_string(joiner="|").count("$")
    finally:
        # Remove the temporary version even when the modification or the read-back fails,
        # so a failed run does not leave "TextChangeTest" behind.
        v.delete()

    assert total_dollars > 0
    assert total_spaces == total_dollars
def resize_text(title, new_structure, upsize_in_place=False):
    # todo: Needs to be converted to objects, but no usages seen in the wild.
    """
    Change the structure of the text named 'title' to 'new_structure'
    (a list of strings naming section names).

    The index record is updated, and every saved text with that title is restructured.

    Growing the depth makes existing text the first segment of the new level:
        ["One", "Two", "Three"] -> [["One"], ["Two"], ["Three"]]
    With upsize_in_place==True, existing text stays intact and is wrapped in the new depth instead:
        ["One", "Two", "Three"] -> [["One", "Two", "Three"]]
    Shrinking the depth loses information, as existing segments are concatenated with " ":
        [["One1", "One2"], ["Two1", "Two2"], ["Three1", "Three2"]] -> ["One1 One2", "Two1 Two2", "Three1 Three2"]

    :return: False when no index with that title exists, True otherwise
    """
    index = db.index.find_one({"title": title})
    if not index:
        return False

    previous_structure = index["sectionNames"]
    index["sectionNames"] = new_structure
    db.index.save(index)

    delta = len(new_structure) - len(previous_structure)
    if delta == 0:
        # Same depth: only the section names changed
        return True

    wrap_only = delta > 0 and upsize_in_place
    for text in db.texts.find({"title": title}):
        if wrap_only:
            # One extra list layer per added level
            resized = text["chapter"]
            for _ in range(delta):
                resized = [resized]
        else:
            resized = JaggedTextArray(text["chapter"]).resize(delta).array()

        text["chapter"] = resized
        db.texts.save(text)

    # TODO Rewrite any existing Links
    # TODO Rewrite any existing History items

    summaries.update_summaries_on_change(title)
    scache.reset_texts_cache()

    return True
def count_completed_translation_requests():
    """
    Returns stats about completed translation requests.

    :return: multi-line report string: the number of completed requests, the words they added, and
        the words found in unmerged text whose version is titled "Sefaria Community Translation"
        (reported as "created"), followed by the same three figures for featured requests only.
    """
    featured = 0
    words = 0
    sct_words = 0
    featured_words = 0
    featured_sct_words = 0

    trs = TranslationRequestSet({"completed": True})
    count = trs.count()

    for tr in trs:
        oref = text.Ref(tr.ref)
        # Fetch the text (and its version) once per request instead of once per use.
        chunk = oref.text()
        version = None if chunk.is_merged else chunk.version()
        is_sct = bool(version) and version.versionTitle == "Sefaria Community Translation"

        n = JaggedTextArray(chunk.text).word_count()
        words += n
        sct_words += n if is_sct else 0
        if getattr(tr, "featured", False):
            featured += 1
            featured_words += n
            featured_sct_words += n if is_sct else 0

    out = "%d total translation requests completed.\n" % count
    out += "%d total words of translation added.\n" % words
    out += "%d total words of translation created.\n" % sct_words
    out += "******\n"
    out += "%d featured translation requests completed.\n" % featured
    out += "%d total words of translation added from featured requests.\n" % featured_words
    out += "%d total words of translation created from featured requests.\n" % featured_sct_words
    return out
def word_count(self):
    """Return the word count of self.as_strings(), as computed by JaggedTextArray.word_count()."""
    strings = self.as_strings()
    jagged_text = JaggedTextArray(strings)
    return jagged_text.word_count()
# -*- coding: utf-8 -*-
"""
Walk every version in the database, normalize the text of each leaf node to that node's depth,
and save only the versions in which something actually changed.
"""
import argparse
import re
from sefaria.model import *
from sefaria.datatype.jagged_array import JaggedTextArray, JaggedArray
from sefaria.system.exceptions import BookNameError

all_versions = VersionSet()
#all_library_nodes = library.get_content_nodes(with_commentary=True)

for version in all_versions:
    print("{}: {}".format(version.title.encode('utf-8'), version.versionTitle.encode('utf-8')))
    version_altered = False
    try:
        idx = version.get_index()
        for node in idx.nodes.get_leaf_nodes():
            print(node)
            ja_text = JaggedTextArray(version.content_node(node))
            if not ja_text.normalize(terminal_depth=node.depth):
                # normalize() reported no change: leave this node alone
                continue
            # only set things that were changed.
            version.sub_content(key_list=node.version_address(), value=ja_text.array())
            version_altered = True
        if version_altered:
            # only go through save if something actually changed
            version.save()
    except BookNameError as e:
        print("no index for {}".format(version.title.encode('utf-8')))
class Commentary:
    """
    Collects the segments of one commentary on one Mishneh Torah book into a JaggedTextArray
    and produces the index and version dicts describing it.
    """

    def __init__(self, **kwargs):
        """
        Required keyword arguments (a missing one raises KeyError):
        en_title, he_title, book, he_book, bid, HilchotId.
        """
        self.commentator, self.he_commentator = kwargs['en_title'], kwargs['he_title']
        self.book, self.he_book = kwargs['book'], kwargs['he_book']
        self.book_id, self.hilchot_id = kwargs['bid'], kwargs['HilchotId']
        self.ja = JaggedTextArray()

    @classmethod
    def build_from_row(cls, row):
        """
        Alternate constructor from a data row.

        :raises CommentaryError: when the row's HilchotId is 2
        """
        if row['HilchotId'] == 2:
            raise CommentaryError

        book, he_book = cls.book_titles_from_row(row)
        return cls(
            en_title=row['en_title'],
            he_title=row['name'],
            book=book,
            he_book=he_book,
            bid=row['bid'],
            HilchotId=row['HilchotId'],
        )

    @staticmethod
    def book_titles_from_row(row):
        """Return (normalized English title, Hebrew title) of the book named in row["Hilchot"]."""
        collection_title = 'משנה תורה'
        he_title = f'{collection_title}, {row["Hilchot"]}'
        return sef_obj.Ref(he_title).normal(), he_title

    def is_part_of_commentary(self, row):
        """True when the row's (bid, HilchotId) pair matches this commentary."""
        row_key = (row['bid'], row['HilchotId'])
        return (self.book_id, self.hilchot_id) == row_key

    def add_segment(self, segment: str, indices: tuple) -> None:
        """Store `segment` at `indices`, in the position given by the current length there (0 when None)."""
        next_slot = self.ja.sub_array_length(indices)
        if next_slot is None:
            next_slot = 0
        self.ja.set_element(indices + (next_slot, ), segment)

    def add_segments_from_row(self, row):
        """Split the row's text into segments and add each one at the row's indices."""
        segments = self.build_segments(row['text'])
        target = self.get_indices_for_row(row)
        for segment in segments:
            self.add_segment(segment, target)

    @staticmethod
    def get_ja(title, he_title) -> dict:
        """Return the serialized Chapter / Halakhah / Comment schema node with the given primary titles."""
        node = sef_obj.JaggedArrayNode()
        node.add_primary_titles(title, he_title)
        node.add_structure(['Chapter', 'Halakhah', 'Comment'])
        node.validate()
        return node.serialize()

    def generate_index(self) -> dict:
        """Return the index dict for "<commentator> on <book>"."""
        title = f'{self.commentator} on {self.book}'
        he_title = f'{self.he_commentator} על {self.he_book}'
        return {
            'title': title,
            'categories': self.get_category(),
            'dependence': 'Commentary',
            'base_text_titles': [self.book],
            'schema': self.get_ja(title, he_title),
            'collective_title': self.commentator,
            'base_text_mapping': 'many_to_one'
        }

    def build_version(self) -> dict:
        """Return the Hebrew version dict wrapping the accumulated text."""
        return {
            'versionTitle': 'Friedberg Edition',
            'versionSource': 'https://fjms.genizah.org',
            'language': 'he',
            'text': self.ja.array()
        }

    @staticmethod
    def build_segments(segment: str) -> list:
        """
        Convert one raw markup string into a list of cleaned-up segment strings.

        Spans are reduced to their last class, merged or unwrapped, footnote marker/note pairs
        (classes R and N) become <sup> + <i class="footnote">, classes S / Z / anything else become
        <small> / <quote> / <b>, whitespace and <br/> runs are tidied, and the result is split on
        each <br/> directly followed by <quote>, with "quote" finally replaced by "b".
        """
        soup = BeautifulSoup('<root>{}</root>'.format(segment), 'xml')
        root = soup.root

        # clear out multiple classes - we're only interested in the last letter in the class
        for span in root.find_all('span'):
            classes = span.get('class', '')
            if classes and isinstance(classes, list):
                span['class'] = span['class'][-1]

        # consolidate duplicate tags and unwrap meaningless tags
        for span in root.find_all('span'):
            neighbour = span.previous_sibling
            if not neighbour:
                continue

            # make sure all text inside spans is separated by a space, we'll remove duplicates later
            if span.string:
                span.string.replace_with(NavigableString(' {}'.format(span.string)))

            if span.get('class', '') == '':
                span.unwrap()
            elif span.name == neighbour.name and span.get('class') == neighbour.get('class'):
                neighbour.append(span)
                span.unwrap()

        # handle footnotes: keep pairing the first marker with the first note until either runs out
        while True:
            marker = root.find('span', attrs={'class': 'R'})
            note = root.find('span', attrs={'class': 'N'})
            if not (marker and note):
                break

            marker.name = 'sup'
            del marker['class']

            # drop the marker text repeated at the start of the note
            note_body = re.sub(r'^{}\s'.format(re.escape(marker.text)), '', note.text)

            footnote = soup.new_tag('i')
            footnote['class'] = 'footnote'
            footnote.string = note_body
            marker.insert_after(footnote)
            note.decompose()

        for tag in root.find_all('span', class_=re.compile('[BZS]')):
            if tag['class'] == 'S':
                tag.name = 'small'
            elif tag['class'] == 'Z':
                tag.name = 'quote'
            else:
                tag.name = 'b'
            del tag['class']

        cleaned = root.decode_contents()
        cleanup_steps = (
            (r'^\s+|\s+$', ''),
            (r'\s{2,}', ' '),
            (r'\s*<br/>\s*', '<br/>'),
            (r'\s*(<br/>)+$', ''),
        )
        for pattern, replacement in cleanup_steps:
            cleaned = re.sub(pattern, replacement, cleaned)

        # break on quotes which immediately follow a break
        pieces = re.split(r'<br/>(?=<quote>)', cleaned)
        return [re.sub(r'quote', 'b', piece) for piece in pieces]

    @staticmethod
    def get_indices_for_row(row) -> tuple:
        """Return (PerekId, HalachaId) from the row, each lowered by one when positive."""
        def adjust(value: int):
            if value > 0:
                return value - 1
            return value

        return adjust(row['PerekId']), adjust(row['HalachaId'])

    def get_term_data(self) -> tuple:
        """Return (English commentator name, Hebrew commentator name)."""
        return self.commentator, self.he_commentator

    def get_category(self) -> tuple:
        """Return the category path, ending with the last category of the base book's index."""
        base_index = sef_obj.library.get_index(self.book)
        return (
            'Halakhah',
            'Mishneh Torah',
            'Commentary',
            self.commentator,
            base_index.categories[-1],
        )
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the client-facing dict describing one link, seen from its anchor side.

    :param link: Link object; link.refs holds the two textual refs
    :param with_text: when truthy, attach the linked text under "text" and "he"
    :param ref: not read by this function
    :param pos: position (0 or 1) of the anchor ref within link.refs. It defaults to None but is
        used directly as a list index and in an attribute name, so callers need to pass it.
    :return: Dict
    """
    # The text we're asked to get links to
    anchor_tref = link.refs[pos]
    anchor_oref = Ref(anchor_tref)
    anchor_tref_expanded = getattr(link, "expandedRefs{}".format(pos))

    # The link we found to the anchor
    link_pos = (pos + 1) % 2
    link_tref = link.refs[link_pos]
    link_oref = Ref(link_tref)
    link_langs = getattr(link, "availableLangs", [[], []])[link_pos]

    com = {}
    com["_id"] = str(link._id)
    com['index_title'] = link_oref.index.title
    com["category"] = link_oref.primary_category  # usually the index's categories[0] or "Commentary".
    com["type"] = link.type
    com["ref"] = link_tref
    com["anchorRef"] = anchor_tref
    com["anchorRefExpanded"] = anchor_tref_expanded
    com["sourceRef"] = link_tref
    com["sourceHeRef"] = link_oref.he_normal()
    com["anchorVerse"] = anchor_oref.sections[-1] if len(anchor_oref.sections) else 0
    com["sourceHasEn"] = "en" in link_langs
    # com["anchorText"] = getattr(link, "anchorText", "") # not currently used

    # Optional link attributes are copied only when present and truthy
    for optional_key in ("inline_reference", "highlightedWords"):
        if getattr(link, optional_key, None):
            com[optional_key] = getattr(link, optional_key, None)

    comp_date = getattr(link_oref.index, "compDate", None)
    if comp_date:
        com["compDate"] = int(comp_date)
        try:
            com["errorMargin"] = int(getattr(link_oref.index, "errorMargin", 0))
        except ValueError:
            com["errorMargin"] = 0

    # Pad out the sections list to the node's depth, so that comparison between
    # comment numbers are apples-to-apples
    missing = link_oref.index_node.depth - len(link_oref.sections)
    padded = link_oref.sections[:] + [0] * missing

    # Build a decimal comment number based on the last two entries of the section array
    if len(padded) == 1:
        com["commentaryNum"] = padded[-1]
    elif len(padded) > 1:
        com["commentaryNum"] = float('{0}.{1:04d}'.format(*padded[-2:]))
    else:
        com["commentaryNum"] = 0

    if with_text:
        family = TextFamily(link_oref, context=0, commentary=False)
        if isinstance(family.text, basestring):
            com["text"] = family.text
        else:
            com["text"] = JaggedTextArray(family.text).flatten_to_array()
        if isinstance(family.he, basestring):
            com["he"] = family.he
        else:
            com["he"] = JaggedTextArray(family.he).flatten_to_array()

    # For commentary links the title comes from the index's collective_title when it has one
    # (e.g. "Rashi on Genesis 4:2" -> "Rashi"); otherwise the index titles are used as is.
    if com["type"] == "commentary":
        com["collectiveTitle"] = {
            'en': getattr(link_oref.index, 'collective_title', link_oref.index.title),
            'he': hebrew_term(getattr(link_oref.index, 'collective_title', link_oref.index.get_title("he")))
        }
    else:
        com["collectiveTitle"] = {
            'en': link_oref.index.title,
            'he': link_oref.index.get_title("he")
        }

    if com["type"] != "commentary" and com["category"] == "Commentary":
        com["category"] = "Quoting Commentary"

    is_dependent_commentary = getattr(link_oref.index, "dependence", None) == "Commentary"
    if com["category"] == "Modern Works" and is_dependent_commentary:
        com["category"] = "Modern Commentary"
        com["collectiveTitle"] = {
            'en': getattr(link_oref.index, 'collective_title', link_oref.index.title),
            'he': hebrew_term(getattr(link_oref.index, 'collective_title', link_oref.index.get_title("he")))
        }

    if link_oref.index_node.primary_title("he"):
        com["heTitle"] = link_oref.index_node.primary_title("he")

    return com
def get_links(tref, with_text=True):
    """
    Return a list of links tied to 'tref' in client format.

    :param tref: textual ref to collect links for
    :param with_text: when truthy, attach the text of each linked ref under "text" and "he".
        Texts are looked up once per top-level section and cached for the duration of the call.
    :return: list of dicts as built by format_link_object_for_client. Links that raise
        InputError or AttributeError while being formatted, or NoVersionFoundError while their
        text is fetched, are left out.
    """
    links = []
    oref = Ref(tref)
    nRef = oref.normal()
    lenRef = len(nRef)
    # Only ranged refs are matched by regex; everything else uses the prefix test below
    reRef = oref.regex() if oref.is_range() else None

    # for storing all the section level texts that need to be looked up
    texts = {}

    linkset = LinkSet(oref)
    # For all links that mention ref (in any position)
    for link in linkset:
        # each link contains 2 refs in a list
        # find the position (0 or 1) of "anchor", the one we're getting links for
        if reRef:
            pos = 0 if re.match(reRef, link.refs[0]) else 1
        else:
            # anchor is refs[0] when refs[0] starts with the normalized ref
            pos = 0 if nRef == link.refs[0][:lenRef] else 1
        try:
            # Text is attached below (with caching), so it is not requested here
            com = format_link_object_for_client(link, False, nRef, pos)
        except InputError:
            # logger.warning("Bad link: {} - {}".format(link.refs[0], link.refs[1]))
            continue
        except AttributeError as e:
            logger.error(
                u"AttributeError in presenting link: {} - {} : {}".format(
                    link.refs[0], link.refs[1], e))
            continue

        # Rather than getting text with each link, walk through all links here,
        # caching text so that redundant DB calls can be minimized
        # If link is spanning, split into section refs and rejoin
        try:
            if with_text:
                original_com_oref = Ref(com["ref"])
                com_orefs = original_com_oref.split_spanning_ref()
                for com_oref in com_orefs:
                    top_oref = com_oref.top_section_ref()
                    # Lookup and save top level text, only if we haven't already
                    top_nref = top_oref.normal()
                    if top_nref not in texts:
                        texts[top_nref] = TextFamily(top_oref,
                                                     context=0,
                                                     commentary=False,
                                                     pad=False).contents()
                        # Wrap both languages once, so subarray() can be used on the cached copy
                        for t in ["text", "he"]:
                            texts[top_nref][t] = JaggedTextArray(
                                texts[top_nref][t])
                    # Addresses relative to the cached top-level section (first section dropped)
                    sections, toSections = com_oref.sections[
                        1:], com_oref.toSections[1:]
                    for t in ["text", "he"]:
                        # The stored section numbers are shifted down by one for subarray()
                        res = texts[top_nref][t].subarray(
                            [i - 1 for i in sections],
                            [i - 1 for i in toSections]).array()
                        if t not in com:
                            com[t] = res
                        else:
                            # Later parts of a spanning ref are appended to what is already there
                            if isinstance(com[t], basestring):
                                com[t] = [com[t]]
                            com[t] += res
                        # The string literal below is disabled legacy code; it has no effect.
                        '''
                        next_section = grab_section_from_text(sections, texts[top_nref][t], toSections)
                        if t not in com:
                            com[t] = next_section
                        elif isinstance(com[t], list):
                            if isinstance(next_section, list):
                                com[t] += next_section
                            else:
                                com[t] += [next_section]
                        else: #com[t] is string
                            if isinstance(next_section, list):
                                com[t] = [com[t]] + next_section
                            else:
                                com[t] += u" " + next_section
                        '''
            links.append(com)
        except NoVersionFoundError as e:
            # top_nref is whatever the loop above last assigned before the error was raised
            logger.warning(
                "Trying to get non existent text for ref '{}'. Link refs were: {}"
                .format(top_nref, link.refs))
            continue

    return links
def _derive_anchor_pos(link, ref):
    """
    Return the position (0 or 1) in link.refs of the ref the links were requested for.

    A ranged ref is matched against link.refs[0] with its regex; any other ref
    is matched as a string prefix of link.refs[0]. Anything that does not match
    position 0 is taken to be at position 1.
    """
    import re  # local import: keeps this block independent of the file's import list

    oref = Ref(ref) if isinstance(ref, basestring) else ref
    pattern = oref.regex() if oref.is_range() else None
    if pattern:
        return 0 if re.match(pattern, link.refs[0]) else 1
    nref = oref.normal()
    return 0 if nref == link.refs[0][:len(nref)] else 1


def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the dict describing one link, as seen from the anchor ref.

    :param link: Link object
    :param with_text: If True, load the linked text and add it under "text" and "he"
    :param ref: Ref object, or string ref, of the source of the link.
        Only used to derive `pos` when `pos` is not passed.
    :param pos: Optional position (0 or 1) of the Ref in the Link.
        If not passed, it will be derived from the first and third arguments.
    :return: Dict
    """
    if pos is None:
        # Previously link.refs[None] raised TypeError here, despite the
        # documented default.
        pos = _derive_anchor_pos(link, ref)

    com = {}

    # The text we're asked to get links to
    anchorRef = Ref(link.refs[pos])

    # The link we found to anchorRef
    linkRef = Ref(link.refs[(pos + 1) % 2])

    com["_id"] = str(link._id)
    com['index_title'] = linkRef.index.title
    com["category"] = linkRef.type
    com["type"] = link.type
    com["ref"] = linkRef.tref
    com["anchorRef"] = anchorRef.normal()
    com["sourceRef"] = linkRef.normal()
    com["sourceHeRef"] = linkRef.he_normal()
    com["anchorVerse"] = anchorRef.sections[-1] if len(anchorRef.sections) else 0
    com["anchorText"] = getattr(link, "anchorText", "")

    # Pad out the sections list, so that comparison between comment numbers are apples-to-apples
    padding = [0] * (linkRef.index_node.depth - len(linkRef.sections))
    lsections = linkRef.sections[:] + padding

    # Build a decimal comment number based on the last two digits of the section array
    if len(lsections) == 1:
        com["commentaryNum"] = lsections[-1]
    elif len(lsections) > 1:
        com["commentaryNum"] = float('{0}.{1:04d}'.format(*lsections[-2:]))
    else:
        com["commentaryNum"] = 0

    if com["category"] in REORDER_RULES:
        com["category"] = REORDER_RULES[com["category"]][0]

    if with_text:
        text = TextFamily(linkRef, context=0, commentary=False)
        # Strings are passed through as-is; anything else is flattened to an array.
        com["text"] = text.text if isinstance(text.text, basestring) \
            else JaggedTextArray(text.text).flatten_to_array()
        com["he"] = text.he if isinstance(text.he, basestring) \
            else JaggedTextArray(text.he).flatten_to_array()

    # If the link is commentary, strip redundant info (e.g. "Rashi on Genesis 4:2" -> "Rashi")
    if com["type"] == "commentary":
        com["commentator"] = linkRef.book.split(" on ")[0]
        com["heCommentator"] = linkRef.he_book().split(u" על ")[0]
    else:
        if com["category"] == "Commentary":
            com["category"] = "Quoting Commentary"
        com["commentator"] = linkRef.index.title
        # Fall back to the English title when there is no Hebrew one.
        he_title = linkRef.index.get_title("he")
        com["heCommentator"] = he_title if he_title else com["commentator"]

    if link.type == "targum":
        com["category"] = "Targum"

    he_node_title = linkRef.index_node.primary_title("he")
    if he_node_title:
        com["heTitle"] = he_node_title

    return com
"language" : "he", "versionSource" : "https://he.wikisource.org/wiki/תלמוד_בבלי" }).save() whole_ref = Ref('Rashi on Bava Batra') whole_moved_ref = Ref('Rashbam on Bava Batra') stay_section_ref = Ref('Rashi on Bava Batra.2a.1.1-29a.9.1') move_section_ref = Ref('Rashi on Bava Batra.29a.9.2-176b.4.2') orig_tc = TextChunk(whole_ref, 'he', rashbam_bava_batra_he.versionTitle) dest_tc = TextChunk(whole_moved_ref, 'he', rashbam_bava_batra_he.versionTitle) #get the two slices of the whole text, corresponding to the new texts jatext_tostay = JaggedTextArray(orig_tc.text).subarray_with_ref(stay_section_ref).array() jatext_tomove = copy.deepcopy(JaggedTextArray(orig_tc.text).subarray_with_ref(move_section_ref).array()) #the piece of text being moved needs to be padded so that its overall structure matches the original structure jatext_tostay = pad_moved_ja(jatext_tostay, stay_section_ref.sections) jatext_tomove = pad_moved_ja(jatext_tomove, move_section_ref.sections) orig_tc.text = jatext_tostay orig_tc.save() dest_tc.text = jatext_tomove dest_tc.save() r_gershom_index = Index({ "title":'Rabbeinu Gershom',
def do_copy(self):
    """
    Post the loaded index, versions and links to the destination server.

    Steps, in order:
      1. load_objects() is called.
      2. If self._post_index is set, terms and categories are handled and the
         raw index contents are posted.
      3. For every version in self._version_objs, the text of each leaf node
         of the index that has content is posted. If no node had content, the
         last node is posted once with dummy text and once more with empty
         text. Any optional attributes present on the version are then posted.
      4. If self._post_links is set and there are links, they are posted in
         batches of self._post_links_step.

    Side effect: self._post_links_step is reset to len(self._linkset) when it
    is non-positive or larger than the number of links.
    """
    self.load_objects()

    if self._post_index:
        idx_contents = self._index_obj.contents(raw=True)
        idx_title = self._index_obj.title
        self.post_terms_from_schema()
        self._handle_categories()
        self._make_post_request_to_server(
            self._prepare_index_api_call(idx_title), idx_contents)

    content_nodes = self._index_obj.nodes.get_leaf_nodes()
    node_count = len(content_nodes)

    for ver in self._version_objs:
        print(ver.versionTitle.encode('utf-8'))
        flags = {
            flag: getattr(ver, flag, None)
            for flag in ver.optional_attrs
            if hasattr(ver, flag)
        }

        found_non_empty_content = False
        # Tracked explicitly instead of relying on loop variables leaking out
        # of the for loop; they are unbound when there are no leaf nodes.
        last_node = None
        version_payload = None

        for node_num, node in enumerate(content_nodes, 1):
            node_title = node.full_title(force_update=True)
            print(node_title)
            text = JaggedTextArray(ver.content_node(node)).array()
            version_payload = {
                "versionTitle": ver.versionTitle,
                "versionSource": ver.versionSource,
                "language": ver.language,
                "text": text
            }
            last_node = node

            if len(text) > 0:
                # only bother posting nodes that have content.
                found_non_empty_content = True
                if node_num == node_count:
                    # count_after is only requested on the final node.
                    api_call = self._prepare_text_api_call(
                        node_title, count_after=True)
                else:
                    api_call = self._prepare_text_api_call(node_title)
                self._make_post_request_to_server(api_call, version_payload)

        if not found_non_empty_content and last_node is not None:
            # post the last node again with dummy text, to make sure an actual version db object is created
            # then post again to clear the dummy text
            dummy_text = "This is a dummy text"
            empty = ""
            # Nest both values to the depth of the node.
            for _ in range(last_node.depth):
                dummy_text = [dummy_text]
                empty = [empty]
            version_payload['text'] = dummy_text
            self._make_post_request_to_server(
                self._prepare_text_api_call(last_node.full_title()),
                version_payload)
            version_payload['text'] = empty
            self._make_post_request_to_server(
                self._prepare_text_api_call(last_node.full_title()),
                version_payload)

        if flags:
            self._make_post_request_to_server(
                self._prepare_version_attrs_api_call(
                    ver.title, ver.language, ver.versionTitle), flags)

    if self._post_links and len(self._linkset) > 0:
        if self._post_links_step <= 0 or self._post_links_step > len(self._linkset):
            self._post_links_step = len(self._linkset)
        # Links carrying a truthy source_text_oid attribute are not posted.
        links = [
            l.contents() for l in self._linkset
            if not getattr(l, 'source_text_oid', None)
        ]
        for i in range(0, len(links), self._post_links_step):
            self._make_post_request_to_server(
                self._prepare_links_api_call(),
                links[i:i + self._post_links_step])
# -*- coding: utf-8 -*-
# Walk every version, normalize the text of each leaf node of its index,
# and save only the versions in which normalization changed something.
import argparse
import re
from sefaria.model import *
from sefaria.datatype.jagged_array import JaggedTextArray, JaggedArray
from sefaria.system.exceptions import BookNameError

all_versions = VersionSet()

for version in all_versions:
    title_bytes = version.title.encode('utf-8')
    print("{}: {}".format(title_bytes, version.versionTitle.encode('utf-8')))
    try:
        leaf_nodes = version.get_index().nodes.get_leaf_nodes()
        dirty = False
        for node in leaf_nodes:
            print(node)
            ja_text = JaggedTextArray(version.content_node(node))
            if not ja_text.normalize(terminal_depth=node.depth):
                # normalize() reported no change; leave this node alone.
                continue
            version.sub_content(key_list=node.version_address(),
                                value=ja_text.array())
            dirty = True
        if dirty:
            # Only go through save if something actually changed.
            version.save()
    except BookNameError:
        print("no index for {}".format(title_bytes))