Example #1
0
def bulktext_api(request, refs):
    """
    Bulk text lookup used by the linker.

    Only GET requests are handled; for any other method nothing is returned.

    :param request: request object; its GET parameters "callback" and
        "useTextFamily" are read
    :param refs: "|"-separated string of textual refs (duplicates are collapsed)
    :return: jsonResponse of a dict keyed by the requested ref strings.  Each
        value holds the text data for that ref, or {"error": 1} when the ref
        could not be processed.
    """
    if request.method != "GET":
        return

    def as_string(content):
        # these could be flattened on the client, if need be.
        if isinstance(content, basestring):
            return content
        return JaggedTextArray(content).flatten_to_string()

    cb = request.GET.get("callback", None)
    useTextFamily = request.GET.get("useTextFamily", None)
    res = {}
    for tref in set(refs.split("|")):
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if useTextFamily:
                text_fam = model.TextFamily(oref, commentary=0, context=0, pad=False)
                entry = {
                    'he': text_fam.he,
                    'en': text_fam.text,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url(),
                }
            else:
                he = model.TextChunk(oref, "he").text
                en = model.TextChunk(oref, "en").text
                entry = {
                    'he': as_string(he),
                    'en': as_string(en),
                    'lang': lang,
                    'ref': oref.normal(),
                    'heRef': oref.he_normal(),
                    'url': oref.url(),
                }
            res[tref] = entry
        except (InputError, ValueError, AttributeError, KeyError):
            # Failures are deliberately not logged here: the chatter fills up
            # the logs.  todo: put it in its own file
            res[tref] = {"error": 1}
    return jsonResponse(res, cb)
Example #2
0
def bulktext_api(request, refs):
    """
    Bulk text lookup used by the linker.

    Only GET requests are handled; any other method yields None.

    :param request: request object; the "callback" GET parameter and the
        HTTP_REFERER header (for the failure log message) are read
    :param refs: "|"-separated string of textual refs (duplicates are collapsed)
    :return: jsonResponse, with an 'Access-Control-Allow-Origin: *' header set,
        of a dict keyed by ref string.  Refs that could not be processed map
        to {"error": 1}.
    """
    if request.method != "GET":
        return None

    cb = request.GET.get("callback", None)
    res = {}
    for tref in set(refs.split("|")):
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            he = model.TextChunk(oref, "he").text
            en = model.TextChunk(oref, "en").text
            # these could be flattened on the client, if need be.
            if not isinstance(he, basestring):
                he = JaggedTextArray(he).flatten_to_string()
            if not isinstance(en, basestring):
                en = JaggedTextArray(en).flatten_to_string()
            res[tref] = dict(
                he=he,
                en=en,
                lang=lang,
                ref=oref.normal(),
                heRef=oref.he_normal(),
                url=oref.url(),
            )
        except (InputError, ValueError, AttributeError) as e:
            referer = request.META.get("HTTP_REFERER", "unknown page")
            logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}

    resp = jsonResponse(res, cb)
    resp['Access-Control-Allow-Origin'] = '*'
    return resp
Example #3
0
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the dict describing a link as seen from one of its two refs.

    :param link: Link object
    :param with_text: when truthy, the English and Hebrew text of the linked ref are included
    :param ref: Ref object of the source of the link (not read by this function)
    :param pos: Position in link.refs of the ref that links were requested for.
        It is used directly as a list index, so despite the None default it must be supplied.
    :return: Dict
    """
    # The text we're asked to get links to
    anchorRef = Ref(link.refs[pos])

    # The link we found to anchorRef
    linkRef = Ref(link.refs[(pos + 1) % 2])

    com = {
        "_id": str(link._id),
        'index_title': linkRef.index.title,
        # usually the index's categories[0] or "Commentary".
        "category": linkRef.primary_category,
        "type": link.type,
        "ref": linkRef.tref,
        "anchorRef": anchorRef.normal(),
        "sourceRef": linkRef.normal(),
        "sourceHeRef": linkRef.he_normal(),
        "anchorVerse": anchorRef.sections[-1] if len(anchorRef.sections) else 0,
        "anchorText": getattr(link, "anchorText", ""),
    }

    # Pad out the sections list, so that comparison between comment numbers are apples-to-apples
    lsections = linkRef.sections[:] + [0] * (linkRef.index_node.depth - len(linkRef.sections))
    # Build a decimal comment number based on the last two digits of the section array
    if len(lsections) == 1:
        com["commentaryNum"] = lsections[-1]
    elif len(lsections) > 1:
        com["commentaryNum"] = float('{0}.{1:04d}'.format(*lsections[-2:]))
    else:
        com["commentaryNum"] = 0

    if with_text:
        text = TextFamily(linkRef, context=0, commentary=False)
        if isinstance(text.text, basestring):
            com["text"] = text.text
        else:
            com["text"] = JaggedTextArray(text.text).flatten_to_array()
        if isinstance(text.he, basestring):
            com["he"] = text.he
        else:
            com["he"] = JaggedTextArray(text.he).flatten_to_array()

    if com["type"] == "commentary":
        # For commentary links, strip redundant info (e.g. "Rashi on Genesis 4:2" -> "Rashi")
        # by preferring the index record's collective_title when it has one.
        def he_group_title():
            return hebrew_term(getattr(linkRef.index, 'collective_title', linkRef.index.get_title("he")))

        en_group_title = getattr(linkRef.index, 'collective_title', linkRef.index.title)
        com["linkGroupTitle"] = {'en': en_group_title, 'he': he_group_title()}
        com["commentator"] = en_group_title  # TODO: deprecate
        com["heCommentator"] = he_group_title()  # TODO: deprecate
    else:
        com["linkGroupTitle"] = {'en': linkRef.index.title, 'he': linkRef.index.get_title("he")}
        com["commentator"] = linkRef.index.title  # TODO: deprecate
        com["heCommentator"] = linkRef.index.get_title("he")  # TODO: deprecate
        if com["category"] == "Commentary":
            # A commentary text reached through a non-commentary link.
            # add a fix here for quoting commentary appearing together with commentary in s2 panels
            com["category"] = "Quoting Commentary"

    if linkRef.index_node.primary_title("he"):
        com["heTitle"] = linkRef.index_node.primary_title("he")

    return com
Example #4
0
 def __init__(self, **kwargs):
     """
     Copy the commentator and book details from the keyword arguments onto
     the instance and attach a JaggedTextArray built with no arguments.

     Required keys: 'en_title', 'he_title', 'book', 'he_book', 'bid' and
     'HilchotId'.  A missing key raises KeyError.
     """
     attribute_sources = (
         ('commentator', 'en_title'),
         ('he_commentator', 'he_title'),
         ('book', 'book'),
         ('he_book', 'he_book'),
         ('book_id', 'bid'),
         ('hilchot_id', 'HilchotId'),
     )
     for attribute, key in attribute_sources:
         setattr(self, attribute, kwargs[key])
     self.ja = JaggedTextArray()
Example #5
0
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the dict describing a link as seen from one of its two refs.

    :param link: Link object
    :param with_text: when truthy, the flattened English and Hebrew text of the linked ref are included
    :param ref: Ref object of the source of the link (not read by this function)
    :param pos: Position in link.refs of the ref that links were requested for.
        It is used directly as a list index, so despite the None default it must be supplied.
    :return: Dict
    """
    # The text we're asked to get links to
    anchorRef = Ref(link.refs[pos])

    # The link we found to anchorRef
    linkRef = Ref(link.refs[(pos + 1) % 2])

    com = {
        "_id": str(link._id),
        'index_title': linkRef.index.title,
        "category": linkRef.type,
        "type": link.type,
        "ref": linkRef.tref,
        "anchorRef": anchorRef.normal(),
        "sourceRef": linkRef.normal(),
        "sourceHeRef": linkRef.he_normal(),
        "anchorVerse": anchorRef.sections[-1] if len(anchorRef.sections) else 0,
        "commentaryNum": linkRef.sections[-1] if linkRef.type == "Commentary" else 0,
        "anchorText": getattr(link, "anchorText", ""),
    }

    # Categories listed in REORDER_RULES are replaced by the first entry of their rule.
    if com["category"] in REORDER_RULES:
        com["category"] = REORDER_RULES[com["category"]][0]

    if with_text:
        text = TextFamily(linkRef, context=0, commentary=False)
        com["text"] = JaggedTextArray(text.text).flatten_to_array()
        com["he"] = JaggedTextArray(text.he).flatten_to_array()

    is_commentary_link = com["type"] == "commentary"
    if not is_commentary_link and com["category"] == "Commentary":
        # A commentary text reached through a non-commentary link.
        com["category"] = "Quoting Commentary"

    if is_commentary_link:
        # strip redundant info (e.g. "Rashi on Genesis 4:2" -> "Rashi")
        com["commentator"] = linkRef.book.split(" on ")[0]
        com["heCommentator"] = linkRef.he_book().split(u" על ")[0]
    else:
        com["commentator"] = linkRef.index.title
        # Fall back to the English title when there is no Hebrew one.
        if linkRef.index.get_title("he"):
            com["heCommentator"] = linkRef.index.get_title("he")
        else:
            com["heCommentator"] = com["commentator"]

    if link.type == "targum":
        com["category"] = "Targum"

    if linkRef.index_node.primary_title("he"):
        com["heTitle"] = linkRef.index_node.primary_title("he")

    return com
Example #6
0
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the dict describing a link as seen from one of its two refs.

    :param link: Link object
    :param with_text: when truthy, the flattened English and Hebrew text of the linked ref are included
    :param ref: Ref of the source of the link.  It is tested with ``ref in linkRef.normal()``,
        so it is presumably passed as a string here - TODO confirm against callers.
    :param pos: Position in link.refs of the ref that links were requested for.
        It is used directly as a list index, so despite the None default it must be supplied.
    :return: Dict
    """
    com = {}

    # The text we're asked to get links to
    anchorRef = Ref(link.refs[pos])

    # The link we found to anchorRef
    linkRef = Ref(link.refs[(pos + 1) % 2])

    com["_id"] = str(link._id)
    com["category"] = linkRef.type
    com["type"] = link.type
    com["ref"] = linkRef.tref
    com["anchorRef"] = anchorRef.normal()
    com["sourceRef"] = linkRef.normal()
    com["sourceHeRef"] = linkRef.he_normal()
    # An anchor ref without sections has no last section to report; use 0
    # instead of raising IndexError.
    com["anchorVerse"] = anchorRef.sections[-1] if len(anchorRef.sections) else 0
    com["commentaryNum"] = linkRef.sections[-1] if linkRef.type == "Commentary" else 0
    com["anchorText"] = getattr(link, "anchorText", "")

    if with_text:
        text = TextFamily(linkRef, context=0, commentary=False)
        com["text"] = JaggedTextArray(text.text).flatten_to_array()
        com["he"] = JaggedTextArray(text.he).flatten_to_array()

    # strip redundant verse ref for commentators
    # if the ref we're looking for appears exactly in the commentary ref, strip redundant info
    # todo: this comparison - ref in linkRef.normal() - seems brittle.  Make it rigorous.
    if com["category"] == "Commentary" and ref in linkRef.normal():
        com["commentator"] = linkRef.index.commentator
        # Fall back to the English commentator name when no Hebrew one is set.
        com["heCommentator"] = linkRef.index.heCommentator if getattr(
            linkRef.index, "heCommentator", None) else com["commentator"]
    else:
        com["commentator"] = linkRef.book
        # Fall back to the English book name when the node has no Hebrew primary title.
        com["heCommentator"] = linkRef.index_node.primary_title(
            "he") if linkRef.index_node.primary_title(
                "he") else com["commentator"]

    if linkRef.index_node.primary_title("he"):
        com["heTitle"] = linkRef.index_node.primary_title("he")

    return com
Example #7
0
 def content_node_merger(snode, *contents, **kwargs):
     """
     Merge two pieces of content that belong to the same schema node.

     :param snode: SchemaContentNode; its full title is used in the overlap error message
     :param contents: Length two array of content.  Second is merged into first and returned.
     :param kwargs: "sources": array of source names
     :return: the merged text produced by merge_texts
     :raises Exception: when `warn` (a name defined outside this function) is truthy
         and the two contents overlap
     """
     assert len(contents) == 2
     first = contents[0]
     second = contents[1]
     if warn:
         if JaggedTextArray(first).overlaps(JaggedTextArray(second)):
             raise Exception("WARNING - overlapping content in {}".format(snode.full_title()))
     # merge_texts also hands back the per-segment sources, which are not needed here.
     merged_text, _merged_sources = merge_texts([first, second], kwargs.get("sources"))
     return merged_text
Example #8
0
def bundle_many_texts(refs, useTextFamily=False, as_sized_string=False, min_char=None, max_char=None):
    """
    Collect Hebrew and English text data for a batch of textual refs.

    :param refs: iterable of ref strings
    :param useTextFamily: when truthy, texts are read from a model.TextFamily and the
        entry also carries 'primary_category'
    :param as_sized_string: when truthy (and useTextFamily is falsy), texts are obtained
        from TextChunk.as_sized_string instead of being flattened to a string
    :param min_char: passed on to as_sized_string when truthy
    :param max_char: passed on to as_sized_string when truthy
    :return: dict keyed by ref string; refs that could not be processed map to {"error": 1}
    """
    def flatten(content):
        # these could be flattened on the client, if need be.
        if isinstance(content, str):
            return content
        return JaggedTextArray(content).flatten_to_string()

    bundle = {}
    for tref in refs:
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"

            if useTextFamily:
                text_fam = model.TextFamily(oref, commentary=0, context=0, pad=False)
                bundle[tref] = {
                    'he': text_fam.he,
                    'en': text_fam.text,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url(),
                }
                continue

            he_tc = model.TextChunk(oref, "he")
            en_tc = model.TextChunk(oref, "en")
            if as_sized_string:
                # Only forward the limits that were actually given.
                size_limits = {
                    name: limit
                    for name, limit in (('min_char', min_char), ('max_char', max_char))
                    if limit
                }
                he_text = he_tc.as_sized_string(**size_limits)
                en_text = en_tc.as_sized_string(**size_limits)
            else:
                he, en = he_tc.text, en_tc.text
                he_text, en_text = flatten(he), flatten(en)

            bundle[tref] = {
                'he': he_text,
                'en': en_text,
                'lang': lang,
                'ref': oref.normal(),
                'heRef': oref.he_normal(),
                'url': oref.url(),
            }
        except (InputError, ValueError, AttributeError, KeyError):
            # Failures are deliberately not logged here: the chatter fills up
            # the logs.  todo: put it in its own file
            bundle[tref] = {"error": 1}
    return bundle
Example #9
0
 def do_copy(self):
     """
     Post the loaded index, version texts, version flags and links through
     self._make_post_request_to_server.

     The index is posted only when self._post_index is set and the links only
     when self._post_links is set.  For every version, each leaf node that has
     content is posted; a version without any content gets a dummy text posted
     and then cleared on its last node.
     """
     self.load_objects()
     post = self._make_post_request_to_server

     if self._post_index:
         if isinstance(self._index_obj, CommentaryIndex):
             index_record = self._index_obj.c_index
         elif isinstance(self._index_obj, Index):
             index_record = self._index_obj
         idx_contents = index_record.contents(raw=True)
         idx_title = index_record.title
         post(self._prepare_index_api_call(idx_title), idx_contents)

     content_nodes = self._index_obj.nodes.get_leaf_nodes()
     for ver in self._version_objs:
         print(ver.versionTitle.encode('utf-8'))
         # Optional attributes that are actually set on this version.
         flags = {
             flag: getattr(ver, flag, None)
             for flag in ver.optional_attrs
             if hasattr(ver, flag)
         }

         found_non_empty_content = False
         for node in content_nodes:
             print(node.full_title(force_update=True))
             text = JaggedTextArray(ver.content_node(node)).array()
             version_payload = {
                 "versionTitle": ver.versionTitle,
                 "versionSource": ver.versionSource,
                 "language": ver.language,
                 "text": text
             }
             if len(text) == 0:
                 # only bother posting nodes that have content.
                 continue
             found_non_empty_content = True
             post(self._prepare_text_api_call(node.full_title(force_update=True)),
                  version_payload)

         if not found_non_empty_content:
             # post the last node again with dummy text, to make sure an actual version db object is created
             # then post again to clear the dummy text
             dummy_text = "This is a dummy text"
             for _ in range(node.depth):
                 dummy_text = [dummy_text]
             for placeholder in (dummy_text, []):
                 version_payload['text'] = placeholder
                 post(self._prepare_text_api_call(node.full_title()), version_payload)

         if flags:
             post(self._prepare_version_attrs_api_call(ver.title, ver.language, ver.versionTitle),
                  flags)

     if self._post_links:
         # Links that carry a source_text_oid are left out.
         links = []
         for link in self._linkset:
             if not getattr(link, 'source_text_oid', None):
                 links.append(link.contents())
         post(self._prepare_links_api_call(), links)
Example #10
0
 def do_copy(self):
     """
     Post the loaded index, version texts, version flags and links through
     self._make_post_request_to_server.

     The index is posted only when self._post_index is set and the links only
     when self._post_links is set.  For every version, the content of each
     leaf node is posted, followed by the version's optional attributes when
     any are set.
     """
     self.load_objects()
     if self._post_index:
         if isinstance(self._index_obj, CommentaryIndex):
             idx_contents = self._index_obj.c_index.contents(raw=True)
             idx_title = self._index_obj.c_index.title
         elif isinstance(self._index_obj, Index):
             idx_contents = self._index_obj.contents(raw=True)
             idx_title = self._index_obj.title
         self._make_post_request_to_server(self._prepare_index_api_call(idx_title), idx_contents)
     content_nodes = self._index_obj.nodes.get_leaf_nodes()
     for ver in self._version_objs:
         print(ver.versionTitle.encode('utf-8'))
         # Collect the optional attributes that are actually set on this version.
         flags = {}
         for flag in ver.optional_attrs:
             if hasattr(ver, flag):
                 flags[flag] = getattr(ver, flag, None)
         for node in content_nodes:
             text = JaggedTextArray(ver.content_node(node)).array()
             version_payload = {
                 "versionTitle": ver.versionTitle,
                 "versionSource": ver.versionSource,
                 "language": ver.language,
                 "text": text
             }
             self._make_post_request_to_server(self._prepare_text_api_call(node.full_title(force_update=True)), version_payload)
         if flags:
             self._make_post_request_to_server(self._prepare_version_attrs_api_call(ver.title, ver.language, ver.versionTitle), flags)
     if self._post_links:
         links = [l.contents() for l in self._linkset]
         self._make_post_request_to_server(self._prepare_links_api_call(), links)
Example #11
0
    def _node_count(self, snode, lang="en"):
        """
        Count available versions of a text in the db, segment by segment.

        Adds up the mask() of the content that each version returned by
        self.versions(lang) holds for `snode`.

        :param snode: schema node whose content is read from every version
        :param lang: language handed to self.versions
        :return counts:
        :type return: JaggedIntArray
        """
        counts = JaggedIntArray()
        for version in self.versions(lang):
            version_mask = JaggedTextArray(version.content_node(snode)).mask()
            counts = counts + version_mask
        return counts
Example #12
0
    def _node_count(self, snode, lang="en"):
        """
        Count available versions of a text in the db, segment by segment.

        Every version from self.versions(lang) contributes the mask() of its
        content for `snode`; the masks are summed.

        :param snode: schema node whose content is read from every version
        :param lang: language handed to self.versions
        :return counts:
        :type return: JaggedIntArray
        """
        total = JaggedIntArray()
        for ver in self.versions(lang):
            node_content = ver.content_node(snode)
            total = total + JaggedTextArray(node_content).mask()
        return total
Example #13
0
def split_text_section(oref, lang, old_version_title, new_version_title):
    """
    Move the content covered by `oref` out of `old_version_title` and into `new_version_title`.
    History records for the affected content are re-pointed at the new version.

    NOTE: `oref` cannot be ranging (until we implement saving ranging refs on TextChunk).
    Spanning refs are handled recursively, one call per ref from split_spanning_ref().
    """
    if oref.is_spanning():
        for part in oref.split_spanning_ref():
            split_text_section(part, lang, old_version_title, new_version_title)
        return

    source_chunk = TextChunk(oref, lang=lang, vtitle=old_version_title)
    target_chunk = TextChunk(oref, lang=lang, vtitle=new_version_title)

    # Copy content to new version
    target_chunk.versionSource = source_chunk.version().versionSource
    target_chunk.text = source_chunk.text
    target_chunk.save()

    # Rewrite History: one clause per regex that the ref produces
    history_query = {
        "$or": [
            {"ref": {"$regex": pattern}, "version": old_version_title, "language": lang}
            for pattern in oref.regex(as_list=True)
        ]
    }
    db.history.update(history_query, {"$set": {"version": new_version_title}}, upsert=False, multi=True)

    # Remove content from old version
    blanked = JaggedTextArray(source_chunk.text).constant_mask(constant="").array()
    source_chunk.text = blanked
    source_chunk.save()
Example #14
0
def test_modify_text_by_function():
    """
    Check that modify_text_by_function rewrites every segment of a version.

    The JPS 1917 text of Job is copied into a scratch version named
    "TextChangeTest", every space in it is replaced by "$" through
    modify_text_by_function, and the number of "$" afterwards must equal the
    number of spaces in the original text.
    """
    original = TextChunk(Ref("Job"), vtitle="The Holy Scriptures: A New Translation (JPS 1917)")
    total_spaces = JaggedTextArray(original.text).flatten_to_string(joiner="|").count(" ")

    v = Version({
        "language": "en",
        "title": "Job",
        "versionSource": "http://foobar.com",
        "versionTitle": "TextChangeTest",
        "chapter": original.text
    }).save()

    try:
        # NOTE(review): 23432 is presumably the id of the acting user - confirm.
        modify_text_by_function("Job", "TextChangeTest", "en", lambda x, sections: x.replace(" ", "$"), 23432)
        modified = TextChunk(Ref("Job"), vtitle="TextChangeTest")
        total_dollars = JaggedTextArray(modified.text).flatten_to_string(joiner="|").count("$")
    finally:
        # Remove the scratch version even when the modification or the reload
        # raises, so a failing run does not leave it behind.
        v.delete()

    assert total_dollars > 0
    assert total_spaces == total_dollars
Example #15
0
def resize_text(title, new_structure, upsize_in_place=False):
    # todo: Needs to be converted to objects, but no usages seen in the wild.
    """
    Change the structure of the text named 'title' to 'new_structure'
    (a list of strings naming section names).

    The index record is updated, and every saved text with that title is restructured.

    When increasing size, any existing text will become the first segment of the new level
    ["One", "Two", "Three"] -> [["One"], ["Two"], ["Three"]]

    If upsize_in_place==True, existing text will stay intact, but be wrapped in new depth:
    ["One", "Two", "Three"] -> [["One", "Two", "Three"]]

    When decreasing size, information is lost as any existing segments are concatenated with " "
    [["One1", "One2"], ["Two1", "Two2"], ["Three1", "Three2"]] -> ["One1 One2", "Two1 Two2", "Three1 Three2"]

    Returns False when no index record exists for 'title', True otherwise.
    """
    index = db.index.find_one({"title": title})
    if not index:
        return False

    old_structure = index["sectionNames"]
    index["sectionNames"] = new_structure
    db.index.save(index)

    # Number of levels added (positive) or removed (negative).
    delta = len(new_structure) - len(old_structure)
    if delta == 0:
        return True

    wrap_only = delta > 0 and upsize_in_place
    for text in db.texts.find({"title": title}):
        if wrap_only:
            # Keep the content as is and nest it `delta` levels deeper.
            resized = text["chapter"]
            for _ in range(delta):
                resized = [resized]
        else:
            resized = JaggedTextArray(text["chapter"]).resize(delta).array()

        text["chapter"] = resized
        db.texts.save(text)

    # TODO Rewrite any existing Links
    # TODO Rewrite any existing History items

    summaries.update_summaries_on_change(title)
    scache.reset_texts_cache()

    return True
Example #16
0
def count_completed_translation_requests():
    """
    Returns stats about completed translation requests.

    :return: a multi-line string reporting the number of completed requests and
        word counts, overall and for requests flagged as featured.  Words are
        counted as "created" when the text is not merged and its version title
        is "Sefaria Community Translation".
    """
    featured = 0
    words = 0
    sct_words = 0
    featured_words = 0
    featured_sct_words = 0

    trs = TranslationRequestSet({"completed": True})

    count = trs.count()

    for tr in trs:
        # Load the text once per request instead of once per attribute read.
        chunk = text.Ref(tr.ref).text()
        t = chunk.text
        version = None if chunk.is_merged else chunk.version()
        is_sct = bool(version) and version.versionTitle == "Sefaria Community Translation"
        n = JaggedTextArray(t).word_count()
        words += n
        sct_words += n if is_sct else 0
        if getattr(tr, "featured", False):
            featured += 1
            featured_words += n
            featured_sct_words += n if is_sct else 0

    report_lines = [
        "%d total translation requests completed.\n" % count,
        "%d total words of translation added.\n" % words,
        "%d total words of translation created.\n" % sct_words,
        "******\n",
        "%d featured translation requests completed.\n" % featured,
        "%d total words of translation added from featured requests.\n" % featured_words,
        "%d total words of translation created from featured requests.\n" % featured_sct_words,
    ]
    return "".join(report_lines)
Example #17
0
 def word_count(self):
     """Return the word count of a JaggedTextArray built from self.as_strings()."""
     strings = self.as_strings()
     return JaggedTextArray(strings).word_count()
# -*- coding: utf-8 -*-

import argparse
import re
from sefaria.model import *
from sefaria.datatype.jagged_array import JaggedTextArray, JaggedArray
from sefaria.system.exceptions import BookNameError


# Normalize the content of every leaf node of every version, saving only the
# versions in which normalize() reported a change.
all_versions = VersionSet()
#all_library_nodes = library.get_content_nodes(with_commentary=True)
for version in all_versions:
    print("{}: {}".format(version.title.encode('utf-8'), version.versionTitle.encode('utf-8')))
    version_altered = False
    try:
        idx = version.get_index()
        content_nodes = idx.nodes.get_leaf_nodes()
        for node in content_nodes:
            print(node)
            ja_text = JaggedTextArray(version.content_node(node))
            normalized = ja_text.normalize(terminal_depth=node.depth)
            if not normalized:
                # only set things that were changed.
                continue
            version.sub_content(key_list=node.version_address(), value=ja_text.array())
            version_altered = True
        # only go through save if something actually changed
        if version_altered:
            version.save()
    except BookNameError as e:
        print("no index for {}".format(version.title.encode('utf-8')))


Example #19
0
class Commentary:
    """
    Accumulates the segments of one commentary on one book and produces the
    index and version dicts that describe it.

    Attributes set by __init__:
      commentator / he_commentator -- English / Hebrew commentator titles
      book / he_book               -- English / Hebrew titles of the commented book
      book_id, hilchot_id          -- the row's 'bid' and 'HilchotId' values
      ja                           -- JaggedTextArray holding the collected segments
    """

    def __init__(self, **kwargs):
        """
        Required keyword arguments: 'en_title', 'he_title', 'book',
        'he_book', 'bid' and 'HilchotId' (KeyError when one is missing).
        """
        attribute_sources = (
            ('commentator', 'en_title'),
            ('he_commentator', 'he_title'),
            ('book', 'book'),
            ('he_book', 'he_book'),
            ('book_id', 'bid'),
            ('hilchot_id', 'HilchotId'),
        )
        for attribute, key in attribute_sources:
            setattr(self, attribute, kwargs[key])
        self.ja = JaggedTextArray()

    @classmethod
    def build_from_row(cls, row):
        """
        Create an instance from a row mapping.

        Raises CommentaryError for rows whose 'HilchotId' is 2.
        """
        if row['HilchotId'] == 2:
            raise CommentaryError

        book_title, he_book_title = cls.book_titles_from_row(row)
        return cls(
            en_title=row['en_title'],
            he_title=row['name'],
            book=book_title,
            he_book=he_book_title,
            bid=row['bid'],
            HilchotId=row['HilchotId'],
        )

    @staticmethod
    def book_titles_from_row(row):
        """
        Return (English title, Hebrew title) of the book a row belongs to.

        The Hebrew title is built from the row's 'Hilchot' value; the English
        one is the normal form of the Ref parsed from that Hebrew title.
        """
        he_title = '{}, {}'.format('משנה תורה', row["Hilchot"])
        en_title = sef_obj.Ref(he_title).normal()
        return en_title, he_title

    def is_part_of_commentary(self, row):
        """Tell whether the row's 'bid' and 'HilchotId' both match this commentary."""
        own_key = (self.book_id, self.hilchot_id)
        row_key = (row['bid'], row['HilchotId'])
        return own_key == row_key

    def add_segment(self, segment: str, indices: tuple) -> None:
        """
        Store `segment` under `indices`, at the position given by the current
        sub-array length there (0 when that length is None).
        """
        current_length = self.ja.sub_array_length(indices)
        position = 0 if current_length is None else current_length
        self.ja.set_element(indices + (position, ), segment)

    def add_segments_from_row(self, row):
        """Split the row's 'text' into segments and add each one at the row's indices."""
        segments = self.build_segments(row['text'])
        indices = self.get_indices_for_row(row)
        for piece in segments:
            self.add_segment(piece, indices)

    @staticmethod
    def get_ja(title, he_title) -> dict:
        """
        Return the serialized JaggedArrayNode with the given primary titles and
        the structure Chapter / Halakhah / Comment.
        """
        node = sef_obj.JaggedArrayNode()
        node.add_primary_titles(title, he_title)
        node.add_structure(['Chapter', 'Halakhah', 'Comment'])
        node.validate()
        return node.serialize()

    def generate_index(self) -> dict:
        """Return the index dict for "<commentator> on <book>"."""
        title = '{} on {}'.format(self.commentator, self.book)
        he_title = '{} על {}'.format(self.he_commentator, self.he_book)

        index = {'title': title}
        index['categories'] = self.get_category()
        index['dependence'] = 'Commentary'
        index['base_text_titles'] = [self.book]
        index['schema'] = self.get_ja(title, he_title)
        index['collective_title'] = self.commentator
        index['base_text_mapping'] = 'many_to_one'
        return index

    def build_version(self) -> dict:
        """Return the Hebrew version dict holding the collected text."""
        return dict(
            versionTitle='Friedberg Edition',
            versionSource='https://fjms.genizah.org',
            language='he',
            text=self.ja.array(),
        )

    @staticmethod
    def build_segments(segment: str) -> list:
        """
        Convert the span-based markup of `segment` into a list of segment
        strings.

        Footnote marker/note span pairs (classes R and N) become
        <sup>/<i class="footnote"> pairs, spans of class S and Z become
        <small> and quote tags, other matched spans become <b>.  The result is
        split wherever a quote immediately follows a line break, and every
        occurrence of 'quote' in the pieces is then replaced by 'b'.
        """
        soup = BeautifulSoup('<root>{}</root>'.format(segment), 'xml')
        root = soup.root

        # clear out multiple classes - we're only interested in the last letter in the class
        for span in root.find_all('span'):
            classes = span.get('class', '')
            if isinstance(classes, list) and classes:
                span['class'] = classes[-1]

        # consolidate duplicate tags and unwrap meaningless tags
        for span in root.find_all('span'):
            previous = span.previous_sibling
            if not previous:
                continue

            # prefix the text of every span with a space, we'll remove duplicates later
            if span.string:
                padded = NavigableString(' {}'.format(span.string))
                span.string.replace_with(padded)

            if span.get('class', '') == '':
                span.unwrap()
            elif span.name == previous.name and span.get('class') == previous.get('class'):
                # same kind of tag as its predecessor: fold this one into it
                previous.append(span)
                span.unwrap()

        # handle footnotes, one marker/note pair per pass
        while True:
            marker = root.find('span', attrs={'class': 'R'})
            note_tag = root.find('span', attrs={'class': 'N'})
            if not (marker and note_tag):
                break
            marker.name = 'sup'
            del marker['class']
            # drop the marker text that is repeated at the start of the note
            note_text = re.sub(r'^{}\s'.format(re.escape(marker.text)), '', note_tag.text)
            footnote = soup.new_tag('i')
            footnote['class'] = 'footnote'
            footnote.string = note_text
            marker.insert_after(footnote)
            note_tag.decompose()

        for tag in root.find_all('span', class_=re.compile('[BZS]')):
            css_class = tag['class']
            if css_class == 'S':
                tag.name = 'small'
            elif css_class == 'Z':
                tag.name = 'quote'
            else:
                tag.name = 'b'
            del tag['class']

        # whitespace clean-up, applied in this order
        cleanups = (
            (r'^\s+|\s+$', ''),
            (r'\s{2,}', ' '),
            (r'\s*<br/>\s*', '<br/>'),
            (r'\s*(<br/>)+$', ''),
        )
        segment_text = root.decode_contents()
        for pattern, replacement in cleanups:
            segment_text = re.sub(pattern, replacement, segment_text)

        # break on quotes which immediately follow a break
        pieces = re.split(r'<br/>(?=<quote>)', segment_text)
        return [re.sub(r'quote', 'b', piece) for piece in pieces]

    @staticmethod
    def get_indices_for_row(row) -> tuple:
        """
        Return the row's ('PerekId', 'HalachaId') pair with positive values
        lowered by one; other values are passed through unchanged.
        """
        def lower_positive(value: int):
            if value > 0:
                return value - 1
            return value

        return lower_positive(row['PerekId']), lower_positive(row['HalachaId'])

    def get_term_data(self) -> tuple:
        """Return (English commentator title, Hebrew commentator title)."""
        return self.commentator, self.he_commentator

    def get_category(self) -> tuple:
        """
        Return the category path of this commentary; its last element is the
        last category of the commented book's index.
        """
        rambam_index = sef_obj.library.get_index(self.book)
        last_category = rambam_index.categories[-1]
        return ('Halakhah', 'Mishneh Torah', 'Commentary', self.commentator, last_category)
Example #20
0
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the dict that describes a link as seen from its anchor ref.

    :param link: Link object
    :param with_text: If truthy, also load the linked text and store it under "text" and "he"
    :param ref: Ref object, or normalized ref string, of the source of the link.  Only consulted when `pos` is None.
    :param pos: Position (0 or 1) of the anchor ref in link.refs.  If not passed, it is derived from `link` and `ref`:
                0 when link.refs[0] starts with the normalized `ref`, otherwise 1.
    :return: Dict
    """
    if pos is None:
        # Without this, the documented default crashed on link.refs[None].
        anchor_prefix = ref.normal() if hasattr(ref, "normal") else ref
        pos = 0 if link.refs[0].startswith(anchor_prefix) else 1

    com = {}

    # The text we're asked to get links to
    anchorTref = link.refs[pos]
    anchorRef  = Ref(anchorTref)
    anchorTrefExpanded = getattr(link, "expandedRefs{}".format(pos))

    # The link we found to anchorRef
    linkPos   = (pos + 1) % 2
    linkTref  = link.refs[linkPos]
    linkRef   = Ref(linkTref)
    langs     = getattr(link, "availableLangs", [[],[]])
    linkLangs = langs[linkPos]

    com["_id"]               = str(link._id)
    com['index_title']       = linkRef.index.title
    com["category"]          = linkRef.primary_category #usually the index's categories[0] or "Commentary".
    com["type"]              = link.type
    com["ref"]               = linkTref
    com["anchorRef"]         = anchorTref
    com["anchorRefExpanded"] = anchorTrefExpanded
    com["sourceRef"]         = linkTref
    com["sourceHeRef"]       = linkRef.he_normal()
    com["anchorVerse"]       = anchorRef.sections[-1] if len(anchorRef.sections) else 0
    com["sourceHasEn"]       = "en" in linkLangs
    # com["anchorText"]        = getattr(link, "anchorText", "") # not currently used

    # Optional link attributes are only copied when present and truthy
    inline_reference = getattr(link, "inline_reference", None)
    if inline_reference:
        com["inline_reference"] = inline_reference
    highlighted_words = getattr(link, "highlightedWords", None)
    if highlighted_words:
        com["highlightedWords"] = highlighted_words

    compDate = getattr(linkRef.index, "compDate", None)
    if compDate:
        # NOTE(review): int(compDate) sits outside the try below, so a non-numeric compDate raises ValueError to the caller
        com["compDate"] = int(compDate)
        try:
            com["errorMargin"] = int(getattr(linkRef.index, "errorMargin", 0))
        except ValueError:
            com["errorMargin"] = 0

    # Pad out the sections list, so that comparison between comment numbers are apples-to-apples
    lsections = linkRef.sections[:] + [0] * (linkRef.index_node.depth - len(linkRef.sections))
    # Build a decimal comment number based on the last two digits of the section array
    com["commentaryNum"] = lsections[-1] if len(lsections) == 1 \
            else float('{0}.{1:04d}'.format(*lsections[-2:])) if len(lsections) > 1 else 0

    if with_text:
        text             = TextFamily(linkRef, context=0, commentary=False)
        com["text"]      = text.text if isinstance(text.text, basestring) else JaggedTextArray(text.text).flatten_to_array()
        com["he"]        = text.he if isinstance(text.he, basestring) else JaggedTextArray(text.he).flatten_to_array()

    # if the the link is commentary, strip redundant info (e.g. "Rashi on Genesis 4:2" -> "Rashi")
    # this is now simpler, and there is explicit data on the index record for it.
    if com["type"] == "commentary":
        com["collectiveTitle"] = {
            'en': getattr(linkRef.index, 'collective_title', linkRef.index.title),
            'he': hebrew_term(getattr(linkRef.index, 'collective_title', linkRef.index.get_title("he")))
        }
    else:
        com["collectiveTitle"] = {'en': linkRef.index.title, 'he': linkRef.index.get_title("he")}

    # A non-commentary link whose target is categorized as Commentary is shown as a quotation
    if com["type"] != "commentary" and com["category"] == "Commentary":
        com["category"] = "Quoting Commentary"

    if com["category"] == "Modern Works" and getattr(linkRef.index, "dependence", None) == "Commentary":
        com["category"] = "Modern Commentary"
        com["collectiveTitle"] = {
            'en': getattr(linkRef.index, 'collective_title', linkRef.index.title),
            'he': hebrew_term(getattr(linkRef.index, 'collective_title', linkRef.index.get_title("he")))
        }

    he_title = linkRef.index_node.primary_title("he")
    if he_title:
        com["heTitle"] = he_title

    return com
Example #21
0
def get_links(tref, with_text=True):
    """
    Return a list of links tied to 'tref' in client format.
    If with_text, retrieve texts for each link.

    Links whose refs cannot be formatted (InputError / AttributeError) and
    links whose text cannot be found (NoVersionFoundError) are skipped.

    :param tref: textual reference to collect links for
    :param with_text: when truthy, add "text" and "he" content to each link dict
    :return: list of dicts as produced by format_link_object_for_client
    """
    links = []
    oref = Ref(tref)
    nRef = oref.normal()
    lenRef = len(nRef)
    reRef = oref.regex() if oref.is_range() else None

    # for storing all the section level texts that need to be looked up
    texts = {}

    linkset = LinkSet(oref)
    # For all links that mention ref (in any position)
    for link in linkset:
        # each link contains 2 refs in a list
        # find the position (0 or 1) of "anchor", the one we're getting links for
        if reRef:
            pos = 0 if re.match(reRef, link.refs[0]) else 1
        else:
            pos = 0 if nRef == link.refs[0][:lenRef] else 1
        try:
            com = format_link_object_for_client(link, False, nRef, pos)
        except InputError:
            # Bad link: skipped without logging
            continue
        except AttributeError as e:
            logger.error(
                u"AttributeError in presenting link: {} - {} : {}".format(
                    link.refs[0], link.refs[1], e))
            continue

        if not with_text:
            links.append(com)
            continue

        # Rather than getting text with each link, walk through all links here,
        # caching text so that redundant DB calls can be minimized
        # If link is spanning, split into section refs and rejoin
        # Reset per link so the warning below never reports a ref left over from an earlier link
        top_nref = None
        try:
            original_com_oref = Ref(com["ref"])
            com_orefs = original_com_oref.split_spanning_ref()
            for com_oref in com_orefs:
                top_oref = com_oref.top_section_ref()

                # Lookup and save top level text, only if we haven't already
                top_nref = top_oref.normal()
                if top_nref not in texts:
                    texts[top_nref] = TextFamily(top_oref,
                                                 context=0,
                                                 commentary=False,
                                                 pad=False).contents()
                    for t in ["text", "he"]:
                        texts[top_nref][t] = JaggedTextArray(
                            texts[top_nref][t])
                # Drop the leading section; the cached text is addressed relative to it
                sections, toSections = com_oref.sections[
                    1:], com_oref.toSections[1:]
                for t in ["text", "he"]:
                    res = texts[top_nref][t].subarray(
                        [i - 1 for i in sections],
                        [i - 1 for i in toSections]).array()
                    if t not in com:
                        com[t] = res
                    else:
                        # A later piece of a spanning ref: accumulate into a list
                        if isinstance(com[t], basestring):
                            com[t] = [com[t]]
                        com[t] += res
        except NoVersionFoundError:
            logger.warning(
                "Trying to get non existent text for ref '{}'. Link refs were: {}"
                .format(top_nref, link.refs))
            continue
        links.append(com)
    return links
Example #22
0
def format_link_object_for_client(link, with_text, ref, pos=None):
    """
    Build the dict that describes a link as seen from its anchor ref.

    :param link: Link object
    :param with_text: If truthy, also load the linked text and store it under "text" and "he"
    :param ref: Ref object, or normalized ref string, of the source of the link.  Only consulted when `pos` is None.
    :param pos: Optional position (0 or 1) of the Ref in the Link.  If not passed, it is derived from `link` and `ref`:
                0 when link.refs[0] starts with the normalized `ref`, otherwise 1.
    :return: Dict
    """
    if pos is None:
        # Without this, the documented default crashed on link.refs[None].
        anchor_prefix = ref.normal() if hasattr(ref, "normal") else ref
        pos = 0 if link.refs[0].startswith(anchor_prefix) else 1

    com = {}

    # The text we're asked to get links to
    anchorRef = Ref(link.refs[pos])

    # The link we found to anchorRef
    linkRef = Ref(link.refs[(pos + 1) % 2])

    com["_id"] = str(link._id)
    com['index_title'] = linkRef.index.title
    com["category"] = linkRef.type
    com["type"] = link.type
    com["ref"] = linkRef.tref
    com["anchorRef"] = anchorRef.normal()
    com["sourceRef"] = linkRef.normal()
    com["sourceHeRef"] = linkRef.he_normal()
    com["anchorVerse"] = anchorRef.sections[-1] if len(
        anchorRef.sections) else 0
    com["anchorText"] = getattr(link, "anchorText", "")

    # Pad out the sections list, so that comparison between comment numbers are apples-to-apples
    lsections = linkRef.sections[:] + [0] * (linkRef.index_node.depth -
                                             len(linkRef.sections))
    # Build a decimal comment number based on the last two digits of the section array
    com["commentaryNum"] = lsections[-1] if len(lsections) == 1 \
            else float('{0}.{1:04d}'.format(*lsections[-2:])) if len(lsections) > 1 else 0

    # Categories listed in REORDER_RULES are replaced by the first entry of their rule
    if com["category"] in REORDER_RULES:
        com["category"] = REORDER_RULES[com["category"]][0]

    if with_text:
        text = TextFamily(linkRef, context=0, commentary=False)
        com["text"] = text.text if isinstance(
            text.text, basestring) else JaggedTextArray(
                text.text).flatten_to_array()
        com["he"] = text.he if isinstance(text.he,
                                          basestring) else JaggedTextArray(
                                              text.he).flatten_to_array()

    # if the the link is commentary, strip redundant info (e.g. "Rashi on Genesis 4:2" -> "Rashi")
    if com["type"] == "commentary":
        com["commentator"] = linkRef.book.split(" on ")[0]
        com["heCommentator"] = linkRef.he_book().split(u" על ")[0]
    else:
        if com["category"] == "Commentary":
            com["category"] = "Quoting Commentary"
        com["commentator"] = linkRef.index.title
        # Fall back to the English title when the index has no Hebrew title
        com["heCommentator"] = linkRef.index.get_title("he") or com["commentator"]

    if link.type == "targum":
        com["category"] = "Targum"

    he_title = linkRef.index_node.primary_title("he")
    if he_title:
        com["heTitle"] = he_title

    return com
Example #23
0
# -*- coding: utf-8 -*-

import argparse
import re
from sefaria.model import *
from sefaria.datatype.jagged_array import JaggedTextArray, JaggedArray
from sefaria.system.exceptions import BookNameError

# Walk every version, normalize the text stored for each leaf node, and save
# only the versions in which normalize() reported a change.
all_versions = VersionSet()
#all_library_nodes = library.get_content_nodes(with_commentary=True)
for version in all_versions:
    # Single-argument print(...) parses and behaves the same on Python 2 and 3
    print("{}: {}".format(version.title.encode('utf-8'),
                          version.versionTitle.encode('utf-8')))
    version_altered = False
    try:
        idx = version.get_index()
        content_nodes = idx.nodes.get_leaf_nodes()
        for node in content_nodes:
            print(node)
            ja_text = JaggedTextArray(version.content_node(node))
            normalized = ja_text.normalize(terminal_depth=node.depth)
            if normalized:  #only set things that were changed.
                version.sub_content(key_list=node.version_address(),
                                    value=ja_text.array())
                version_altered = True
        if version_altered:  #only go through save if something actually changed
            version.save()
    except BookNameError:
        # The version points at a title that has no index; report it and move on
        print("no index for {}".format(version.title.encode('utf-8')))
    "language" : "he",
    "versionSource" : "https://he.wikisource.org/wiki/תלמוד_בבלי"
}).save()

# Split the Hebrew text currently stored under 'Rashi on Bava Batra' in two:
# the first range stays where it is, the second range is written to
# 'Rashbam on Bava Batra'.
# NOTE(review): rashbam_bava_batra_he, pad_moved_ja and the `copy` module are
# not defined in this fragment - confirm they exist earlier in the script.
whole_ref = Ref('Rashi on Bava Batra')
whole_moved_ref = Ref('Rashbam on Bava Batra')
# The two ranges meet at 29a.9.1 / 29a.9.2
stay_section_ref = Ref('Rashi on Bava Batra.2a.1.1-29a.9.1')
move_section_ref = Ref('Rashi on Bava Batra.29a.9.2-176b.4.2')



# Both chunks are opened with the version title of rashbam_bava_batra_he
orig_tc = TextChunk(whole_ref, 'he', rashbam_bava_batra_he.versionTitle)
dest_tc = TextChunk(whole_moved_ref, 'he', rashbam_bava_batra_he.versionTitle)

# Get the two slices of the whole text, corresponding to the new texts
jatext_tostay = JaggedTextArray(orig_tc.text).subarray_with_ref(stay_section_ref).array()

# Only the moved slice is deep-copied before being modified
jatext_tomove = copy.deepcopy(JaggedTextArray(orig_tc.text).subarray_with_ref(move_section_ref).array())
# Both slices are passed through pad_moved_ja together with the starting sections of their range;
# presumably this pads them so that their overall structure matches the original structure - verify against pad_moved_ja
jatext_tostay = pad_moved_ja(jatext_tostay, stay_section_ref.sections)
jatext_tomove = pad_moved_ja(jatext_tomove, move_section_ref.sections)

# Overwrite the original text with the part that stays
orig_tc.text = jatext_tostay
orig_tc.save()

# Write the moved part to the destination text
dest_tc.text = jatext_tomove
dest_tc.save()


r_gershom_index = Index({
    "title":'Rabbeinu Gershom',
Example #25
0
 def do_copy(self):
     """
     Post the loaded index, its versions and its links to the server.

     Steps, in order:
       1. load_objects() is called.
       2. If self._post_index is set, terms and categories are handled and the
          raw index contents are posted.
       3. For every version, each leaf node that has text is posted; the post
          for the final leaf node asks for count_after=True.
       4. If no node of a version had text, the last leaf node is posted once
          with dummy text and once more with empty text, so that a version
          object is created.  When the index has no leaf nodes at all this
          step is skipped (it used to fail on an unbound loop variable).
       5. The version's optional attributes that are present are posted.
       6. If self._post_links is set, links without a 'source_text_oid' are
          posted in batches of self._post_links_step.
     """
     self.load_objects()
     if self._post_index:
         idx_contents = self._index_obj.contents(raw=True)
         idx_title = self._index_obj.title
         self.post_terms_from_schema()
         self._handle_categories()
         self._make_post_request_to_server(
             self._prepare_index_api_call(idx_title), idx_contents)
     content_nodes = self._index_obj.nodes.get_leaf_nodes()
     for ver in self._version_objs:
         found_non_empty_content = False
         print(ver.versionTitle.encode('utf-8'))
         # Optional version attributes that are actually present on this version
         flags = {
             flag: getattr(ver, flag, None)
             for flag in ver.optional_attrs if hasattr(ver, flag)
         }
         for node_num, node in enumerate(content_nodes, 1):
             node_title = node.full_title(force_update=True)
             print(node_title)
             text = JaggedTextArray(ver.content_node(node)).array()
             if len(text) == 0:
                 # only bother posting nodes that have content.
                 continue
             found_non_empty_content = True
             version_payload = {
                 "versionTitle": ver.versionTitle,
                 "versionSource": ver.versionSource,
                 "language": ver.language,
                 "text": text
             }
             if node_num == len(content_nodes):
                 # the final leaf node is posted with count_after=True
                 api_call = self._prepare_text_api_call(node_title,
                                                        count_after=True)
             else:
                 api_call = self._prepare_text_api_call(node_title)
             self._make_post_request_to_server(api_call, version_payload)
         if not found_non_empty_content and content_nodes:
             # post the last node again with dummy text, to make sure an actual version db object is created
             # then post again to clear the dummy text
             last_node = content_nodes[-1]
             dummy_text = "This is a dummy text"
             empty = ""
             # wrap both placeholders in one list per level of node depth
             for _ in range(last_node.depth):
                 dummy_text = [dummy_text]
                 empty = [empty]
             version_payload = {
                 "versionTitle": ver.versionTitle,
                 "versionSource": ver.versionSource,
                 "language": ver.language,
                 "text": dummy_text
             }
             self._make_post_request_to_server(
                 self._prepare_text_api_call(last_node.full_title()),
                 version_payload)
             version_payload['text'] = empty
             self._make_post_request_to_server(
                 self._prepare_text_api_call(last_node.full_title()),
                 version_payload)
         if flags:
             self._make_post_request_to_server(
                 self._prepare_version_attrs_api_call(
                     ver.title, ver.language, ver.versionTitle), flags)
     if self._post_links and len(self._linkset) > 0:
         # a step that is non-positive or larger than the linkset means "everything in one batch"
         if self._post_links_step <= 0 or self._post_links_step > len(
                 self._linkset):
             self._post_links_step = len(self._linkset)
         links = [
             l.contents() for l in self._linkset
             if not getattr(l, 'source_text_oid', None)
         ]
         for i in range(0, len(links), self._post_links_step):
             self._make_post_request_to_server(
                 self._prepare_links_api_call(),
                 links[i:i + self._post_links_step])
Example #26
0
# -*- coding: utf-8 -*-

import argparse
import re
from sefaria.model import *
from sefaria.datatype.jagged_array import JaggedTextArray, JaggedArray
from sefaria.system.exceptions import BookNameError


# Walk every version, normalize the text stored for each leaf node, and save
# only the versions in which normalize() reported a change.
all_versions = VersionSet()
#all_library_nodes = library.get_content_nodes(with_commentary=True)
for version in all_versions:
    # Format the titles as text: encoding them first makes Python 3 print b'...' reprs
    print("{}: {}".format(version.title, version.versionTitle))
    version_altered = False
    try:
        idx = version.get_index()
        content_nodes = idx.nodes.get_leaf_nodes()
        for node in content_nodes:
            print(node)
            ja_text = JaggedTextArray(version.content_node(node))
            normalized = ja_text.normalize(terminal_depth=node.depth)
            if normalized: #only set things that were changed.
                version.sub_content(key_list=node.version_address(), value=ja_text.array())
                version_altered = True
        if version_altered: #only go through save if something actually changed
            version.save()
    except BookNameError:
        # The version points at a title that has no index; report it and move on
        print("no index for {}".format(version.title))