Exemple #1
0
    def get_refs(self, node):
        """Collect (base_ref, mei_ref) pairs for every leaf node under `node`."""
        assert isinstance(node, SchemaNode)
        leaves = [node] if node.is_leaf() else node.get_leaf_nodes()

        for leaf in leaves:
            assert isinstance(leaf, JaggedArrayNode)

            if leaf.sharedTitle is not None:
                # A shared title maps the whole leaf to one term-defined ref.
                term = Term().load({'name': leaf.sharedTitle})
                self.base_refs.append(Ref(term.ref))
                self.mei_refs.append(leaf.ref())
                continue

            for subref in leaf.ref().all_subrefs():
                assert isinstance(subref, Ref)
                if not subref.is_section_level():
                    # Don't bother trying to match depth 1 texts
                    break
                if subref.is_empty():
                    continue
                book = leaf.primary_title('en')
                chapter = subref.sections[0]
                self.base_refs.append(Ref("{} {}".format(book, chapter)))
                self.mei_refs.append(subref)
Exemple #2
0
 def _normalize(self):
     """Populate derived fields (datetime, context refs, categories, ...) from self.ref.

     Falls back to safe defaults when the ref points at a missing sheet or
     fails to parse at all.
     """
     # Derived values - used to make downstream queries quicker
     self.datetime = datetime.utcfromtimestamp(self.time_stamp)
     try:
         r = Ref(self.ref)
         # FIX: use a distinct comprehension variable. The original iterated
         # `r` over r.all_context_refs(); in Python 2 the list-comp variable
         # leaks into the enclosing scope and would clobber the parsed Ref
         # that the attribute reads below depend on.
         self.context_refs   = [cref.normal() for cref in r.all_context_refs()]
         self.categories     = r.index.categories
         self.authors        = getattr(r.index, "authors", [])
         self.is_sheet       = r.index.title == "Sheet"
         if self.is_sheet:
             self.sheet_id = r.sections[0]
         if not self.secondary and not self.is_sheet and getattr(self, "language", None) != "hebrew" and r.is_empty("en"):
             # logically, this would be on frontend, but easier here.
             self.language = "hebrew"
     except SheetNotFoundError:
         self.context_refs   = [self.ref]
         self.categories     = ["_unlisted"]
         self.authors        = []
         self.is_sheet       = True
     except InputError:   # Ref failed to resolve
         self.context_refs   = [self.ref]
         self.categories     = []
         self.authors        = []
         self.is_sheet       = False
     except KeyError:     # is_text_translated() stumbled on a bad version state
         pass
 def test_commentary(self):
     """Commentary refs embedded in English prose are extracted in order."""
     cases = [
         ("Here's one with Rashi on Genesis 2:5:3",
          ["Rashi on Genesis 2:5:3"]),
         ("Here's one with both Rashi on Genesis 3:4 and Exodus 5:2. yeah",
          ["Rashi on Genesis 3:4", "Exodus 5:2"]),
         ("Here's one with Genesis 2:3",
          ["Genesis 2:3"]),
     ]
     for text, expected in cases:
         assert library.get_refs_in_string(text, "en") == [Ref(t) for t in expected]
    def test_two_single_quotes(self):
        """Gershayim written as two single quotes should still resolve to one ref."""
        for source, expected in [
            (u"עין ממש דכתיב (במדבר ל''ה) ולא תקחו", u"במדבר ל''ה"),
            (u"דאמר קרא (שופטים כ י''א) ויאסף כל איש", u"שופטים כ י''א"),
        ]:
            found = library.get_refs_in_string(source)
            assert len(found) == 1
            assert found[0] == Ref(expected)
	def set_recent_item(self, tref):
		"""
		Save `tref` as a recently viewed text at the front of the list. Removes any previous location for that text.
		Not used yet, need to consider if it's better to store derivable information (ref->heRef) or reprocess it often.

		FIX: added the missing `self` parameter — the body reads self.recent
		and calls self.save(), so the original signature could never work as
		a method.
		"""
		oref = Ref(tref)
		# FIX: use a distinct loop variable; the original comprehension
		# shadowed the `tref` parameter (and in Python 2 would leak over it).
		recent = [prev for prev in self.recent if Ref(prev).index.title != oref.index.title]
		self.recent = [tref] + recent
		self.save()
Exemple #6
0
 def test_sefer_mitzvot(self):
     """All four Hebrew refs in the neg327 fixture are found (order-insensitive)."""
     found = library.get_refs_in_string(texts['neg327'])
     expected = {
         Ref(u'ויקרא טז,כט'),
         Ref(u'ויקרא כג,כח'),
         Ref(u'ויקרא כג,לא'),
         Ref(u'במדבר כט,ז'),
     }
     assert len(found) == 4
     assert set(found) == expected
Exemple #7
0
 def set_version_by_category(self, book_name):
     """Choose a default version for `book_name` from its canonical category.

     Raises AttributeError when the book is neither Tanach nor Bavli.
     """
     book_ref = Ref(book_name)
     if book_ref.is_tanach():
         version = 'Tanach with Text Only'
     elif book_ref.is_bavli():
         version = 'Wikisource Talmud Bavli'
     else:
         raise AttributeError(
             '{} does not match a default category, use set_version_by_book instead'
             .format(book_name))
     self.version_map[book_name] = version
Exemple #8
0
def process_index_title_change_in_notes(indx, **kwargs):
    """Rewrite the ref on every Note matching the old index title.

    Notes whose rewritten ref fails to save are deleted (best-effort cascade).
    FIX: the Python 2 `print` statement was a syntax error on Python 3; the
    single-argument call form below runs on both.
    """
    print("Cascading Notes {} to {}".format(kwargs['old'], kwargs['new']))
    # Ref(...).regex() is built from the *current* title; substitute the old
    # title so the pattern matches refs stored before the rename.
    pattern = Ref(indx.title).regex()
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    notes = NoteSet({"ref": {"$regex": pattern}})
    for n in notes:
        try:
            n.ref = n.ref.replace(kwargs["old"], kwargs["new"], 1)
            n.save()
        except Exception:
            logger.warning("Deleting note that failed to save: {}".format(n.ref))
            n.delete()
Exemple #9
0
def process_index_title_change_in_notes(indx, **kwargs):
    """Cascade an index rename into Note refs; delete notes that fail to save.

    FIX: converted the Python 2 `print` statement to the call form, which is
    valid on both Python 2 and 3 for a single argument.
    """
    print("Cascading Notes {} to {}".format(kwargs['old'], kwargs['new']))
    # Swap the old title into the regex so pre-rename refs are matched.
    pattern = Ref(indx.title).regex()
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    notes = NoteSet({"ref": {"$regex": pattern}})
    for n in notes:
        try:
            n.ref = n.ref.replace(kwargs["old"], kwargs["new"], 1)
            n.save()
        except Exception:
            logger.warning("Deleting note that failed to save: {}".format(
                n.ref))
            n.delete()
Exemple #10
0
def check_chapters():
    """Classify cards by whether their @00 chapter headers match the Ref's chapter count."""
    good_files, bad_files = [], []
    for card in get_cards():
        m_ref = Ref(card.replace('Rambam ', ''))
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tags = TagTester(u'@00', infile, u'@00\u05e4\u05e8\u05e7').grab_each_header()
        # Pirkei Avot is grandfathered in regardless of header count.
        is_good = (len(tags) == len(m_ref.all_subrefs())
                   or card == 'Rambam Pirkei Avot')
        (good_files if is_good else bad_files).append(card)
    return {'good': good_files, 'bad': bad_files}
Exemple #11
0
    def test_add_expanded_refs(self, topic_graph):
        """Saving a RefTopicLink expands its ref into segment-level refs."""
        def make_link(tref):
            # All cases share the same topic/link/source fields.
            return RefTopicLink({
                'ref': tref,
                'toTopic': '6',
                'linkType': 'about',
                'dataSource': 'sefaria',
            })

        link = make_link('Genesis 1:1')
        link.save()
        assert getattr(link, 'class') == 'refTopic'
        assert link.expandedRefs == ['Genesis 1:1']
        link.delete()

        link = make_link('Genesis 1:1-3')
        link.save()
        assert link.expandedRefs == ['Genesis 1:1', 'Genesis 1:2', 'Genesis 1:3']
        link.delete()

        link = make_link('Genesis 1-2')
        link.save()
        expected = [r.normal() for r in Ref('Genesis 1-2').all_segment_refs()]
        assert link.expandedRefs == expected
        link.delete()
Exemple #12
0
    def test_inner_parenthesis(self):
        """A ref written inside parentheses is still found and normalized."""
        found = library.get_refs_in_string(u"Bereishit Rabbah (55:7)", "en")
        assert len(found) == 1
        assert found[0] == Ref(u'Bereshit Rabbah 55:7')

        ''' Ranges not yet supported
Exemple #13
0
def check_chapters():
    """Split cards into good/bad by comparing @00 header counts to chapter counts."""
    results = {'good': [], 'bad': []}
    for card in get_cards():
        m_ref = Ref(card.replace('Rambam ', ''))
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tester = TagTester(u'@00', infile, u'@00\u05e4\u05e8\u05e7')
            tags = tester.grab_each_header()
        counts_match = len(tags) == len(m_ref.all_subrefs())
        # Pirkei Avot is accepted unconditionally.
        key = 'good' if (counts_match or card == 'Rambam Pirkei Avot') else 'bad'
        results[key].append(card)
    return results
def process_index_title_change_in_manuscript_links(indx, **kwargs):
    """Cascade an index rename into ManuscriptPage contained/expanded refs.

    FIX: the original rebuilt `'|'.join(patterns)` and re-scanned it from
    scratch for every single ref in every page; the alternation is now
    joined and compiled once up front.
    """
    from sefaria.system.exceptions import InputError

    print("Cascading ManuscriptPage from {} to {}".format(
        kwargs['old'], kwargs['new']))

    # ensure that the regex library we're using here is the same regex library being used in `Ref.regex`
    from .text import re as reg_reg
    patterns = [
        pattern.replace(reg_reg.escape(indx.title),
                        reg_reg.escape(kwargs["old"]))
        for pattern in Ref(indx.title).regex(as_list=True)
    ]
    combined = reg_reg.compile('|'.join(patterns))

    def swap_title(refs):
        # Replace the first occurrence of the old title in each matching ref.
        return [
            r.replace(kwargs["old"], kwargs["new"], 1) if combined.search(r) else r
            for r in refs
        ]

    queries = [{'expanded_refs': {'$regex': pattern}} for pattern in patterns]
    objs = ManuscriptPageSet({"$or": queries})
    for o in objs:
        o.contained_refs = swap_title(o.contained_refs)
        o.expanded_refs = swap_title(o.expanded_refs)
        try:
            o.save()
        except InputError:
            logger.warning("Failed to convert ref data from: {} to {}".format(
                kwargs['old'], kwargs['new']))
Exemple #15
0
    def _normalize(self):
        """Populate derived fields from self.ref, with safe defaults on parse failure."""
        # Derived values - used to make downstream queries quicker
        self.datetime = datetime.utcfromtimestamp(self.time_stamp)
        try:
            r = Ref(self.ref)
            # FIX: distinct comprehension variable — the original reused `r`,
            # which leaks in Python 2 and shadows the parsed Ref either way.
            self.context_refs = [cref.normal() for cref in r.all_context_refs()]
            self.categories = r.index.categories
            self.authors = getattr(r.index, "authors", [])
            self.is_sheet = r.index.title == "Sheet"
            if self.is_sheet:
                self.sheet_id = r.sections[0]

        except InputError:  # Ref failed to resolve
            self.context_refs = [self.ref]
            self.categories = []
            self.authors = []
            self.is_sheet = False
Exemple #16
0
def topic_graph_to_merge():
    """Fixture generator: build a topic graph (plus one sheet) and tear it all down."""
    isa_links = [(10, 20), (20, 30), (20, 40), (40, 50), (60, 50)]
    gen_refs = [r.normal() for r in Ref('Genesis 1:1-10').range_list()]
    exo_refs = [r.normal() for r in Ref('Exodus 1:1-10').range_list()]
    lev_refs = [r.normal() for r in Ref('Leviticus 1:1-10').range_list()]

    topics = {str(i): make_topic(str(i)) for i in range(10, 100, 10)}
    links = [make_it_link(str(a), str(b), 'is-a') for a, b in isa_links]
    links += [make_rt_link('10', r) for r in gen_refs]
    links += [make_rt_link('20', r) for r in exo_refs]
    links += [make_rt_link('40', r) for r in lev_refs]
    graph = {'topics': topics, 'links': links}

    db.sheets.insert_one({
        "id": 1234567890,
        "topics": [
            {"slug": '20', 'asTyped': 'twenty'},
            {"slug": '40', 'asTyped': '4d'},
            {"slug": '20', 'asTyped': 'twent-e'},
            {"slug": '30', 'asTyped': 'thirty'},
        ],
    })

    yield graph

    # Teardown: remove everything the fixture created.
    for topic in graph['topics'].values():
        topic.delete()
    for link in graph['links']:
        link.delete()
    db.sheets.delete_one({"id": 1234567890})
Exemple #17
0
    def save_history_item(cls, uid, hist, time_stamp=None):
        """Normalize a history dict and persist it as a UserHistory record."""
        if time_stamp is None:
            time_stamp = epoch_time()
        hist["uid"] = uid
        if "he_ref" not in hist or "book" not in hist:
            oref = Ref(hist["ref"])
            hist["he_ref"] = oref.he_normal()
            hist["book"] = oref.index.title
        # DEBUG: helpful to include this field for debugging
        if "server_time_stamp" not in hist:
            hist["server_time_stamp"] = time_stamp

        action = hist.pop("action", None)
        if action == "add_saved":
            saved = True
        elif action == "delete_saved":
            saved = False
        else:
            saved = hist.get("saved", False)
        uh = UserHistory(
            hist,
            load_existing=(action is not None),
            update_last_place=(action is None),
            field_updates={
                "saved": saved,
                "server_time_stamp": hist["server_time_stamp"],
            })
        uh.save()
        return uh
    def save_history_item(cls, uid, hist, time_stamp=None):
        """Fill in derived fields on `hist`, then store it via UserHistory."""
        if time_stamp is None:
            time_stamp = epoch_time()
        hist["uid"] = uid
        if "he_ref" not in hist or "book" not in hist:
            oref = Ref(hist["ref"])
            hist["he_ref"] = oref.he_normal()
            hist["book"] = oref.index.title
        # DEBUG: helpful to include this field for debugging
        hist.setdefault("server_time_stamp", time_stamp)

        action = hist.pop("action", None)
        saved = {"add_saved": True, "delete_saved": False}.get(
            action, hist.get("saved", False))
        uh = UserHistory(hist,
                         load_existing=(action is not None),
                         update_last_place=(action is None),
                         field_updates={
                             "saved": saved,
                             "server_time_stamp": hist["server_time_stamp"],
                         })
        uh.save()
        return uh
Exemple #19
0
 def filter_invalid_sources(self):
     """ Remove any sources that don't validate """
     kept = []
     for source in self.sources:
         try:
             kept.append((Ref(source[0]).normal(), source[1]))
         except Exception:
             # FIX: narrowed from a bare `except:` so KeyboardInterrupt and
             # SystemExit are no longer swallowed; any other failure still
             # just drops the source, preserving the best-effort intent.
             pass
     self.sources = kept
Exemple #20
0
 def notes(self, tref=None):
     """
     Returns notes for this layer,
     optionally filtered by notes on ref.
     """
     query = {"_id": {"$in": self.note_ids}}
     if tref:
         # Restrict to notes whose ref falls under tref's section.
         query["ref"] = {"$regex": Ref(tref).section_ref().regex()}
     return list(NoteSet(query=query))
Exemple #21
0
    def load_set_for_client(cls, tref: str):
        """
        This method returns an array of results that can be converted to JSON instead of Sefaria MongoSet instances.
        This method uses a mongo aggregation to JOIN the manuscript with the manuscript page.
        :param tref:
        :return:
        """
        try:
            oref = Ref(tref)
        except InputError:
            return []

        segment_refs = [seg.normal() for seg in oref.all_segment_refs()]
        results = []
        manuscript_cache = {}  # slug -> Manuscript, so each is loaded only once

        for document in cls.load_by_ref(oref):
            anchor_refs, anchor_refs_expanded = oref.get_all_anchor_refs(
                segment_refs, document.contained_refs, document.expanded_refs)

            for anchor_ref, expanded in zip(anchor_refs, anchor_refs_expanded):
                contents = document.contents()
                contents["anchorRef"] = anchor_ref.normal()
                contents["anchorRefExpanded"] = [r.normal() for r in expanded]
                # Internal bookkeeping fields are not part of the client payload.
                del contents['contained_refs']
                del contents['expanded_refs']

                slug = document.manuscript_slug
                if slug in manuscript_cache:
                    manuscript = manuscript_cache[slug]
                else:
                    manuscript = Manuscript().load({'slug': slug})
                    manuscript_cache[manuscript.slug] = manuscript
                contents['manuscript'] = manuscript.contents()

                results.append(contents)
        return results
Exemple #22
0
 def add_ref(self, tref):
     """Parse `tref`, reject it if it overlaps an existing ref, then append it."""
     try:
         new_oref = Ref(tref)
     except InputError as e:
         raise ManuscriptError(e)
     for existing in self.get_ref_objects():
         if existing.overlaps(new_oref):
             raise ManuscriptError(
                 f'Overlap between contained refs {existing} and {new_oref}')
     self.contained_refs.append(tref)
     self.expanded_refs.extend(self.get_expanded_refs_for_source(new_oref))
 def contents(self, **kwargs):
     """Serialize this record; optionally trim to API fields and/or annotate with text."""
     from sefaria.sheets import get_sheet_listing_data
     d = super(UserHistory, self).contents(**kwargs)
     if kwargs.get("for_api", False):
         # Whitelist of API-visible fields with their defaults.
         defaults = (
             ('ref', ''),
             ('he_ref', ''),
             ('book', ''),
             ('versions', {}),
             ('time_stamp', 0),
             ('saved', False),
             ('delete_saved', False),
             ('is_sheet', False),
             ('sheet_id', -1),
             ('sheet_owner', ''),
             ('sheet_title', ''),
         )
         d = {key: d.get(key, default) for key, default in defaults}
     if kwargs.get("annotate", False):
         try:
             ref = Ref(d["ref"])
             if ref.is_sheet():
                 d.update(get_sheet_listing_data(d["sheet_id"]))
             else:
                 d["text"] = {
                     "en": TextChunk(ref, "en").as_sized_string(),
                     "he": TextChunk(ref, "he").as_sized_string(),
                 }
         except Exception:
             # Annotation is best-effort; return the bare record on failure.
             logger.warning(
                 "Failed to retrieve text for history Ref: {}".format(
                     d['ref']))
             return d
     return d
Exemple #24
0
def process_index_title_change_in_user_history(indx, **kwargs):
    """Rewrite stored UserHistory refs after `indx` is renamed from old to new title.

    FIX: the Python 2 `print` statement was invalid syntax on Python 3; the
    single-argument call form below runs on both.
    """
    print("Cascading User History from {} to {}".format(kwargs['old'], kwargs['new']))

    # ensure that the regex library we're using here is the same regex library being used in `Ref.regex`
    from text import re as reg_reg
    # Ref(...).regex() matches the *new* title; substitute the old one so the
    # patterns hit refs stored before the rename.
    patterns = [pattern.replace(reg_reg.escape(indx.title), reg_reg.escape(kwargs["old"]))
                for pattern in Ref(indx.title).regex(as_list=True)]
    queries = [{'ref': {'$regex': pattern}} for pattern in patterns]
    objs = UserHistorySet({"$or": queries})
    for o in objs:
        o.ref = o.ref.replace(kwargs["old"], kwargs["new"], 1)
        try:
            o.save()
        except InputError:
            logger.warning(u"Failed to convert user history from: {} to {}".format(kwargs['old'], kwargs['new']))
Exemple #25
0
    def test_link_set(self, topic_graph):
        """link_set filters by link class; _class=None returns every link."""
        topics = topic_graph['topics']

        intra = topics['1'].link_set(_class='intraTopic')
        assert list(intra)[0].topic == '2'
        assert intra.count() == 1

        intra4 = topics['4'].link_set(_class='intraTopic')
        assert {link.topic for link in intra4} == {'2', '5'}

        expected_trefs = {r.normal() for r in Ref('Genesis 1:1-10').range_list()}

        ref_links = topics['1'].link_set(_class='refTopic')
        assert {link.ref for link in ref_links} == expected_trefs

        everything = topics['1'].link_set(_class=None)
        assert {getattr(link, 'ref', getattr(link, 'topic', None))
                for link in everything} == (expected_trefs | {'2'})
Exemple #26
0
def test(book):
    """Compare the parsed Ibn Ezra text against the Torat Emet version, logging misalignments.

    FIXES: Python 2 `print` statement converted to the call form (valid on
    both 2 and 3); output file now opened with `with` so it is closed even
    on error; `thier`/`their` typo and stray double indentation corrected.
    """
    levi = parse(file_data[book])
    vtitle = 'Devarim' if book == 'Deuteronomy' else book
    torat_emet = Ref("Ibn Ezra on {}".format(book)).text(
        'he', 'Ibn Ezra on {} -- Torat Emet'.format(vtitle)).ja().array()
    count = 0
    with open('Ibn Ezra on {} misalignments.txt'.format(book), 'w') as qa_issues:
        for c_index, (my_chapter, their_chapter) in enumerate(zip(levi, torat_emet)):
            for v_index, (my_verse, their_verse) in enumerate(zip(my_chapter, their_chapter)):
                if len(my_verse) != len(their_verse):
                    qa_issues.write('issue found at {}:{}\n'.format(c_index + 1, v_index + 1))
                    count += 1
            if len(my_chapter) != len(their_chapter):
                # Also report the trailing verses present in only one version.
                by_length = sorted((my_chapter, their_chapter), key=len)
                for i in range(len(by_length[0]), len(by_length[1])):
                    qa_issues.write('issue found at {}:{}\n'.format(c_index + 1, i + 1))
                    count += 1
    print('{} issues found'.format(count))
    ja_to_xml(levi, ['Chapter', 'Verse', 'Comment'])
Exemple #27
0
    def _validate(self):
        """Validate this page: its manuscript exists, every contained ref parses,
        and no two contained refs overlap. Raises ManuscriptError on any failure."""
        super(ManuscriptPage, self)._validate()

        # check that the manuscript this page is part of exists in the database
        if self.get_manuscript() is None:
            raise ManuscriptError("Manuscript missing in database")

        for tref in self.contained_refs:
            if not Ref.is_ref(tref):
                raise ManuscriptError(f'{tref} is not a valid Ref')

        # Pairwise overlap check: pop each ref and compare it against the rest.
        # FIX: removed the dead trailing `if not len(test_refs): break` — the
        # while condition already terminates the loop when the list empties.
        test_refs = self.get_ref_objects()
        while test_refs:
            current_ref = test_refs.pop()
            for tr in test_refs:
                if current_ref.overlaps(tr):
                    raise ManuscriptError(
                        f'Overlap between contained refs {tr} and {current_ref}'
                    )
Exemple #28
0
 def xformer(recent):
     """Convert a legacy `recent` tuple into a UserHistory dict, or None if malformed.

     FIXES: the timestamp in recent[2] was parsed twice (once per field) —
     it is now computed once; the four identical `except ...: return None`
     clauses are collapsed into a single tuple clause.
     """
     try:
         # recent[2] is a parseable timestamp string, or None when unrecorded.
         if recent[2] is not None:
             ts = epoch_time(parser.parse(recent[2]).replace(tzinfo=None))
         else:
             ts = default_epoch_time
         return {
             "uid": uid,
             "ref": recent[0],
             "he_ref": recent[1],
             "book": Ref(recent[0]).index.title,
             "last_place": True,
             "time_stamp": ts,
             "server_time_stamp": ts,
             "num_times_read": (recent[3] if recent[3] and isinstance(recent[3], int) else 1),  # we dont really know how long they've read this book. it's probably correlated with the number of times they opened the book
             "versions": {
                 "en": recent[4],
                 "he": recent[5]
             }
         }
     except (InputError, ValueError, IndexError, AttributeError):
         # Any malformed legacy record is simply skipped.
         return None
Exemple #29
0
def parse_and_upload():
    """Build and post an index + Hebrew version for each Rambam card, then post all links.

    FIX: the Python 2 `print` statement was a syntax error on Python 3; the
    single-argument call form below runs on both.
    """
    cards = get_cards()
    links = []
    for card in cards:
        node = JaggedArrayNode()
        node.add_title(card, 'en', primary=True)
        node.add_title(u'רמב"ם ' +
                       Ref(card.replace('Rambam ', '')).he_normal(),
                       'he',
                       primary=True)
        node.key = card
        node.depth = 3
        node.addressTypes = ['Integer', 'Integer', 'Integer']
        node.sectionNames = ['Chapter', 'Mishnah', 'Comment']
        node.validate()
        node.toc_zoom = 2

        index = {
            'title': card,
            'categories': ['Commentary2', 'Mishnah', 'Rambam'],
            'schema': node.serialize(),
        }

        parsed = parser(card)
        links.extend(parsed['links'])
        version = {
            'versionTitle': u'Vilna Edition',
            'versionSource':
            'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001300957',
            'language': 'he',
            'text': parsed['parsed']
        }
        print('posting {}'.format(card))
        post_index(index)
        post_text(card, version, index_count='on')
    post_link(links)
Exemple #30
0
def topic_graph():
    """Fixture generator: small is-a topic graph with ref links on topic '1'; cleans up afterwards."""
    isa_links = [(1, 2), (2, 3), (2, 4), (4, 5), (6, 5)]
    trefs = [r.normal() for r in Ref('Genesis 1:1-10').range_list()]
    # Clear any leftover links from previous runs before creating new ones.
    for a, b in isa_links:
        clean_links(str(a))
        clean_links(str(b))
    topics = {str(i): make_topic(str(i)) for i in range(1, 10)}
    links = [make_it_link(str(a), str(b), 'is-a') for a, b in isa_links]
    links += [make_rt_link('1', r) for r in trefs]
    graph = {'topics': topics, 'links': links}

    yield graph

    # Teardown
    for topic in graph['topics'].values():
        topic.delete()
    for link in graph['links']:
        link.delete()
 def test_double_ref(self):
     """Two Hebrew refs in one string are both extracted."""
     found = library.get_refs_in_string(texts['he_2ref'])
     assert len(found) == 2
     assert set(found) == {Ref(u'הושע ט ג'), Ref(u'דברי הימים ב לב יט')}
Exemple #32
0
 def _normalize(self):
     """Canonicalize the ref and sanitize the note text before saving."""
     self.ref = Ref(self.ref).normal()
     sanitized = bleach.clean(self.text,
                              tags=self.allowed_tags,
                              attributes=self.allowed_attrs)
     self.text = sanitized
 def test_double_quote_talmud(self):
     """A Talmud daf written with a double-quote gershayim is found."""
     found = library.get_refs_in_string(texts['dq_talmud'])
     assert len(found) == 1
     assert found[0] == Ref(u'יבמות ס"ה')