コード例 #1
0
    def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
        """
        Build headword queries for the word form that matches ``input_word``.

        :param input_word: word to look up; Hebrew input has its cantillation stripped first
        :param lookup_key: WordForm field the word is matched against (default ``'form'``)
        :param kwargs: optional ``lookup_ref`` -- a ref string; when given, the first search is
            restricted to word forms whose ``refs`` match the regex ``^<normalized ref>``
        :return: list of ``{'headword': ...}`` dicts, one per entry in the matched form's
            ``lookups``; an empty list when no word form is found
        """
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref")
        if is_hebrew(input_word):
            # NOTE: switching the key to 'c_form' for unvowelized input is disabled in this method.
            input_word = strip_cantillation(input_word)
        query_obj = {lookup_key: input_word}
        if lookup_ref:
            nref = Ref(lookup_ref).normal()
            query_obj["refs"] = {'$regex': '^{}'.format(nref)}
        form = WordForm().load(query_obj)
        if not form and lookup_ref:
            # Nothing matched under the ref restriction: retry on the word alone.
            del query_obj["refs"]
            form = WordForm().load(query_obj)
        if not form:
            return []
        # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry,
        # we need to change the key 'lexicon' to 'parent_lexicon' in word forms
        return [{'headword': lookup['headword']} for lookup in form.lookups]
コード例 #2
0
def get_from_word_2_ref(word, words_2_ref_nums, ref_num_min_N_title,
                        ref_num_2_part, ref_num_2_full_name):
    """
    Rebuild ref strings for ``word`` from the lookup tables and print, for each ref,
    whether the word occurs in that ref's English text.

    :param word: key looked up in ``words_2_ref_nums``
    :param words_2_ref_nums: mapping of word -> iterable of ref numbers
    :param ref_num_min_N_title: iterable of ``(ref_num_min, title)`` pairs; the scan stops at
        the first pair whose ``ref_num_min`` exceeds the ref number, so the pairs are
        presumably sorted ascending by ``ref_num_min`` -- TODO confirm with the caller
    :param ref_num_2_part: indexable by ref number; gives the text appended to the title
    :param ref_num_2_full_name: accepted for interface compatibility; not consulted yet
        (see the TODO in the body)
    :return: list of the ref strings that could be built
    """
    ref_nums = words_2_ref_nums.get(word, [])
    print(('ref_nums', ref_nums))
    ref_strs = []
    for ref_num in ref_nums:
        # Reset per ref number so a miss can never reuse the previous iteration's title.
        title = None
        for ref_num_min, temp_title in ref_num_min_N_title:
            if ref_num_min > ref_num:
                break
            title = temp_title
        if title is None:
            # No title range starts at or below this ref number: no ref can be built.
            print(('no title for ref_num', ref_num))
            continue
        full_ref_str = '{}{}'.format(title, ref_num_2_part[ref_num])
        print(('full_ref_str', ref_num, full_ref_str))

        # TODO: test weird part refs and make more complete and test that word always shows
        # up in texts (e.g. by comparing full_ref_str with ref_num_2_full_name[ref_num]).
        ref_strs.append(full_ref_str)

    for r in ref_strs:
        full_text = ' '.join(Ref(r).text('en').text)
        print(('text from ref search', r, word in full_text))

    return ref_strs
コード例 #3
0
def get_from_db(word):
    """
    Find the refs whose English text contains a word stored in the words-to-ref-numbers table.

    The metadata row supplies the chunk size and the packet size. Each matching word row holds
    a blob of 4-byte packets: the first byte is the packet index, the remaining three bytes are
    a bitmask in which a set bit marks a chunk of ref numbers to inspect. Every ref in a marked
    chunk is loaded and kept when the stored word appears in its English text.

    :param word: value matched with SQL ``LIKE`` against the ``_id`` column
    :return: list of ``Ref`` objects whose text contains the matched word
    """
    ref_results = []
    conn = get_connection()

    # Metadata row: the first 8 hex digits are the chunk size, the next 8 the packet size in bits.
    sql = 'SELECT * from {} WHERE _id like ?'.format(WORDS_2_REF_NUMS)
    _id, blob = conn.cursor().execute(sql, (METADATA_CHUNKS_PACKETSIZE,)).fetchall()[0]
    hex_str = make_little_endian(blob)

    chunk_size = int(hex_str[0:8], 16)
    PACKET_SIZE = int(hex_str[8:16], 16)  # 3 * 8 # 3 bytes of bits * 8bits per byte
    print(_id, blob, hex_str, len(hex_str), chunk_size, PACKET_SIZE)  # 200, 24 ... this looks correct

    # Loop-invariant statements, built once.
    parts_sql = 'SELECT _rowid_, value from {} where _rowid_ BETWEEN ? AND ?'.format(REF_NUM_2_PART)
    title_sql = 'SELECT * from {} where _id <= ? order by `_id` desc limit 1'.format(REF_NUM_MIN_N_TITLE)

    sql = 'SELECT * from {} where _id like ?'.format(WORDS_2_REF_NUMS)
    cur = conn.cursor()
    cur.execute(sql, (word,))

    for word_id, blob in cur.fetchall():
        chunk_start_nums = []
        # bytes.hex() replaces the Python 2-only str.encode("hex"), which raises LookupError
        # on Python 3. Assumes the column yields a bytes-like BLOB -- TODO confirm.
        hex_str = bytes(blob).hex()
        print(hex_str)
        for index in range(0, len(hex_str), 8):  # each hex is half a byte and 4 bytes to a JH_packet
            packet = hex_str[index:index + 8]  # get just the packet
            packet_index = int(packet[:2], 16)  # first byte is the packet_index
            # take last 3 bytes as the bitstring of 0 vs. 1 if contains `_id` keyword
            packet_bits = bin(int(packet[2:], 16))[2:].zfill(PACKET_SIZE)
            # for each of the bytes reverse the bits inside of it (make little endian)
            packet_bits = ''.join([packet_bits[i - 8:i][::-1] for i in range(8, len(packet_bits) + 1, 8)])
            for bit_index, bit in enumerate(packet_bits):
                if bit == '1':
                    chunk_start_num = (packet_index * PACKET_SIZE + bit_index) * chunk_size
                    chunk_start_nums.append(chunk_start_num)
                    print(index, packet, packet_index, packet_bits, chunk_start_num)

        title_id = -1
        for chunk_start_num in chunk_start_nums:
            # get all part names from chunk_start to chunk_start + size of the chunk
            # NOTE(review): BETWEEN is inclusive, so this range spans chunk_size + 1 rows --
            # confirm whether the extra row is intended.
            cur.execute(parts_sql, (chunk_start_num + 1, chunk_start_num + chunk_size + 1))

            for ref_row_id, part in cur.fetchall():
                ref_num = ref_row_id - 1  # row ids are offset by one from ref numbers
                if ref_num > title_id:
                    # get book title: the row with the greatest _id not above ref_num
                    cur.execute(title_sql, (ref_num,))
                    rows = cur.fetchall()
                    title_id, title = rows[0]

                ref_str = '{}, {}'.format(title, part)
                r = Ref(ref_str)
                text = str(r.text('en').text)
                if word_id in text:  # the word_id should appear as is, in at least one of the texts within the chunk
                    print('found word in', r)
                    ref_results.append(r)
    return ref_results
コード例 #4
0
ファイル: trend.py プロジェクト: aquiandres/Sefaria-Project
 def __init__(self, refRangeString, start, end, name):
     """
     Hold a ref range together with the date range in which it is acceptable, to help decide
     whether a date/ref combination meets the criteria for following a schedule.

     :param refRangeString: string passed to ``Ref`` to build the ref range stored in ``self.ref``
     :param start: datetime, passed to ``DateRange`` as the range start
     :param end: datetime, passed to ``DateRange`` as the range end
     :param name: name passed to ``DateRange``
     """
     self.dateRange = DateRange(name, start, end)
     self.ref = Ref(refRangeString)
コード例 #5
0
ファイル: tests.py プロジェクト: incarnated01/Sefaria-Project
 def test_post_to_default_node(self):
     """
     Post a two-section English version under the "Principle 1" ref and check that the
     segment at 2.3 can be read back from that version.
     """
     tref = "Chofetz_Chaim,_Part_One,_The_Prohibition_Against_Lashon_Hara,_Principle_1"
     text = {
         "text": [["BFoo", "PBar", "Dub Blitz"], ["GGGlam", "BBBlam", "Ber Flam"]],
         "versionTitle": "test_default_node",
         "versionSource": "www.sefaria.org",
         "language": "en",
     }
     response = c.post("/api/texts/" + tref, {'json': json.dumps(text)})
     self.assertEqual(200, response.status_code)
     data = json.loads(response.content)
     self.assertNotIn("error", data)
     subref = Ref(tref + ".2.3")
     # unittest assertion instead of a bare assert: it still runs under -O and reports a diff.
     self.assertEqual("Ber Flam", TextChunk(subref, "en", "test_default_node").text)
コード例 #6
0
 def test_url_regex(self):
     """Compare ``v.url_regex`` output with the expected pattern for several refs.

     Does nothing unless ``USE_VARNISH`` is truthy.
     """
     if not USE_VARNISH:
         return
     # (ref string, expected regex) pairs, checked in order.
     expectations = (
         ("Exodus 15", r'Exodus(\\.15$|\\.15\\.)'),
         ("Exodus 15:15-17", r'Exodus(\\.15\\.15$|\\.15\\.15\\.|\\.15\\.16$|\\.15\\.16\\.|\\.15\\.17$|\\.15\\.17\\.)'),
         ("Yoma 14a", r'Yoma(\\.14a$|\\.14a\\.)'),
         ("Yoma 14a:12-15", r'Yoma(\\.14a\\.12$|\\.14a\\.12\\.|\\.14a\\.13$|\\.14a\\.13\\.|\\.14a\\.14$|\\.14a\\.14\\.|\\.14a\\.15$|\\.14a\\.15\\.)'),
         ("Yoma", r'Yoma($|\\.)'),
         ("Rashi on Genesis 1.1", r'Rashi\\_on\\_Genesis(\\.1\\.1$|\\.1\\.1\\.)'),
     )
     for tref, expected_regex in expectations:
         assert v.url_regex(Ref(tref)) == expected_regex
コード例 #7
0
ファイル: trend.py プロジェクト: aquiandres/Sefaria-Project
 def refIsInRange(self, ref, timestamp):
     """
     Check a ref/timestamp pair against this bucket's date range and ref range.

     :param ref: value passed to ``Ref``
     :param timestamp: compared against ``self.dateRange.start`` and ``self.dateRange.end``
     :return: tuple ``(criteria_met, keep_checking)``:
         ``(False, True)`` when the timestamp is before the range start,
         ``(False, False)`` when it is after the range end,
         ``(True, True)`` when the ref spans at least one unit and overlaps ``self.ref``,
         ``(False, False)`` when the date fits but the ref does not
     """
     processedRef = Ref(ref)
     bucket_dates = self.dateRange
     if bucket_dates.start > timestamp:
         return False, True
     if bucket_dates.end < timestamp:
         return False, False
     # does this need to be more precise?
     ref_matches = processedRef.span_size() >= 1 and processedRef.overlaps(self.ref)
     if ref_matches:
         return True, True
     return False, False  # in date range, not in ref range
コード例 #8
0
ファイル: lexicon.py プロジェクト: YisraelV/Sefaria-Project
    def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
        """
        Return the WordFormSet matching ``input_word``.

        Hebrew input has its cantillation stripped; when no cantillation or vowels are detected
        afterwards, the ``'c_form'`` field is searched instead of ``lookup_key``. When a
        ``lookup_ref`` keyword is supplied, the search is first limited to forms whose ``refs``
        match ``^<normalized ref>`` and is repeated without that limit if nothing is found.

        :param input_word: word to look up
        :param lookup_key: WordForm field to match (default ``'form'``)
        :param kwargs: optional ``lookup_ref`` ref string
        :return: WordFormSet
        """
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        ref_filter = kwargs.get("lookup_ref", None)
        search_key = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            search_key = lookup_key if has_cantillation(input_word, detect_vowels=True) else 'c_form'

        query = {search_key: input_word}
        if not ref_filter:
            return WordFormSet(query)

        normal_ref = Ref(ref_filter).normal()
        query["refs"] = {'$regex': '^{}'.format(normal_ref)}
        matches = WordFormSet(query)
        if len(matches) == 0:
            # No form under this ref: search again on the word alone.
            query.pop("refs")
            matches = WordFormSet(query)
        return matches
コード例 #9
0
ファイル: lexicon.py プロジェクト: aquiandres/Sefaria-Project
    def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
        """
        Fetch the word forms for ``input_word`` as a WordFormSet.

        :param input_word: word to look up; Hebrew input is stripped of cantillation
        :param lookup_key: WordForm field to match; replaced by ``'c_form'`` for Hebrew input
            in which no cantillation or vowels are detected after stripping
        :param kwargs: optional ``lookup_ref`` -- ref string used to narrow the first query to
            forms whose ``refs`` match ``^<normalized ref>``
        :return: WordFormSet from the narrowed query, or from the un-narrowed query when the
            narrowed one is empty or no ``lookup_ref`` was given
        """
        from sefaria.model import Ref

        scope_ref = kwargs.get("lookup_ref", None)
        field = lookup_key
        if is_hebrew(input_word):
            # This step technically used to happen in the lookup main method `lexicon_lookup` if
            # there were no initial results, but in case where a consonantal form was supplied in
            # the first place, this optimizes queries.
            input_word = strip_cantillation(input_word)
            consonantal_only = not has_cantillation(input_word, detect_vowels=True)
            if consonantal_only:
                field = 'c_form'

        criteria = {field: input_word}
        if scope_ref:
            normalized = Ref(scope_ref).normal()
            criteria["refs"] = {'$regex': '^{}'.format(normalized)}
        found = WordFormSet(criteria)
        if not scope_ref:
            return found
        if len(found) == 0:
            # Nothing within the ref scope: fall back to every form of the word.
            del criteria["refs"]
            found = WordFormSet(criteria)
        return found
コード例 #10
0
ファイル: tests.py プロジェクト: spenhos/Sefaria-Project
    def test_post_new_text(self):
        """
        End-to-end check of creating, filling and deleting a new index.

        Tests:
            post of index & that new index is in index/titles
            post and get of English text
            post and get of Hebrew text
            Verify that in-text ref is caught and made a link
            Verify that changing of in-text ref results in old link removed and new one added
            counts docs of both he and en
            index delete and its cascading
        """
        # Post a new Index
        index = {
            "title": "Sefer Test",
            "titleVariants": ["The Book of Test"],
            "sectionNames": ["Chapter", "Paragraph"],
            "categories": ["Musar"],
        }
        response = c.post("/api/index/Sefer_Test", {'json': json.dumps(index)})
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertIn("titleVariants", data)
        self.assertIn(u'Sefer Test', data["titleVariants"])

        response = c.get("/api/index/titles")
        data = json.loads(response.content)
        self.assertIn(u'Sefer Test', data["books"])

        # test the toc is updated
        # The parsed TOC itself is not inspected; the request and the JSON parse are kept.
        json.loads(c.get("/api/index").content)
        tutils.verify_title_existence_in_toc(index['title'],
                                             index['categories'])

        # Post Text (with English citation)
        text = {
            "text": "As it is written in Job 3:14, waste places.",
            "versionTitle": "The Test Edition",
            "versionSource": "www.sefaria.org",
            "language": "en",
        }
        response = c.post("/api/texts/Sefer_Test.99.99",
                          {'json': json.dumps(text)})
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertNotIn("error", data)
        # Verify one link was auto extracted
        response = c.get('/api/texts/Sefer_Test.99.99')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertEqual(1, len(data["commentary"]))
        # Verify Count doc was updated
        response = c.get('/api/counts/Sefer_Test')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertNotIn("error", data)
        self.assertEqual([1, 1], data["_en"]["availableCounts"])
        self.assertEqual(1, data["_en"]["availableTexts"][98][98])
        self.assertEqual(0, data["_en"]["availableTexts"][98][55])

        # Update link in the text
        text = {
            "text":
            "As it is written in Job 4:10, The lions may roar and growl.",
            "versionTitle": "The Test Edition",
            "versionSource": "www.sefaria.org",
            "language": "en",
        }
        response = c.post("/api/texts/Sefer_Test.99.99",
                          {'json': json.dumps(text)})
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertNotIn("error", data)
        # Verify the old link was replaced by exactly one new link
        response = c.get('/api/texts/Sefer_Test.99.99')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertEqual(1, len(data["commentary"]))
        self.assertEqual(data["commentary"][0]["ref"], 'Job 4:10')

        # Post Text (with Hebrew citation)
        text = {
            "text": 'כדכתיב: "לא תעשה לך פסל כל תמונה" כו (דברים ה ח)',
            "versionTitle": "The Hebrew Test Edition",
            "versionSource": "www.sefaria.org",
            "language": "he",
        }
        response = c.post("/api/texts/Sefer_Test.88.88",
                          {'json': json.dumps(text)})
        self.assertEqual(200, response.status_code)
        # Verify one link was auto extracted
        response = c.get('/api/texts/Sefer_Test.88.88')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertEqual(1, len(data["commentary"]))
        # Verify count doc was updated
        response = c.get('/api/counts/Sefer_Test')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertEqual([1, 1], data["_he"]["availableCounts"])
        self.assertEqual(1, data["_he"]["availableTexts"][87][87])
        self.assertEqual(0, data["_en"]["availableTexts"][87][87])

        # Delete Test Index
        # Capture the ref regex before deleting; it is needed for the link check below.
        textRegex = Ref('Sefer Test').regex()
        IndexSet({"title": u'Sefer Test'}).delete()

        # Make sure that index was deleted, and that delete cascaded to: versions, counts, links, cache
        # todo: notes?, reviews?
        self.assertEqual(0, IndexSet({"title": u'Sefer Test'}).count())
        self.assertEqual(0, VersionSet({"title": u'Sefer Test'}).count())
        self.assertEqual(0, VersionStateSet({"title": u'Sefer Test'}).count())
        # todo: better way to do this?
        self.assertEqual(0, LinkSet({"refs": {"$regex": textRegex}}).count())