def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
    """
    Load the word form matching `input_word` and return headword queries for it.

    When a `lookup_ref` keyword argument is supplied, the search is first
    limited to word forms whose `refs` start with the normalized ref; if that
    finds nothing, the search is repeated without the ref restriction.

    :param input_word: word to look up; cantillation is stripped from Hebrew input
    :param lookup_key: WordForm field that is matched against the word
    :param kwargs: may carry `lookup_ref`
    :return: list of ``{'headword': ...}`` dicts, one per entry in the matched
             form's `lookups`; an empty list when no form is found
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref

    ref_filter = kwargs.get("lookup_ref", None)

    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        # Switching the key to 'c_form' for input without vowels is disabled in this variant.

    criteria = {lookup_key: input_word}
    if ref_filter:
        normalized = Ref(ref_filter).normal()
        criteria["refs"] = {'$regex': '^{}'.format(normalized)}

    matched = WordForm().load(criteria)
    if not matched and ref_filter:
        # Nothing under that ref: retry across all refs.
        del criteria["refs"]
        matched = WordForm().load(criteria)

    if not matched:
        return []

    # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry,
    # we need to change the key 'lexicon' to 'parent_lexicon' in word forms
    return [{'headword': entry['headword']} for entry in matched.lookups]
def get_from_word_2_ref(word, words_2_ref_nums, ref_num_min_N_title, ref_num_2_part, ref_num_2_full_name):
    """
    Build the ref strings for every ref number indexed under `word`.

    This is a debugging helper: it prints its intermediate results and, for
    each ref string produced, whether `word` occurs in that ref's English text.

    :param word: word to look up
    :param words_2_ref_nums: dict mapping a word to a list of ref numbers
    :param ref_num_min_N_title: iterable of (ref_num_min, title) pairs. The scan
        stops at the first pair whose minimum exceeds the ref number, so the
        pairs are expected in ascending order of ref_num_min.
    :param ref_num_2_part: lookup from ref number to the text appended to the title
    :param ref_num_2_full_name: lookup from ref number to the full ref name;
        not consulted at present (see the TODO below)
    :return: list of ref strings, in the order of the ref numbers; ref numbers
        for which no title applies are skipped
    """
    ref_nums = words_2_ref_nums.get(word, [])
    print(('ref_nums', ref_nums))
    ref_strs = []
    for ref_num in ref_nums:
        # Reset per ref number: otherwise a ref number below the first minimum
        # would reuse the previous ref's title (or hit an unbound name).
        title = None
        for ref_num_min, temp_title in ref_num_min_N_title:
            if ref_num_min > ref_num:
                break
            title = temp_title
        if title is None:
            print(('no title for ref_num', ref_num))
            continue
        full_ref_str = '{}{}'.format(title, ref_num_2_part[ref_num])
        print(('full_ref_str', ref_num, full_ref_str))
        ref_strs.append(full_ref_str)
        # TODO: test weird part refs and make more complete and test that word always shows up in texts
        # (the comparison against ref_num_2_full_name[ref_num] was never reached and has been dropped)

    for r in ref_strs:
        full_text = ' '.join(Ref(r).text('en').text)
        print(('text from ref search', r, word in full_text))
    return ref_strs
def get_from_db(word):
    """
    Look `word` up in the packed word index and return the Refs whose text contains it.

    Steps, as performed below:
      1. Read the metadata row to obtain the chunk size and the packet size.
      2. For each index row matching `word`, decode its blob into packets.
         Each packet is 8 hex digits: the first byte is the packet index, the
         remaining bytes are a bitstring in which a set bit marks a chunk.
      3. For every marked chunk, load the part names in that chunk's rowid
         range, resolve the book title for each, build the Ref and keep it if
         the matched row's `_id` occurs in the Ref's English text.

    :param word: value matched (with SQL LIKE) against `_id` in the word index table
    :return: list of Ref objects whose English text contains the matched `_id`
    """
    ref_results = []
    conn = get_connection()

    # GET THE METADATA STUFF
    sql = 'SELECT * from {} WHERE _id like ?'.format(WORDS_2_REF_NUMS)
    _id, blob = conn.cursor().execute(sql, (METADATA_CHUNKS_PACKETSIZE,)).fetchall()[0]
    hex_str = make_little_endian(blob)
    chunk_size = int(hex_str[0:8], 16)
    PACKET_SIZE = int(hex_str[8:16], 16)  # 3 * 8 # 3 bytes of bits * 8bits per byte
    print(_id, blob, hex_str, len(hex_str), chunk_size, PACKET_SIZE)  # 200, 24 ... this looks correct

    sql = 'SELECT * from {} where _id like ?'.format(WORDS_2_REF_NUMS)
    cur = conn.cursor()
    cur.execute(sql, (word,))
    for word_id, blob in cur.fetchall():
        chunk_start_nums = []
        # Python 3 has no "hex" text codec, so str(blob).encode("hex") raises
        # LookupError; bytes.hex() yields the hex digits of the raw blob.
        hex_str = bytes(blob).hex()
        print(hex_str)
        for index in range(0, len(hex_str), 8):  # each hex is half a byte and 4 bytes to a JH_packet
            packet = hex_str[index:index + 8]  # get just the packet
            packet_index = int(packet[:2], 16)  # first byte is the packet_index
            # take last 3 bytes as the bitstring of 0 vs. 1 if contains `_id` keyword
            packet_bits = bin(int(packet[2:], 16))[2:].zfill(PACKET_SIZE)
            # for each of the bytes reverse the bits inside of it (make little endian)
            packet_bits = ''.join([packet_bits[i - 8:i][::-1] for i in range(8, len(packet_bits) + 1, 8)])
            for bit_index, bit in enumerate(packet_bits):
                if bit == '1':
                    chunk_start_num = (packet_index * PACKET_SIZE + bit_index) * chunk_size
                    chunk_start_nums.append(chunk_start_num)
                    print(index, packet, packet_index, packet_bits, chunk_start_num)

        title_id = -1
        for chunk_start_num in chunk_start_nums:
            # get all part names from chunk_start to chunk_start + size of the chunk
            sql = 'SELECT _rowid_, value from {} where _rowid_ BETWEEN ? AND ?'.format(REF_NUM_2_PART)
            cur.execute(sql, (chunk_start_num + 1, chunk_start_num + chunk_size + 1))
            for ref_row_id, part in cur.fetchall():
                ref_num = ref_row_id - 1  # rowids are offset by one from ref numbers
                if ref_num > title_id:
                    # get book title: the title row with the largest _id not above ref_num
                    sql = 'SELECT * from {} where _id <= ? order by `_id` desc limit 1'.format(REF_NUM_MIN_N_TITLE)
                    cur.execute(sql, (ref_num,))
                    rows = cur.fetchall()
                    title_id, title = rows[0]
                ref_str = '{}, {}'.format(title, part)
                r = Ref(ref_str)
                text = str(r.text('en').text)
                if word_id in text:
                    # the word_id should appear as is, in at least one of the texts within the chunk
                    print('found word in', r)
                    ref_results.append(r)
    return ref_results
def __init__(self, refRangeString, start, end, name):
    """
    Pair a ref range with a named date range, so that a date/ref combination
    can be checked against the criteria for following a schedule.

    :param refRangeString: string parsed into the Ref stored on `self.ref`
    :param start: datetime, start of the date range
    :param end: datetime, end of the date range
    :param name: name handed to the DateRange
    """
    # The date window is built first, then the ref it applies to.
    self.dateRange = DateRange(name, start, end)
    self.ref = Ref(refRangeString)
def test_post_to_default_node(self):
    """
    Post a two-section English version to Principle 1 of Chofetz Chaim and
    check that segment 2.3 of it reads back as "Ber Flam".
    """
    section_url = "/api/texts/Chofetz_Chaim,_Part_One,_The_Prohibition_Against_Lashon_Hara,_Principle_1"
    payload = {
        "text": [["BFoo", "PBar", "Dub Blitz"],["GGGlam", "BBBlam", "Ber Flam"]],
        "versionTitle": "test_default_node",
        "versionSource": "www.sefaria.org",
        "language": "en",
    }

    reply = c.post(section_url, {'json': json.dumps(payload)})
    self.assertEqual(200, reply.status_code)
    body = json.loads(reply.content)
    self.assertTrue("error" not in body)

    # Read the last segment of the second section back from the stored version.
    segment_ref = Ref("Chofetz_Chaim,_Part_One,_The_Prohibition_Against_Lashon_Hara,_Principle_1.2.3")
    stored = TextChunk(segment_ref, "en", "test_default_node")
    assert stored.text == "Ber Flam"
def test_url_regex(self):
    """
    Check the pattern `v.url_regex` produces for chapter, range, daf,
    whole-book and commentary refs. Skipped entirely unless USE_VARNISH is set.
    """
    if not USE_VARNISH:
        return

    # (ref string, expected pattern) pairs, checked in order.
    cases = (
        ("Exodus 15", r'Exodus(\\.15$|\\.15\\.)'),
        ("Exodus 15:15-17", r'Exodus(\\.15\\.15$|\\.15\\.15\\.|\\.15\\.16$|\\.15\\.16\\.|\\.15\\.17$|\\.15\\.17\\.)'),
        ("Yoma 14a", r'Yoma(\\.14a$|\\.14a\\.)'),
        ("Yoma 14a:12-15", r'Yoma(\\.14a\\.12$|\\.14a\\.12\\.|\\.14a\\.13$|\\.14a\\.13\\.|\\.14a\\.14$|\\.14a\\.14\\.|\\.14a\\.15$|\\.14a\\.15\\.)'),
        ("Yoma", r'Yoma($|\\.)'),
        ("Rashi on Genesis 1.1", r'Rashi\\_on\\_Genesis(\\.1\\.1$|\\.1\\.1\\.)'),
    )
    for ref_text, expected in cases:
        assert v.url_regex(Ref(ref_text)) == expected
def refIsInRange(self, ref, timestamp):
    """
    Decide whether `ref` at `timestamp` satisfies this date/ref range (bucket).

    :param ref: value parsed into a Ref
    :param timestamp: moment compared against the start and end of the date range
    :return: tuple ``(criteria_met, keep_checking)`` - whether the ref criteria
             is met, and whether to continue checking refs against this bucket
    """
    candidate = Ref(ref)
    window = self.dateRange

    if window.start > timestamp:
        # timestamp is earlier than the start of the date range
        return False, True
    if window.end < timestamp:
        # timestamp is later than the end of the date range
        return False, False

    # does this need to be more precise?
    if candidate.span_size() >= 1 and candidate.overlaps(self.ref):
        return True, True

    # in date range, not in ref range
    return False, False
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    """
    Return the set of word forms matching `input_word`.

    Hebrew input has its cantillation stripped; if it then carries neither
    cantillation nor vowels, the match is made on the 'c_form' field instead
    of `lookup_key`. With a `lookup_ref` keyword argument the search is first
    limited to forms whose `refs` start with the normalized ref, and repeated
    without that limit when it comes back empty.

    :param input_word: word to look up
    :param lookup_key: WordForm field matched against the word
    :param kwargs: may carry `lookup_ref`
    :return: WordFormSet of the matching forms
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref

    ref_filter = kwargs.get("lookup_ref", None)

    field = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        unvowelized = not has_cantillation(input_word, detect_vowels=True)
        if unvowelized:
            field = 'c_form'

    criteria = {field: input_word}
    if ref_filter:
        normalized = Ref(ref_filter).normal()
        criteria["refs"] = {'$regex': '^{}'.format(normalized)}

    matches = WordFormSet(criteria)
    if ref_filter and len(matches) == 0:
        # Nothing under that ref: search again across all refs.
        del criteria["refs"]
        matches = WordFormSet(criteria)
    return matches
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    """
    Return the set of word forms matching `input_word`.

    With a `lookup_ref` keyword argument the search is first limited to forms
    whose `refs` start with the normalized ref, and repeated without that
    limit when it comes back empty.

    :param input_word: word to look up
    :param lookup_key: WordForm field matched against the word
    :param kwargs: may carry `lookup_ref`
    :return: WordFormSet of the matching forms
    """
    from sefaria.model import Ref

    ref_filter = kwargs.get("lookup_ref", None)

    field = lookup_key
    if is_hebrew(input_word):
        # This step technically used to happen in the lookup main method `lexicon_lookup` if there were
        # no initial results, but in case where a consonantal form was supplied in the first place,
        # this optimizes queries.
        input_word = strip_cantillation(input_word)
        unvowelized = not has_cantillation(input_word, detect_vowels=True)
        if unvowelized:
            field = 'c_form'

    criteria = {field: input_word}
    if ref_filter:
        normalized = Ref(ref_filter).normal()
        criteria["refs"] = {'$regex': '^{}'.format(normalized)}

    matches = WordFormSet(criteria)
    if ref_filter and len(matches) == 0:
        # Nothing under that ref: search again across all refs.
        del criteria["refs"]
        matches = WordFormSet(criteria)
    return matches
def test_post_new_text(self):
    """
    End-to-end check of creating, filling and deleting a new text.

    Covers:
        posting an index and finding it in index/titles
        posting and reading back English text
        posting and reading back Hebrew text
        an in-text citation being picked up as a link
        a changed citation replacing the old link with a new one
        the counts docs for both he and en
        index deletion and its cascade
    """
    def post_ok(url, payload):
        # POST the payload as a 'json' form field and require a 200 reply.
        reply = c.post(url, {'json': json.dumps(payload)})
        self.assertEqual(200, reply.status_code)
        return reply

    def get_ok(url):
        # GET the url, require a 200 reply and return the decoded body.
        reply = c.get(url)
        self.assertEqual(200, reply.status_code)
        return json.loads(reply.content)

    # Post a new Index
    index = {
        "title": "Sefer Test",
        "titleVariants": ["The Book of Test"],
        "sectionNames": ["Chapter", "Paragraph"],
        "categories": ["Musar"],
    }
    created = json.loads(post_ok("/api/index/Sefer_Test", index).content)
    self.assertIn("titleVariants", created)
    self.assertIn(u'Sefer Test', created["titleVariants"])

    titles = json.loads(c.get("/api/index/titles").content)
    self.assertIn(u'Sefer Test', titles["books"])

    # test the toc is updated
    json.loads(c.get("/api/index").content)
    tutils.verify_title_existence_in_toc(index['title'], index['categories'])

    # Post Text (with English citation)
    english = {
        "text": "As it is written in Job 3:14, waste places.",
        "versionTitle": "The Test Edition",
        "versionSource": "www.sefaria.org",
        "language": "en",
    }
    posted = json.loads(post_ok("/api/texts/Sefer_Test.99.99", english).content)
    self.assertTrue("error" not in posted)

    # Verify one link was auto extracted
    fetched = get_ok('/api/texts/Sefer_Test.99.99')
    self.assertEqual(1, len(fetched["commentary"]))

    # Verify Count doc was updated
    counts = get_ok('/api/counts/Sefer_Test')
    self.assertNotIn("error", counts)
    self.assertEqual([1, 1], counts["_en"]["availableCounts"])
    self.assertEqual(1, counts["_en"]["availableTexts"][98][98])
    self.assertEqual(0, counts["_en"]["availableTexts"][98][55])

    # Update link in the text
    english = {
        "text": "As it is written in Job 4:10, The lions may roar and growl.",
        "versionTitle": "The Test Edition",
        "versionSource": "www.sefaria.org",
        "language": "en",
    }
    posted = json.loads(post_ok("/api/texts/Sefer_Test.99.99", english).content)
    self.assertTrue("error" not in posted)

    # Verify one link was auto extracted
    fetched = get_ok('/api/texts/Sefer_Test.99.99')
    self.assertEqual(1, len(fetched["commentary"]))
    self.assertEqual(fetched["commentary"][0]["ref"], 'Job 4:10')

    # Post Text (with Hebrew citation)
    hebrew = {
        "text": 'כדכתיב: "לא תעשה לך פסל כל תמונה" כו (דברים ה ח)',
        "versionTitle": "The Hebrew Test Edition",
        "versionSource": "www.sefaria.org",
        "language": "he",
    }
    post_ok("/api/texts/Sefer_Test.88.88", hebrew)

    # Verify one link was auto extracted
    fetched = get_ok('/api/texts/Sefer_Test.88.88')
    self.assertEqual(1, len(fetched["commentary"]))

    # Verify count doc was updated
    counts = get_ok('/api/counts/Sefer_Test')
    self.assertEqual([1, 1], counts["_he"]["availableCounts"])
    self.assertEqual(1, counts["_he"]["availableTexts"][87][87])
    self.assertEqual(0, counts["_en"]["availableTexts"][87][87])

    # Delete Test Index (the regex is taken first, while the index still exists)
    textRegex = Ref('Sefer Test').regex()
    by_title = {"title": u'Sefer Test'}
    IndexSet({"title": u'Sefer Test'}).delete()

    # Make sure that index was deleted, and that delete cascaded to: versions, counts, links, cache,
    # todo: notes?, reviews?
    self.assertEqual(0, IndexSet(by_title).count())
    self.assertEqual(0, VersionSet(by_title).count())
    self.assertEqual(0, VersionStateSet(by_title).count())
    # todo: better way to do this?
    self.assertEqual(0, LinkSet({"refs": {"$regex": textRegex}}).count())