def process_version_title_change_in_search(ver, **kwargs):
    """Re-index every segment of a version after its title is renamed.

    Deletes the search documents stored under the old version title, then
    re-indexes each segment ref under the new title in both the current
    plain index and the current merged index.

    :param ver: version record whose title changed; reads ``ver.title``
        and ``ver.language``.
    :param kwargs: expects ``"old"`` (previous version title) and
        ``"new"`` (replacement version title).
    """
    # Imported lazily so loading this module alone does not pull in
    # local settings or the search machinery.
    from sefaria.local_settings import SEARCH_INDEX_ON_SAVE
    if not SEARCH_INDEX_ON_SAVE:
        return
    from sefaria.search import delete_version, TextIndexer, get_new_and_current_index_names

    plain_index_name = get_new_and_current_index_names()['current']
    merged_index_name = get_new_and_current_index_names(merged=True)['current']

    book_index = library.get_index(ver.title)
    # Remove everything indexed under the old version title before re-adding.
    delete_version(book_index, kwargs.get("old"), ver.language)
    for segment_ref in book_index.all_segment_refs():
        TextIndexer.index_ref(plain_index_name, segment_ref, kwargs.get("new"), ver.language, False)
        TextIndexer.index_ref(merged_index_name, segment_ref, None, ver.language, True)
def process_version_title_change_in_search(ver, **kwargs):
    """Re-index a version's segments in search after its title is renamed.

    Removes the documents stored under the old version title, then indexes
    every segment of the book under the new title. Only the "text" index
    is updated — the merged index has no per-version documents, so there
    is nothing to fix there (left unhandled on purpose; revisit if merged
    indexing ever becomes version-aware).

    :param ver: version record whose title changed; reads ``ver.title``
        and ``ver.language``.
    :param kwargs: expects ``"old"`` (previous version title) and
        ``"new"`` (replacement version title).
    """
    # Lazy imports: avoid loading settings/search machinery at module import.
    from sefaria.local_settings import SEARCH_INDEX_ON_SAVE
    if not SEARCH_INDEX_ON_SAVE:
        return
    from sefaria.search import delete_version, TextIndexer, get_new_and_current_index_names

    index_name = get_new_and_current_index_names("text")['current']
    book_index = library.get_index(ver.title)
    delete_version(book_index, kwargs.get("old"), ver.language)
    for segment_ref in book_index.all_segment_refs():
        TextIndexer.index_ref(index_name, segment_ref, kwargs.get("new"), ver.language, False)
def parse_lib_to_json(cls, start, end):
    """Walk the library via TextIndexer and dump offline-search tables to JSON.

    Resets the class-level accumulators, runs ``TextIndexer.index_all`` with
    ``OfflineTextIndexer.index_segment`` as the per-segment action (which is
    expected to populate them), flushes the final in-progress section, and
    saves the word -> ref-num mapping (and its word list) as JSON.

    :param cls: the indexer class carrying the accumulator state.
    :param start: intended slice start over the library's indexes.
    :param end: intended slice end over the library's indexes.
        NOTE(review): start/end are currently only printed — the
        TextIndexer-based path indexes everything; confirm whether slicing
        should be re-applied.
    """
    print('parse_lib', start, end)
    cls.ref_num_min_N_title = []  # min ref_num of each book title: [[min_ref_num, book_title], ...]
    cls.ref_num = 0               # absolute running index over all refs
    cls.curr_title = None
    cls.curr_section = None
    cls.curr_section_text = u""   # only used for debugging
    cls.ref_num_2_full_name = []  # position in list is the ref_num (implicitly)
    cls.ref_num_2_part = []       # section refs with the book title stripped off
    # word -> set of all ref_nums in which that word appears
    cls.words_2_ref_nums = defaultdict(set)

    TextIndexer.index_all("", True, for_es=False, action=OfflineTextIndexer.index_segment)

    # index_all likely leaves a final section accumulated but not yet flushed;
    # flush it here.
    # NOTE(review): cls.section_ref is presumably set by index_segment — confirm.
    cls.index_section(cls.curr_title, cls.section_ref, cls.curr_section_text)

    # (A legacy, pre-TextIndexer implementation that lived here as a dead
    # triple-quoted block has been removed.)

    # Convert the sets to sorted lists so the structure is JSON-serializable.
    # BUGFIX: the comprehension previously iterated the bare local name
    # `words_2_ref_nums` (unbound on that line, raising UnboundLocalError);
    # the populated mapping lives on the class as `cls.words_2_ref_nums`.
    words_2_ref_nums = {key: sorted(value) for key, value in cls.words_2_ref_nums.iteritems()}
    save(WORDS_2_REF_NUMS, words_2_ref_nums)
    save(_ONLY_WORDS_LIST, words_2_ref_nums.keys())