def check_demarcation(search_key):
    """
    Sanity check: make sure a certain search key can be used to find the
    beginning of the ktav yad rashi in each text file.

    Scans the files named u'מנחות_<daf>.txt' (relative to the current working
    directory) for every page from functions.get_page(72, 'b') through
    functions.get_page(94, 'a') inclusive. Prints the name of every file in
    which the key does not appear, followed by a summary line with the number
    of files scanned and the number of files containing the key.

    :param search_key: A string indicating where ktav yad rashi begins.
    """
    total, count = 0, 0

    first_page = functions.get_page(72, 'b')
    last_page = functions.get_page(94, 'a')

    # loop through files
    for page in range(first_page, last_page + 1):
        file_name = u'מנחות_{}.txt'.format(functions.get_daf(page))

        # 'with' guarantees the handle is released even if reading/decoding
        # a line raises part-way through the file.
        with codecs.open(file_name, 'r', 'utf-8') as rashi_file:
            total += 1
            # any() stops at the first matching line, so each file is counted
            # at most once no matter how often the key appears.
            found_key = any(search_key in line for line in rashi_file)

        if found_key:
            count += 1
        else:
            print(file_name)

    print('{} files scanned, found key in {} file'.format(total, count))
def split_files(search_key):
    """
    Loops through files, splitting Rashi and ktav yad rashi into 2 different
    files. Recommend running check_demarcation first.

    For every page from functions.get_page(72, 'b') through
    functions.get_page(94, 'a') inclusive, the source file u'מנחות_<daf>.txt'
    is read line by line. Lines before the first line containing search_key
    are written to 'rashi_fixed/<file name>'; that line and every line after
    it are written to 'ktav_yad_rashi/<file name>'. If the key never appears,
    the whole file ends up in 'rashi_fixed' and the ktav yad rashi file is
    left empty.

    :param search_key: key to find end of Rashi and beginning of ktav yad rashi
    """
    first_page = functions.get_page(72, 'b')
    last_page = functions.get_page(94, 'a')

    # loop through files
    for page in range(first_page, last_page + 1):
        file_name = u'מנחות_{}.txt'.format(functions.get_daf(page))
        rashi_path = u'rashi_fixed/{}'.format(file_name)
        ktav_yad_path = u'ktav_yad_rashi/{}'.format(file_name)

        # The source is opened first so that a missing/unreadable input does
        # not create or truncate the two output files. All three handles are
        # closed by the 'with' block even when an error occurs mid-file.
        with codecs.open(file_name, 'r', 'utf-8') as original, \
                codecs.open(rashi_path, 'w', 'utf-8') as rashi, \
                codecs.open(ktav_yad_path, 'w', 'utf-8') as ktav_yad_rashi:

            found = False
            for line in original:
                # Once the demarcation line has been seen the flag stays set,
                # so everything from that line onwards is ktav yad rashi.
                if not found and search_key in line:
                    found = True

                target = ktav_yad_rashi if found else rashi
                target.write(line)
def separate_ktav_yad_rashi():
    """
    Classify every ref in "Rashi on Menachot.72b-94a" by which of the split
    files (see split_files) it can be found in.

    Starting from get_deepest_ref(Ref("Rashi on Menachot.72b")) and advancing
    with next_segment_ref(), each ref is looked up with find_TextChunk_in_file
    in both 'rashi_fixed/<file name>' and 'ktav_yad_rashi/<file name>' for its
    daf.

    :return: A dict of lists of refs with the keys:
        'rashi'          - found only in the rashi_fixed file
        'ktav yad rashi' - found only in the ktav_yad_rashi file
        'found in both'  - found in both files
        'found in none'  - found in neither file
    """
    # set up a range of refs
    ref_range = Ref("Rashi on Menachot.72b-94a")
    ref = get_deepest_ref(Ref("Rashi on Menachot.72b"))

    results = {
        'rashi': [],
        'ktav yad rashi': [],
        'found in both': [],
        'found in none': [],
    }

    # (found in rashi, found in ktav yad rashi) -> results key
    category_for = {
        (True, True): 'found in both',
        (False, False): 'found in none',
        (True, False): 'rashi',
        (False, True): 'ktav yad rashi',
    }

    # NOTE(review): the 'is not None' guard is defensive - presumably
    # next_segment_ref() returns None once the text is exhausted; confirm.
    while ref is not None and ref_range.contains(ref):

        # NOTE(review): the -2 presumably converts ref.sections[0] to the page
        # numbering expected by functions.get_daf - confirm against functions.
        file_name = u'מנחות_{}.txt'.format(functions.get_daf(ref.sections[0] - 2))
        rashi_path = u'rashi_fixed/{}'.format(file_name)
        ktav_yad_path = u'ktav_yad_rashi/{}'.format(file_name)

        # open the files; 'with' closes both handles even if the lookup raises
        with codecs.open(rashi_path, 'r', 'utf-8') as rashi, \
                codecs.open(ktav_yad_path, 'r', 'utf-8') as ktav_yad_rashi:
            # look for ref in files
            in_rashi = bool(find_TextChunk_in_file(ref, rashi))
            in_ktav_yad = bool(find_TextChunk_in_file(ref, ktav_yad_rashi))

        results[category_for[(in_rashi, in_ktav_yad)]].append(ref)

        # get next ref
        ref = ref.next_segment_ref()

    return results