def generate_unified_version(editions, basis_edition='སྡེ་'):
    '''
    Merge the per-edition chunks into a single sequence in which the
    syllables shared by all editions appear once and the remaining,
    differing syllables are kept per edition.

    :param editions: dict mapping an edition name to a list of entries;
        element ``[0]`` of each entry is the chunk string of that edition
        (only ``entry[0]`` is read here).
    :param basis_edition: key of ``editions`` whose list length decides how
        many chunk positions are walked. Defaults to the edition that was
        previously hard-coded.
    :return: a list with common syllables as separate elements, differing
        parts within a dict (one dict per chunk position, mapping each
        edition name to the syllables left after the common prefix).
    :raises KeyError: if ``basis_edition`` is not a key of ``editions``.
    '''
    total = []
    # a. generate the list of editions' names
    ed_names = list(editions)
    # NOTE(review): the walk starts at position 1, so entry 0 of every
    # edition is never emitted - confirm this is intended.
    for syl_num in range(1, len(editions[basis_edition])):
        # b. segment in syllables and separate on the punctuation for each
        #    version ('_' stands for a space in the stored chunks)
        pre_processed = {
            ed: pre_process(
                editions[ed][syl_num][0].replace('_', ' '), mode='syls')
            for ed in ed_names
        }

        # c. add to common the syls that are the same in all editions and
        #    leave the others in pre_processed
        common = []
        while True:
            heads = {
                syls[0] if syls else ''
                for syls in pre_processed.values()
            }
            if len(heads) != 1:
                break
            # Stop as soon as any edition is exhausted: an exhausted edition
            # is represented by '' above, which could otherwise compare
            # equal to a literal '' syllable and make the deletion below
            # fail on an empty list.
            if not all(pre_processed[ed] for ed in ed_names):
                break
            common.append(pre_processed[ed_names[0]][0])
            for ed in ed_names:
                del pre_processed[ed][0]

        total.extend(common)
        total.append(pre_processed)
    return total
def contains_sskrt(string):
    '''
    Tell whether at least one syllable of ``string`` is accepted by
    ``is_sskrt``.

    :param string: text to inspect; every '#' is removed before the text is
        segmented with ``pre_process(..., mode='syls')``.
    :return: True as soon as one syllable satisfies ``is_sskrt``,
        False otherwise (including when there are no syllables).
    '''
    cleaned = string.replace('#', '')
    syllables = pre_process(cleaned, mode='syls')
    # any() stops at the first match, so is_sskrt is not called again
    # once a matching syllable has been found.
    return any(is_sskrt(syllable) for syllable in syllables)
def reinsert_notes(raw_text, raw_notes, basis_edition='སྡེ་'):
    """Build, for every edition, the list of text chunks with that edition's
    variant readings applied.

    The base text is cut into numbered chunks; each record of ``raw_notes``
    refers to one chunk and lists (edition-s, variant) pairs. For every
    record, one entry is appended to every edition: the modified chunk for
    the editions named in the record, the unmodified chunk for all others.

    :param raw_text: newline-separated string. Only lines containing
        ``<digits>.<whitespace>`` are used: the digits become the chunk key
        and the rest of the line is segmented with
        ``pre_process(..., mode='syls')``. Every 'a' character is removed
        and tabs become spaces beforehand.
    :param raw_notes: newline-separated, comma-separated records. The first
        line is discarded. In each record, field 1 is the page number,
        field 2 the note number (1 is subtracted to obtain the chunk key)
        and fields 4+ alternate between edition references (``《name》``) and
        the variant text. A variant starting with 'm' is a deletion, with
        'p' an addition, anything else a replacement.
    :param basis_edition: name of the edition the base text belongs to; it
        always gets an entry in the result.
    :return: dict mapping each edition name to a list of tuples
        ``(chunk, len(version) or '', page_number, note)``; a final tuple
        ``(chunk, '', '', '')`` holds the trailing text.

    Reads the module-level names ``show_note`` and ``note_num`` (debug
    switches) and prints diagnostics. When a variant still contains '《',
    a message is printed and no further record is processed.

    NOTE(review): the block nesting below was restored from a
    whitespace-flattened copy of this function - verify against version
    control before relying on it.
    """
    global note_num
    # NOTE(review): removes every latin 'a' from the text; the reason is not
    # visible here - confirm.
    raw_text = raw_text.replace('a', '').replace('\t', ' ').split('\n')
    raw_notes = re.sub(
        r'《([^《》་]+)》', r'《\1་》',
        raw_notes)  # add a tsek in the edition names that lack one.
    # the first line of the notes is skipped (presumably a header - confirm)
    raw_notes = raw_notes.strip().split('\n')[1:]

    # chunk key (string of digits) -> list of syllables of that chunk
    text = {}
    for t in raw_text:
        parts = re.split(r'([0-9]+)\.[\t\s]', t)[1:]
        if parts:
            note_number = parts[0]
            note_text = pre_process(parts[1], mode='syls')
            # keep at least one element so that [-1] lookups below work
            if note_text == []:
                note_text = ['']
            text[note_number] = note_text

    edition_regex = r'《([^《》]+)》'
    # finding all the editions that exist for that text
    edition_names = set(
        [e for r in raw_notes for e in re.findall(edition_regex, r)])
    editions = {basis_edition: []}
    for e in edition_names:
        editions[e] = []

    error = False
    for n in raw_notes:
        #if debug == 1:
        if show_note == 1:
            print('\t\t' + n)
        if error:
            break
        # skip records that only contain commas and spaces
        if n.replace(',', '').replace(' ', '') == '':
            continue
        parts = n.split(',')
        # key of the chunk this note applies to
        number = str(int(parts[2]) - 1)
        # DEBUG. Enables to start debugging at a given note
        #note_num = 304
        if number == str(note_num - 1):
            print('ok')
        page_number = parts[1]
        content = parts[4:]
        note = ''
        # keep track of which edition has already been replaced
        generated_versions = {basis_edition: False}
        for e in edition_names:
            generated_versions[e] = False

        # loop through tuples of (edition-s, note)
        max_pairs = len(content) - 1
        # this condition can never hold (len - 1 is never greater than len)
        if max_pairs > len(content):
            max_pairs = len(content) - 1
        # even indexes only, so that content[a + 1] always exists
        tuple_idx = [c for c in range(0, max_pairs) if c % 2 == 0]
        for a in tuple_idx:
            if error:
                break
            if content[a]:  # filters the cases where the second tuple is empty
                note = content[a + 1]
                if '(' in note:
                    print('there is a note on top of the comparison.')
                    print('\t'.join(parts))
                    # only keep what precedes the parenthesis
                    note = note.split('(')[0].strip()
                if '《' in note:
                    print(
                        'The following note needs to be edited. The execution will stop now.'
                    )
                    print('\t'.join(parts))
                    error = True
                    break

                # 0 prepare
                # separate in syllables not separating the fused particles
                modif_type = ''
                if note.startswith('m'):
                    modif_type = 'm'
                elif note.startswith('p'):
                    modif_type = 'p'
                # str.replace removes every occurrence of the marker letter,
                # not only the leading one; with modif_type == '' the note is
                # left unchanged
                version = pre_process(note.replace(modif_type, ''), mode='syls')
                # delete the last element in the list of the note
                #if is_punct(version[-1]):
                if is_punct(version[-1]) and len(version) > 1:
                    del version[-1]

                # reconstitute the punctuation for comparing the syllables:
                # add a tsek to it if the original text has one
                # if the last syllable is not a punctuation
                if not version[-1].endswith('་'):
                    if not is_punct(text[number][-1]):
                        if not text[number][-1].endswith('་'):
                            version[-1] += '་'
                    # if the last syllable is a punctuation
                    elif is_punct(
                            text[number][-1]) and len(text[number]) > 1:
                        if text[number][-2].endswith('་'):
                            version[-1] += '་'

                # 1 find index
                # 1.a
                # find the index of the syllable from which to start replacing the original
                index = len(text[number]) - len(version)
                # go one syllable left if the last syllable of the original text is a punctuation
                if is_punct(text[number][-1]):
                    index -= 1
                # put the index at 0 if the replacement text is longer than the original
                if index < 0:
                    index = 0

                # 1.b
                # try to find a point of correspondence in case there are more than a few syllables that are added
                # NOTE(review): a sync index of 0 is falsy, so the
                # `if orig_sync_idx:` tests below treat it like "no sync
                # point" - confirm this is intended.
                orig_sync_idx = False
                version_sync_idx = False
                window_size = 4
                # `maximum` and `window_indexes` are only used by the
                # disabled window search below
                maximum = len(text[number]) - 1
                # attempts_num becomes 0 if window_size is larger than the length of version, making window_indexes an empty list.
                # this way, window_size decides whether we search for a synchronisation point or not.
                attempts_num = len(version[window_size:])
                window_indexes = [(a, a + window_size)
                                  for a in range(attempts_num)]
                # (disabled window search)
                # for v_w in window_indexes:
                #     for a_n in range(attempts_num):
                #         orig_window = text[number][maximum - window_size - a_n:maximum - a_n]
                #         version_window = version[v_w[0]:v_w[1]]
                #         if orig_window == version_window:
                #             if not orig_sync_idx:
                #                 orig_sync_idx = maximum - window_size - a_n
                #                 version_sync_idx = v_w[0]

                # finding the sync point if it is the last syllable
                if not orig_sync_idx:
                    # detects which of the two syls is the longest to check if both start the same way
                    if len(text[number][-1]) > len(version[0]):
                        long = text[number][-1]
                        #long = ''.join(text[number][index:])
                        short = version[0].rstrip('་')
                        #short = ''.join(version)
                    else:
                        long = version[0]
                        short = text[number][-1].rstrip('་')
                        # long = ''.join(version)
                        # short = ''.join(text[number][index:])
                    # finds if long is short with an addition. This deals with བདེའང་ being replaced by བདེ་བའང་.
                    # Todo: similar replacements may occur elsewhere than the last syllable. implementation needed.
                    # in case both syllables are identical, the condition is also met.
                    if short in long:
                        if short + '་' != long and (
                                len(version) == 1 or short == strip_particle(
                                    long.strip('་'))):
                            # example record: ,,9,4,《པེ་》《སྣར་》,pཔོ།,
                            if modif_type == 'p':
                                orig_sync_idx = len(text[number])
                            else:
                                orig_sync_idx = len(text[number]) - 1
                        else:
                            orig_sync_idx = index
                        version_sync_idx = 0
                    elif strip_particle(long.strip('་')) == strip_particle(
                            short.strip('་')):
                        orig_sync_idx = len(text[number]) - 1

                # 2
                # generating the versions of the different editions
                # (work on a copy so that text[number] stays untouched)
                edition_text = [b for b in text[number]]
                # A.1 for subsequent addition, keep the last syllable if it is a punctuation to add it at the end
                edition_text_last_syl = False
                if is_punct(edition_text[-1]):
                    edition_text_last_syl = edition_text[-1]
                    #orig_sync_idx -= 1  # as note's conjunction are removed
                # remove the ending tsek in version if it was not there in the original
                if edition_text[-1].endswith('་'):
                    if not version[-1].endswith('་'):
                        version[-1] += '་'
                    if version[-1].endswith('ང'):
                        version[-1] += '་'
                else:
                    if version[-1].endswith(
                            '་') and not version[-1].endswith('ང་'):
                        version[-1] = version[-1].rstrip('་')

                # 2.1 if the operation is a deletion (m stands for minus)
                if modif_type == 'm':
                    # a if there is a synchronizing point between the original and the version
                    if orig_sync_idx:
                        del edition_text[orig_sync_idx:]
                    # b if there is no sync point
                    else:
                        # count the saved punctuation among the syllables
                        # to drop; it is re-appended in A.2
                        if edition_text_last_syl:
                            version.append(edition_text_last_syl)
                        del edition_text[len(edition_text) - len(version):]

                # 2.2 if the operation is an addition (p stands for plus)
                elif modif_type == 'p':
                    # a if there is a synchronizing point between the original and the version
                    if orig_sync_idx:
                        # replace the part that precedes the synchronising point
                        edition_text[
                            orig_sync_idx - version_sync_idx:
                            orig_sync_idx] = version[:version_sync_idx]
                        # inserting at the synchronising point (nothing is removed)
                        edition_text[orig_sync_idx:orig_sync_idx] = version[
                            version_sync_idx:]
                    # b if there is no sync point
                    else:
                        # add a tsek if there is none on the last syllable
                        if not edition_text[-1].endswith(
                                '་') and edition_text_last_syl != '།':
                            edition_text[-1] += '་'
                        # remove the ending tsek of version
                        if version[-1].endswith('་'):
                            version[-1] = version[-1].rstrip('་')
                        edition_text.extend(version)

                # 2.3 if the operation is a replacement
                else:
                    if orig_sync_idx:
                        # replace the part that precedes the synchronising point
                        edition_text[
                            orig_sync_idx - version_sync_idx:
                            orig_sync_idx] = version[:version_sync_idx]
                        # replacing from the synchronising point onwards
                        edition_text[orig_sync_idx:] = version[
                            version_sync_idx:]
                    # 2.b if there is no synchronising point
                    else:
                        # minus3_window is defined elsewhere; its result is
                        # used as a number of syllables to step back
                        backward_step = minus3_window(edition_text, version,
                                                      index)
                        index -= backward_step
                        # if len(version)>1:
                        #     if edition_text[index+1] in version:
                        #         index +=1
                        # edition_text.append('')
                        for e in range(len(version)):
                            #print(e)  # གཞུང་འདིའི་བསླབ་པ་ལ་ནི་བསླབ་པར་ དབུ་མ་རིན་པོ་ཆེའི་སྒྲོན་མ།.txt
                            #print(version[e])
                            edition_text[index + e] = version[e]
                        # drop as many trailing syllables as were stepped back
                        if backward_step:
                            edition_text = edition_text[:-backward_step]

                # A.2 add the punctuation to the end if needed
                # if a punctuation was saved in A.1 and if it is not the same as the last syllable of edition_text
                if edition_text_last_syl and len(edition_text) > 0:
                    if edition_text_last_syl != edition_text[-1]:
                        # if the last syllable ends with a tsek
                        if edition_text[-1].endswith('་'):
                            # drop that tsek unless the syllable ends in ང་
                            if not edition_text[-1].endswith('ང་'):
                                edition_text[-1] = edition_text[-1][:-1]
                        elif edition_text[-len(version)] == version[
                                -1] and edition_text[-len(version)] != '།།_།།':
                            edition_text[-2] = edition_text[-1]
                            edition_text[-1] = ''
                        if edition_text[-len(version)] != '།།_།།':
                            edition_text.append(edition_text_last_syl)

                # 2.4 if a sync point was found, i.e. if the size of version is longer than window_size,
                # add '%' to manually check the replacement has been correctly done
                #if orig_sync_idx:
                #    edition_text[-1] += '%'

                # 3 Add the text to the respective editions
                #
                edition_refs = re.findall(edition_regex, content[a])
                # 3.a add the versions of all the editions that require modifications from Derge and notify the edition is added
                for e in edition_refs:
                    chunk = ''.join(edition_text)
                    # remove the extra spaces inserted between the shad and the next verse
                    chunk = chunk.replace('_།_', '_།').replace('_', ' ')
                    editions[e].append(
                        (chunk, len(version), page_number, note))
                    generated_versions[e] = True

        # 3.b add the original version of the text to the remaining
        for g in generated_versions:
            if not generated_versions[g]:
                chunk = ''.join(text[number])
                # remove the extra spaces inserted between the shad and the next verse
                chunk = chunk.replace('_།_', '_།').replace('_', ' ')
                editions[g].append((chunk, '', page_number, note))

    # 4 add the last bit of the text that corresponds to no note
    # NOTE(review): assumes str(len(text)) is the key of the final chunk,
    # i.e. that chunk numbers run from 1 without gaps - confirm.
    for g in editions:
        chunk = ''.join(text[str(len(text))])
        chunk = chunk.replace('_།_', '_།').replace('_', ' ')
        editions[g].append((chunk, '', '', ''))
    return editions
def agree_zhas(chosen_ed):
    '''
    Pass the final syllable of ``chosen_ed`` together with the particle
    'ཞེས' to ``Agreement().part_agreement`` and return its result.

    :param chosen_ed: text whose last syllable (as segmented by
        ``pre_process(..., mode='syls')``) is handed to the agreement check.
    :return: whatever ``Agreement.part_agreement`` returns for that
        syllable and 'ཞེས'.
    :raises IndexError: if ``pre_process`` yields an empty sequence.
    '''
    syllables = pre_process(chosen_ed, mode='syls')
    final_syllable = syllables[-1]
    agreement = Agreement()
    return agreement.part_agreement(final_syllable, 'ཞེས')
return for c in combs(xs, i + 1): yield c yield c + (xs[i], ) input = [len(note_texts[a]) for a in note_texts] combinations = [a for a in combs(input) if len(a) == 2] count = {abs(a[0] - a[1]) for a in combinations} final = 0 for c in count: if c > final: final = c return final # 0. find parts note_texts = pre_process(note) content = {syl for ed in note_texts for syl in note_texts[ed] if syl != ''} if len(content) != 0: if is_tibetan_text(note_texts): # 1. process mistakes process_mistakes(note_texts) # 2. if the difference is a particle process_minor_modifications(note_texts) # 3. verb differences verb_difference(note_texts, verbs) # 4. well-formed non-words if not already_exists(categorised, note[0]):