def chars_to_delete(text):
    """Return the characters that should be blanked out of *text*.

    *text* is iterated twice (items, then the characters of each item), so
    it is expected to be an iterable of strings such as a list of lines.

    The result lists, in order of first appearance, every distinct character
    for which ``non_tib_chars`` returns a truthy value, followed by a fixed
    set of six Tibetan punctuation marks. The punctuation is appended
    unconditionally, so a mark may appear twice if it was already collected.
    """
    found = []
    for line in text:
        for symbol in line:
            # Already collected: skip without calling the helper again.
            if symbol in found:
                continue
            if non_tib_chars(symbol):
                found.append(symbol)
    # add punctuation to be deleted
    found += ['།', '༎', '༄', '༅', '༔', '༑']
    return found
import os
import re

import PyTib
from PyTib.common import non_tib_chars, open_file, write_file, tib_sort, bisect_left

# Directory scanned for input files, and the output directory.
# `out_path` and `new` are not used in the part of the script shown here.
in_path = 'input/'
out_path = 'output/'

new = []
for f in os.listdir(in_path):
    # Load the file, map every '༌' to '་' (presumably normalising the
    # non-breaking tsheg to the regular one), and split into lines.
    raw_lines = open_file(os.path.join(in_path, f)).replace('༌', '་').split('\n')
    # Drop empty lines, then trim surrounding whitespace on the rest.
    content = [a.strip() for a in raw_lines if a != '']

    # find all non-tibetan characters (distinct, in order of first appearance)
    to_delete = []
    for c in content:
        for char in c:
            if char not in to_delete and non_tib_chars(char):
                to_delete.append(char)
    # add punctuation to be deleted
    to_delete.extend(['།', '༎', '༄', '༅', '༑'])

    # replace them with spaces
    # Every entry of to_delete is a single character, so one translate()
    # pass is equivalent to replacing each of them in turn.
    blank_out = str.maketrans(dict.fromkeys(to_delete, ' '))
    # Every line is processed. The previous `range(len(content) - 1)` loop
    # silently skipped the last line of each file, although empty lines had
    # already been filtered out above.
    text = [re.sub(r'\s+', ' ', line.translate(blank_out)) for line in content]

    # Split each cleaned line on spaces, trim tshegs from both ends of every
    # chunk, discard empty chunks and terminate each entry with one tsheg.
    lexicon = []
    for t in text:
        for u in t.split(' '):
            entry = u.strip('་')
            if entry != '':
                lexicon.append(entry + '་')