Example #1
0
def chars_to_delete(text):
    """Build the list of characters that should be removed from ``text``.

    Each character of each item in ``text`` is tested with ``non_tib_chars``;
    the ones it flags are collected once each, in order of first appearance.
    A fixed list of punctuation marks is then appended unconditionally.

    :param text: iterable of strings (for instance the lines of a document)
    :return: list of single-character strings
    """
    ordered = []
    known = set()  # mirrors ``ordered`` to make the duplicate check cheap
    for ch in (ch for line in text for ch in line):
        # already collected, or not flagged by non_tib_chars: nothing to do
        if ch in known or not non_tib_chars(ch):
            continue
        known.add(ch)
        ordered.append(ch)
    # add punctuation to be deleted
    ordered += ['།', '༎', '༄', '༅', '༔', '༑']
    return ordered
import PyTib
from PyTib.common import non_tib_chars, open_file, write_file, tib_sort, bisect_left
import os
import re
# Directory that is scanned for input files, and the output directory name.
in_path = 'input/'
out_path = 'output/'

new = []
# Matches any run of whitespace; compiled once instead of once per line.
_whitespace_run = re.compile(r'\s+')
for f in os.listdir(in_path):
    # Read the file, replace every '༌' with '་' (presumably to unify the two
    # tsheg variants), then split into lines.
    content = open_file(in_path+f).replace('༌', '་').split('\n')
    # Drop empty lines and trim surrounding whitespace on the remaining ones.
    content = [a.strip() for a in content if a != '']
    # find all non-tibetan characters (each one kept once, in order of
    # first appearance)
    to_delete = []
    seen = set()  # mirrors to_delete for a cheap duplicate check
    for c in content:
        for char in c:
            if char not in seen and non_tib_chars(char):
                seen.add(char)
                to_delete.append(char)
    # add punctuation to be deleted
    # NOTE(review): chars_to_delete() in this file also lists '༔' here;
    # confirm which of the two lists is the intended one.
    to_delete.extend(['།', '༎', '༄', '༅', '༑'])

    # replace them with spaces
    # Iterate over every line: the previous range(len(content)-1) stopped one
    # short and silently dropped the last line of each file.
    text = []
    for line in content:
        for t in to_delete:
            line = line.replace(t, ' ')
        text.append(_whitespace_run.sub(' ', line))

    # Split each cleaned line on spaces and store every non-empty token with
    # exactly one trailing '་'.
    lexicon = []
    for t in text:
        for u in t.split(' '):
            syllable = u.strip('་')
            if syllable != '':
                lexicon.append(syllable + '་')