def __init__(self, parent):
    self.parent = parent
    self.lang = "bo"
    self.mode = "default"
    self.tagger = None
    self.tokenizer = None
    self._words = []
    # self.tokenizer was just initialised to None above, so this check always
    # builds a fresh POS tokenizer; kept as in the original flow.
    if not self.tokenizer:
        self.tokenizer = pybo.BoTokenizer('POS')
def check_pybo_bo_tokenizer(main):
    if 'pybo_bo_tokenizer' not in main.__dict__:
        main.pybo_bo_tokenizer = pybo.BoTokenizer('GMD')
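# --- Usage sketch (illustrative, not from the original code) -----------------
# check_pybo_bo_tokenizer() attaches a single shared tokenizer to whatever
# object is passed as `main`, so repeated calls reuse the same instance
# (building a BoTokenizer is expensive, hence the caching). The SimpleNamespace
# host below is a hypothetical stand-in for the real application object.
from types import SimpleNamespace

main = SimpleNamespace()
check_pybo_bo_tokenizer(main)   # builds the 'GMD' tokenizer on first call
check_pybo_bo_tokenizer(main)   # no-op: tokenizer is already cached on main
tokens = main.pybo_bo_tokenizer.tokenize('བཀྲ་ཤིས་བདེ་ལེགས།')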
class ValidityError(Exception):
    '''Raised when a validity check is not passed'''


from collections import Counter

import pybo as bo

# 1. PREPARATION
# 1.1. Initializing the tokenizer
tok = bo.BoTokenizer('POS')

# 1.2. Loading in text
input_str = '༄༅། །རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྻ་ཨ་བ་ཏ་ར། བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། །སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། །བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང༌། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །'

# -------------------------

# 2. CREATING THE OBJECTS
# 2.1. creating pre_processed object
pre_processed = bo.PyBoTextChunks(input_str)

# 2.2. creating tokens object
tokens = tok.tokenize(input_str)

# -------------------------

# 3. TESTING ALL CLASS OBJECT ATTRIBUTES
# (this needs to be checking accuracy and not just function)
# 3.1. testing pre processed attributes
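# --- Hedged sketch of what such a validity check can look like ---------------
# The original file continues with the attribute checks themselves; as an
# illustration, here is one possible check applied to the tokens object.
# The attribute names Token.content and Token.pos are assumptions about the
# pybo Token class and may differ between pybo versions; the exact attributes
# of PyBoTextChunks are not reproduced here for the same reason.
def check_tokens(tokens, source):
    # Tokenization of non-empty input is expected to yield at least one token.
    if not tokens:
        raise ValidityError('tokenizer returned no tokens')
    # Joining every token's surface form is expected to reproduce the input
    # string (an assumption about how pybo chunks the text).
    rebuilt = ''.join(t.content for t in tokens)
    if rebuilt != source:
        raise ValidityError('tokens do not cover the whole input string')
    # With the 'POS' profile, every token is expected to carry a POS tag.
    if any(not getattr(t, 'pos', None) for t in tokens):
        raise ValidityError('some tokens are missing a POS tag')

check_tokens(tokens, input_str)
print(Counter(t.pos for t in tokens))  # quick look at the tag distribution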
import sys, os

grandParentDir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(grandParentDir)

from PyTib.common import open_file, write_file, tib_sort, is_sskrt, pre_process
import PyTib
import pybo
import re
from collections import defaultdict

tok = pybo.BoTokenizer('GMD')

# lex_path = '../PyTib/data/uncompound_lexicon.txt'
# lexicon = open_file(lex_path).strip().split('\n')
# lexicon = '\n'.join(tib_sort(list(set(lexicon))))
# write_file(lex_path, lexicon)


def rawify(string):
    # Strip 'a' markers and newlines, then remove leading "1. "-style numbering.
    return re.sub(r'[0-9]+\.\s?\s?', '', string.replace('a', '').replace('\n', ''))


def contains_sskrt(string):
    # True if any syllable of the string is detected as Sanskrit by is_sskrt().
    string = string.replace('#', '')
    has_sskrt = False
    syls = pre_process(string, mode='syls')
    for syl in syls:
        if not has_sskrt and is_sskrt(syl):
            has_sskrt = True
    return has_sskrt
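# --- Usage sketch (illustrative only; the sample strings are made up) --------
# rawify() cleans numbering and line breaks from a raw line; contains_sskrt()
# flags strings with Sanskrit-transliteration syllables. The expected results
# below depend on PyTib's is_sskrt() heuristics and are assumptions.
assert rawify('1. བཀྲ་ཤིས་\n') == 'བཀྲ་ཤིས་'
print(contains_sskrt('བོ་དྷི་སཏྭ་'))   # expected truthy: Sanskrit-style stacks
print(contains_sskrt('བཀྲ་ཤིས་'))      # expected falsy: plain Tibetan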
parser.add_argument('-o', required=True, help='folder to contain the output')
parser.add_argument('-c', help='generates a conc file for each input file if "true"')

if __name__ == '__main__':
    # args = parser.parse_args()
    in_dir = 'out'           # args.i
    out_dir = 'segmented'    # args.o
    gen_concs = False        # bool(args.c)
    if not in_dir or not out_dir:
        parser.print_help()
        exit()

    pybo_mode = 'GMD'
    in_files = sorted(Path(in_dir).glob('*.txt'))
    out_dir = Path(out_dir)
    tok = pybo.BoTokenizer(pybo_mode)  # GMD includes all available wordlists + sanskrit

    concs, sorted_types = generate_mistake_concs(in_files)
    if gen_concs:
        write_file_concs(concs, sorted_types, in_files, out_dir)
    write_total_concs(concs, sorted_types, out_dir)
    types = find_total_types(concs, sep='\t')
    Path(out_dir / 'mistake_types.txt').write_text('\n'.join(types))
def instanciate_tokenizer():
    # Regenerate the merged vocab file, hand it to pybo as a user word list,
    # then delete the temporary file once the tokenizer has been built.
    vocab_file = Path('input/lists/full-vocab.txt')
    parse_vocab_folder(str(vocab_file))
    tok = pybo.BoTokenizer('POS', user_word_list=[str(vocab_file)])
    os.remove(str(vocab_file))
    return tok
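# --- Usage sketch (illustrative only) -----------------------------------------
# Assumes the 'input/lists/' folder expected by parse_vocab_folder() exists and
# that pybo Token objects expose a .content attribute; the sample sentence is
# arbitrary.
tok = instanciate_tokenizer()
tokens = tok.tokenize('བཀྲ་ཤིས་བདེ་ལེགས།')
print([t.content for t in tokens])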