def __init__(self, dict_path=None, name='Spelling_Aug', aug_min=1, aug_max=10, aug_p=0.3,
             stopwords=None, tokenizer=None, reverse_tokenizer=None, include_reverse=True,
             stopwords_regex=None, verbose=0):
    """Configure the spelling augmenter and load its model.

    The augmentation-range, stopword, tokenizer and verbosity arguments are
    handed to the parent initializer unchanged. The parent is always set up
    with ``Action.SUBSTITUTE``, ``device='cpu'`` and ``include_detail=False``.

    :param dict_path: Location of the spelling dictionary. Any falsy value
        (``None``, ``''``) selects ``word/spelling/spelling_en.txt`` under
        ``LibraryUtil.get_res_dir()``.
    :param include_reverse: Stored on the instance as ``include_reverse``;
        this initializer does not use it itself (presumably read while the
        model is built -- confirm against ``get_model``).
    """
    super().__init__(
        action=Action.SUBSTITUTE,
        name=name,
        aug_p=aug_p,
        aug_min=aug_min,
        aug_max=aug_max,
        stopwords=stopwords,
        tokenizer=tokenizer,
        reverse_tokenizer=reverse_tokenizer,
        device='cpu',
        verbose=verbose,
        stopwords_regex=stopwords_regex,
        include_detail=False,
    )

    # Fall back to the dictionary shipped with the library when the caller
    # did not provide one.
    if not dict_path:
        dict_path = os.path.join(
            LibraryUtil.get_res_dir(), 'word', 'spelling', 'spelling_en.txt')
    self.dict_path = dict_path

    self.include_reverse = include_reverse
    self.model = self.get_model(force_reload=False)
def __init__(self, name='Keyboard_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
             aug_word_p=0.3, aug_word_min=1, aug_word_max=10, stopwords=None,
             tokenizer=None, reverse_tokenizer=None, include_special_char=True,
             include_numeric=True, include_upper_case=True, lang="en", verbose=0,
             stopwords_regex=None, model_path=None, min_char=4, include_detail=False):
    """Configure the keyboard augmenter and load its key-mapping model.

    Character/word range arguments, stopwords, tokenizers, verbosity and
    ``include_detail`` are forwarded to the parent initializer, which is
    always set up with ``Action.SUBSTITUTE`` and ``device='cpu'``.

    :param lang: Language code used to pick the bundled mapping when
        ``model_path`` is ``None``. Only ``'en'`` and ``'th'`` are accepted
        in that case.
    :param model_path: Explicit path of a mapping file. When given, it is
        used as-is and ``lang`` is not validated here.
    :raises ValueError: If ``model_path`` is ``None`` and ``lang`` is
        neither ``'en'`` nor ``'th'``.
    """
    super().__init__(
        action=Action.SUBSTITUTE,
        name=name,
        min_char=min_char,
        aug_char_min=aug_char_min,
        aug_char_max=aug_char_max,
        aug_char_p=aug_char_p,
        aug_word_min=aug_word_min,
        aug_word_max=aug_word_max,
        aug_word_p=aug_word_p,
        tokenizer=tokenizer,
        reverse_tokenizer=reverse_tokenizer,
        stopwords=stopwords,
        device='cpu',
        verbose=verbose,
        stopwords_regex=stopwords_regex,
        include_special_char=include_special_char,
        include_detail=include_detail,
    )

    # TODO: support other type of keyboard
    self.keyboard_type = 'qwerty'
    self.include_special_char = include_special_char
    self.include_numeric = include_numeric
    self.include_upper_case = include_upper_case
    self.include_lower_case = True
    self.lang = lang

    if model_path is not None:
        # Caller supplied a mapping file: use it without checking ``lang``.
        self.model_path = model_path
    else:
        if lang not in ['en', 'th']:
            raise ValueError(
                'Only support en and th now. You may provide the keyboard mapping '
                'such that we can support "{}"'.format(lang))
        self.model_path = os.path.join(
            LibraryUtil.get_res_dir(), 'char', 'keyboard', lang + '.json')

    self.model = self.get_model(
        include_special_char, include_numeric, include_upper_case, lang, self.model_path)
def __init__(self, name='Keyboard_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
             aug_word_p=0.3, aug_word_min=1, aug_word_max=10, stopwords=None,
             tokenizer=None, reverse_tokenizer=None, include_special_char=True,
             include_numeric=True, include_upper_case=True, lang="en", verbose=0,
             stopwords_regex=None, model_path=None, min_char=4):
    """Configure the keyboard augmenter and load its key-mapping model.

    Character/word range arguments, stopwords, tokenizers and verbosity are
    forwarded to the parent initializer, which is always set up with
    ``Action.SUBSTITUTE``, ``device='cpu'`` and ``include_detail=False``.

    :param lang: Language code used to pick a bundled mapping when
        ``model_path`` is ``None``. It must match the stem of a ``.json``
        file in ``char/keyboard`` under ``LibraryUtil.get_res_dir()``.
    :param model_path: Explicit path of a mapping file. When given, it is
        used as-is and ``lang`` is not validated here.
    :raises ValueError: If ``model_path`` is ``None`` and no bundled
        ``<lang>.json`` mapping exists.
    """
    super().__init__(
        action=Action.SUBSTITUTE,
        name=name,
        min_char=min_char,
        aug_char_min=aug_char_min,
        aug_char_max=aug_char_max,
        aug_char_p=aug_char_p,
        aug_word_min=aug_word_min,
        aug_word_max=aug_word_max,
        aug_word_p=aug_word_p,
        tokenizer=tokenizer,
        reverse_tokenizer=reverse_tokenizer,
        stopwords=stopwords,
        device='cpu',
        verbose=verbose,
        stopwords_regex=stopwords_regex,
        include_special_char=include_special_char,
        include_detail=False,
    )

    # TODO: support other type of keyboard
    self.keyboard_type = 'qwerty'
    self.include_special_char = include_special_char
    self.include_numeric = include_numeric
    self.include_upper_case = include_upper_case
    self.include_lower_case = True
    self.lang = lang

    if model_path is None:
        keyboard_dir = os.path.join(LibraryUtil.get_res_dir(), "char", "keyboard")
        # Only genuine ``<lang>.json`` files count as supported languages;
        # other directory entries would pass validation here and then fail
        # later when the mapping is loaded.
        supported_langs = {
            stem
            for stem, ext in map(os.path.splitext, os.listdir(keyboard_dir))
            if ext == ".json"
        }
        if lang not in supported_langs:
            # Report the languages actually discovered rather than a fixed
            # list that goes stale when a mapping file is added.
            raise ValueError(
                'Only support {} now. You may provide the keyboard mapping '
                'such that we can support "{}"'.format(
                    ', '.join(sorted(supported_langs)), lang)
            )
        self.model_path = os.path.join(keyboard_dir, lang + ".json")
    else:
        self.model_path = model_path

    self.model = self.get_model(
        include_special_char, include_numeric, include_upper_case, lang, self.model_path)
def get_model(cls, dict_of_path):
    """Build the OCR character model from a mapping, a JSON file or the default.

    :param dict_of_path: One of:

        * a falsy value (``None``, ``''``, ``{}``) -- the bundled
          ``char/ocr/en.json`` under ``LibraryUtil.get_res_dir()`` is read;
        * a ``dict`` (or any ``dict`` subclass) -- used directly as the model;
        * anything else -- passed to ``ReadUtil.read_json`` as a file path.
    :return: ``nmc.Ocr`` built from the selected mapping.
    :raises ValueError: If reading ``dict_of_path`` as a file yields a falsy
        result (presumably a missing or empty file -- confirm against
        ``ReadUtil.read_json``).
    """
    # Use default
    if not dict_of_path:
        default_path = os.path.join(LibraryUtil.get_res_dir(), 'char', 'ocr', 'en.json')
        return nmc.Ocr(model=ReadUtil.read_json(default_path))

    # Use dict. isinstance (not an exact type check) so that dict subclasses
    # such as OrderedDict are accepted instead of being treated as a path.
    if isinstance(dict_of_path, dict):
        return nmc.Ocr(model=dict_of_path)

    # Use json from file
    model = ReadUtil.read_json(dict_of_path)
    if not model:
        raise ValueError('The dict_of_path does not exist. Please check "{}"'.format(dict_of_path))
    return nmc.Ocr(model=model)