def test_segment(input_text):
    tokenizer = MaxMatchForwardTokenizer()
    tokenizer.load_model()

    result = tokenizer.segment(input_text)

    pytest.helpers.assert_token_equals(result, input_text)
def test_persist(tmpdir):
    temp_path = tmpdir.mkdir("dag")
    temp_path_str = str(temp_path)

    tokenizer = MaxMatchForwardTokenizer()
    tokenizer.train_one_line(["我", "是", "中国人"])
    tokenizer.train_one_line(["你", "打", "人"])
    tokenizer.do_train()

    tokenizer.persist_to_dir(temp_path_str)

    assert len(temp_path.listdir()) == 1
def test_train(input_text):
    tokenizer = MaxMatchForwardTokenizer()
    tokenizer.train_one_line(["我", "是", "中国人"])
    tokenizer.train_one_line(["你", "打", "人"])
    tokenizer.do_train()

    result = tokenizer.segment(input_text)

    pytest.helpers.assert_token_equals(result, input_text)
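# A minimal sketch (not part of the test suite) of the workflow the tests
# above exercise end to end: train a dictionary-based tokenizer from
# pre-segmented lines, persist it, then reload it from the same directory.
# The helper name is ours, and the assumption that
# MaxMatchForwardTokenizer(model_dir) followed by load_model() reads back
# exactly what persist_to_dir() wrote is inferred from the tests, not
# guaranteed here.
def example_train_persist_reload(model_dir):
    tokenizer = MaxMatchForwardTokenizer()
    tokenizer.train_one_line(["我", "是", "中国人"])
    tokenizer.train_one_line(["你", "打", "人"])
    tokenizer.do_train()
    tokenizer.persist_to_dir(model_dir)

    # reload the persisted dictionary and segment unseen text
    reloaded = MaxMatchForwardTokenizer(model_dir)
    reloaded.load_model()
    return reloaded.segment("我是中国人")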
def from_disk(self, model_path, tokenizer_list, *args, **kwargs):
    # type: (str, List[BaseTokenizer]) -> None
    backward_tokenizer = MaxMatchBackwardTokenizer()
    forward_tokenizer = MaxMatchForwardTokenizer()

    backward_loader = BackwardDictionaryBasedLoader.instance()
    forward_loader = ForwardDictionaryBasedLoader.instance()

    backward_loader.from_disk(None, [backward_tokenizer])
    forward_loader.from_disk(None, [forward_tokenizer])

    for tokenizer in tokenizer_list:
        tokenizer.assign_from_loader(backward_tokenizer=backward_tokenizer,
                                     forward_tokenizer=forward_tokenizer)
class TokenizerLoader(object):
    Defaults = BaseDefaults

    factories = {
        'max_match_forward_tokenizer':
            lambda nlp, **cfg: MaxMatchForwardTokenizer(**cfg),
        'max_match_backward_tokenizer':
            lambda nlp, **cfg: MaxMatchBackwardTokenizer(**cfg),
        'max_match_bidirectional_tokenizer':
            lambda nlp, **cfg: MaxMatchBidirectionalTokenizer(**cfg),
        'dag_tokenizer': lambda nlp, **cfg: DAGTokenizer(**cfg),
        'hmm_tokenizer': lambda nlp, **cfg: HMMTokenizer(**cfg),
        'crf_tokenizer': lambda nlp, **cfg: CRFTokenizer(**cfg)
    }

    def __init__(self, meta=None, **kwargs):
        self.meta = {} if meta is None else meta
        self.tokenizers = {}

    def create_tokenizer(self, name, config=dict()):
        """Create a tokenizer component from a factory.

        name (unicode): Factory name to look up in `TokenizerLoader.factories`.
        config (dict): Configuration parameters to initialise the component.
        RETURNS: The newly created tokenizer instance.
        """
        if name not in self.factories:
            raise KeyError(Errors.E002.format(name=name))

        factory = self.factories[name]
        return factory(self, **config)

    def add_tokenizer(self, component, name=None):
        if not isinstance(component, BaseTokenizer):
            msg = Errors.E003.format(component=repr(component), name=name)
            if isinstance(component, basestring_) and component in self.factories:
                msg += Errors.E004.format(component=component)
            raise ValueError(msg)

        if name is None:
            if hasattr(component, 'name'):
                name = component.name
            elif hasattr(component, '__name__'):
                name = component.__name__
            elif (hasattr(component, '__class__')
                  and hasattr(component.__class__, '__name__')):
                name = component.__class__.__name__
            else:
                name = repr(component)

        if name in self.tokenizers:
            raise ValueError(
                Errors.E007.format(name=name, opts=self.tokenizers.keys()))

        self.tokenizers[name] = component

    def from_disk(self, path, disable=tuple()):
        path = util.ensure_path(path)

        deserializers = OrderedDict()

        loader_name_to_tokenizer = defaultdict(list)
        loader_name_to_class = dict()
        loader_name_to_instance = dict()

        for name, tokenizer in self.tokenizers.items():
            if name in disable:
                continue

            # TODO: why does spaCy use this check?
            # if not hasattr(tokenizer, 'to_disk'):
            #     continue

            loader_class = tokenizer.get_loader()
            loader_name = loader_class.get_name()

            loader_name_to_tokenizer[loader_name].append(tokenizer)

            if name not in loader_name_to_class:
                loader_name_to_class[loader_name] = loader_class

        for loader_name, loader_class in loader_name_to_class.items():
            loader_config = self.meta.get('loader_config', {}).get(loader_name, {})
            loader_name_to_instance[loader_name] = loader_class.instance(
                **loader_config)

        for loader_name, tokenizer in loader_name_to_tokenizer.items():
            loader_instance = loader_name_to_instance[loader_name]

            # if hasattr(loader_instance, 'skip_load_from_disk'):
            #     continue

            deserializers[loader_name] = (
                lambda p, loader_instance=loader_instance, tokenizer=tokenizer:
                    loader_instance.from_disk(p, tokenizer),
                loader_instance.get_model_dir()
            )

        exclude = {p: False for p in disable}
        util.from_disk(path, deserializers, exclude)

        return self

    def get_tokenizer(self):
        def assemble_max_match_bidirectional_tokenizer(forward_tokenizer,
                                                       backward_tokenizer):
            if forward_tokenizer and backward_tokenizer:
                max_match_bidirectional_tokenizer = MaxMatchBidirectionalTokenizer()
                max_match_bidirectional_tokenizer.forward_tokenizer = forward_tokenizer
                max_match_bidirectional_tokenizer.backward_tokenizer = backward_tokenizer

                return max_match_bidirectional_tokenizer

            return None

        forward_tokenizer = self.tokenizers.get('max_match_forward_tokenizer')
        backward_tokenizer = self.tokenizers.get('max_match_backward_tokenizer')

        tokenizer = Tokenizer()
        tokenizer.max_match_forward_tokenizer = forward_tokenizer
        tokenizer.max_match_backward_tokenizer = backward_tokenizer
        tokenizer.max_match_bidirectional_tokenizer = assemble_max_match_bidirectional_tokenizer(
            forward_tokenizer, backward_tokenizer)
        tokenizer.hmm_tokenizer = self.tokenizers.get('hmm_tokenizer')
        tokenizer.dag_tokenizer = self.tokenizers.get('dag_tokenizer')
        tokenizer.crf_tokenizer = self.tokenizers.get('crf_tokenizer')

        return tokenizer
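# A hedged usage sketch for TokenizerLoader: register factories by their names
# from TokenizerLoader.factories, load their model data from disk, and
# assemble the user-facing Tokenizer facade. The helper name, the `model_path`
# argument, and the empty `meta` are illustrative assumptions; the on-disk
# layout expected by from_disk() is whatever each loader's get_model_dir()
# points at.
def example_build_tokenizer(model_path):
    loader = TokenizerLoader(meta={})

    for name in ('max_match_forward_tokenizer', 'max_match_backward_tokenizer'):
        # pass `name` explicitly so it matches the keys get_tokenizer() looks up
        loader.add_tokenizer(loader.create_tokenizer(name), name=name)

    loader.from_disk(model_path)

    tokenizer = loader.get_tokenizer()
    # bidirectional segmentation is available because both the forward and the
    # backward tokenizer were registered above
    return tokenizer.cut_by_max_match_bidirectional("王小明在北京的清华大学读书")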
class Tokenizer(object):
    def __init__(self, model_dir=None):
        if model_dir is None:
            model_dir = default_model_dir

        self.model_dir = model_dir

        self.dag_tokenizer = None  # type: DAGTokenizer
        self.hmm_tokenizer = None  # type: HMMTokenizer
        self.max_match_forward_tokenizer = None  # type: MaxMatchForwardTokenizer
        self.max_match_backward_tokenizer = None  # type: MaxMatchBackwardTokenizer
        self.max_match_bidirectional_tokenizer = None  # type: MaxMatchBidirectionalTokenizer
        self.crf_tokenizer = None  # type: CRFTokenizer

    def init_dag_tokenizer(self):
        if self.dag_tokenizer is None:
            self.dag_tokenizer = DAGTokenizer(self.model_dir)
            self.dag_tokenizer.load_model()

    def cut_by_DAG(self, message):
        self.init_dag_tokenizer()
        return self.dag_tokenizer.segment(message)

    def init_hmm_tokenizer(self):
        if self.hmm_tokenizer is None:
            self.hmm_tokenizer = HMMTokenizer(self.model_dir)
            self.hmm_tokenizer.load_model()

    def cut_by_HMM(self, message):
        self.init_hmm_tokenizer()
        return self.hmm_tokenizer.segment(message)

    def cut_by_joint_model(self, message):
        solutions = [self.cut_by_DAG(message), self.cut_by_HMM(message)]

        merge_solutions = MergeSolutions()
        best_solution = merge_solutions.merge(solutions)

        return best_solution

    cut = cut_by_DAG

    def init_max_match_forward_tokenizer(self):
        if self.max_match_forward_tokenizer is None:
            self.max_match_forward_tokenizer = MaxMatchForwardTokenizer()
            self.max_match_forward_tokenizer.load_model()

    def cut_by_max_match_forward(self, message):
        self.init_max_match_forward_tokenizer()
        return self.max_match_forward_tokenizer.segment(message)

    def init_max_match_backward_tokenizer(self):
        if self.max_match_backward_tokenizer is None:
            self.max_match_backward_tokenizer = MaxMatchBackwardTokenizer()
            self.max_match_backward_tokenizer.load_model()

    def cut_by_max_match_backward(self, message):
        self.init_max_match_backward_tokenizer()
        return self.max_match_backward_tokenizer.segment(message)

    def init_max_match_bidirectional_tokenizer(self):
        if self.max_match_bidirectional_tokenizer is None:
            self.max_match_bidirectional_tokenizer = MaxMatchBidirectionalTokenizer()
            self.max_match_bidirectional_tokenizer.load_model()

    def cut_by_max_match_bidirectional(self, message):
        self.init_max_match_bidirectional_tokenizer()
        return self.max_match_bidirectional_tokenizer.segment(message)

    def init_crf_tokenizer(self):
        if self.crf_tokenizer is None:
            self.crf_tokenizer = CRFTokenizer()
            self.crf_tokenizer.load_model()

    def cut_by_CRF(self, message):
        self.init_crf_tokenizer()
        return self.crf_tokenizer.segment(message)

    def load_custom_dict(self, dict_file):
        # TODO: not implemented yet
        pass

    def add_word(self, word, freq=None):
        # TODO: not implemented yet
        pass

    def del_word(self, word):
        # TODO: not implemented yet
        pass

    def load_user_dict(self, dict_file):
        return self.dag_tokenizer.dict_data.load_user_dict(dict_file)

    @property
    def mini_log_freq(self):
        # TODO: not implemented yet
        pass

    @property
    def average_log_freq(self):
        # TODO: not implemented yet
        pass
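# A hedged sketch of the Tokenizer facade above: every cut_by_* method lazily
# loads its underlying model on first use, so constructing Tokenizer() is
# cheap and the first call per algorithm pays the load cost. The helper name
# and sample sentence are illustrative, and the sketch assumes pretrained
# model files exist under default_model_dir.
def example_facade_usage():
    tokenizer = Tokenizer()

    print(tokenizer.cut("王小明在北京的清华大学读书"))  # alias of cut_by_DAG
    print(tokenizer.cut_by_HMM("王小明在北京的清华大学读书"))
    print(tokenizer.cut_by_joint_model("王小明在北京的清华大学读书"))  # merged DAG + HMM result
    print(tokenizer.cut_by_max_match_bidirectional("王小明在北京的清华大学读书"))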
class MaxMatchBidirectionalTokenizer(BaseDictionaryBasedTokenizer):
    def __init__(self, *args, **kwargs):
        super(MaxMatchBidirectionalTokenizer, self).__init__(*args, **kwargs)

        self.forward_tokenizer = None
        self.backward_tokenizer = None

    def do_train(self):
        super(MaxMatchBidirectionalTokenizer, self).do_train()

        dict_data = TrieAlgorithm(raw_dict_data=self.raw_dict_data)
        reverse_dict_data = TrieAlgorithm(raw_dict_data=self.raw_dict_data,
                                          reverse=True)

        self.forward_tokenizer = MaxMatchForwardTokenizer(dict_data=dict_data)
        self.backward_tokenizer = MaxMatchBackwardTokenizer(
            dict_data=reverse_dict_data)

    def load_model(self):
        super(MaxMatchBidirectionalTokenizer, self).load_model()

        self.forward_tokenizer = MaxMatchForwardTokenizer(self.model_dir)
        self.forward_tokenizer.load_model()

        self.backward_tokenizer = MaxMatchBackwardTokenizer(self.model_dir)
        self.backward_tokenizer.load_model()

    def segment(self, message):
        forward_token = self.forward_tokenizer.segment(message)
        backward_token = self.backward_tokenizer.segment(message)

        token_result = [forward_token, backward_token]

        # Each comparison is True when the forward result wins on that
        # criterion: fewer tokens, coarser granularity, lower length variability.
        token_count = operator.le(*map(self.compute_token_count, token_result))
        token_granularity = operator.ge(
            *map(self.compute_token_granularity, token_result))
        token_len_variability = operator.le(
            *map(self.compute_token_len_variability, token_result))

        # majority vote: keep the forward result if it wins at least two of three
        if token_count + token_granularity + token_len_variability >= 2:
            return forward_token
        else:
            return backward_token

    @staticmethod
    def compute_token_granularity(token_list):
        return sum(map(lambda x: len(x), token_list)) / len(token_list)

    @staticmethod
    def compute_token_oov_rate(token_list):
        # FIXME: is_oov is not implemented yet
        return sum(map(lambda x: x.is_oov, token_list)) / len(token_list)

    @staticmethod
    def compute_token_count(token_list):
        return len(token_list)

    @staticmethod
    def compute_token_len_variability(token_list):
        mean_length = sum(map(lambda x: len(x), token_list)) / len(token_list)
        return sum(map(lambda x: abs(len(x) - mean_length)**2,
                       token_list)) / len(token_list)

    def get_loader(self):
        return BidirectionalDictionaryBasedLoader

    def assign_from_loader(self, *args, **kwargs):
        self.forward_tokenizer = kwargs['forward_tokenizer']
        self.backward_tokenizer = kwargs['backward_tokenizer']
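# A hedged illustration of the two-out-of-three vote in segment() above, run
# with the class's own static metrics on two made-up segmentations of the same
# sentence. The token lists and helper name are hypothetical; note the metrics
# rely on true division (Python 3, or `from __future__ import division`).
def example_bidirectional_vote():
    forward = ["研究", "生命", "的", "起源"]
    backward = ["研究生", "命", "的", "起源"]

    metrics = [
        ("token count", MaxMatchBidirectionalTokenizer.compute_token_count),
        ("mean token length", MaxMatchBidirectionalTokenizer.compute_token_granularity),
        ("length variability", MaxMatchBidirectionalTokenizer.compute_token_len_variability),
    ]
    for name, metric in metrics:
        print(name, metric(forward), metric(backward))

    # segment() keeps the forward result when at least two of the three
    # comparisons favor it (ties count in its favor): fewer tokens, coarser
    # granularity, lower length variability. Otherwise it returns the
    # backward result.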