Example #1
def test_segment(input_text):
    tokenizer = MaxMatchForwardTokenizer()
    tokenizer.load_model()

    result = tokenizer.segment(input_text)

    pytest.helpers.assert_token_equals(result, input_text)
Example #2
    def load_model(self):
        super(MaxMatchBidirectionalTokenizer, self).load_model()

        self.forward_tokenizer = MaxMatchForwardTokenizer(self.model_dir)
        self.forward_tokenizer.load_model()

        self.backward_tokenizer = MaxMatchBackwardTokenizer(self.model_dir)
        self.backward_tokenizer.load_model()
Example #3
def test_persist(tmpdir):
    temp_path = tmpdir.mkdir("dag")
    temp_path_str = str(temp_path)

    tokenizer = MaxMatchForwardTokenizer()
    tokenizer.train_one_line(["我", "是", "中国人"])
    tokenizer.train_one_line(["你", "打", "人"])
    tokenizer.do_train()
    tokenizer.persist_to_dir(temp_path_str)

    assert len(temp_path.listdir()) == 1
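The persisted directory can presumably be read back the same way Example #2 loads its child tokenizers. A minimal round-trip check that could continue inside test_persist(), assuming persist_to_dir() writes the dictionary in the format load_model() expects:

    # Round-trip sketch (assumption: a tokenizer pointed at the persisted
    # directory can restore the dictionary via load_model(), mirroring the
    # usage in Example #2).
    restored_tokenizer = MaxMatchForwardTokenizer(temp_path_str)
    restored_tokenizer.load_model()

    # Forward maximum matching over the trained dictionary should recover the
    # words of the first training line.
    pytest.helpers.assert_token_equals(restored_tokenizer.segment("我是中国人"),
                                       "我是中国人")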
Example #4
    def do_train(self):
        super(MaxMatchBidirectionalTokenizer, self).do_train()

        dict_data = TrieAlgorithm(raw_dict_data=self.raw_dict_data)

        reverse_dict_data = TrieAlgorithm(raw_dict_data=self.raw_dict_data,
                                          reverse=True)

        self.forward_tokenizer = MaxMatchForwardTokenizer(dict_data=dict_data)

        self.backward_tokenizer = MaxMatchBackwardTokenizer(
            dict_data=reverse_dict_data)
Example #5
def test_train(input_text):
    tokenizer = MaxMatchForwardTokenizer()
    tokenizer.train_one_line(["我", "是", "中国人"])
    tokenizer.train_one_line(["你", "打", "人"])
    tokenizer.do_train()

    result = tokenizer.segment(input_text)

    pytest.helpers.assert_token_equals(result, input_text)
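The same training calls should also work for the bidirectional tokenizer from Example #4, since train_one_line() and do_train() come from the shared dictionary-based base class. A minimal sketch under that assumption:

# Training sketch (assumption: MaxMatchBidirectionalTokenizer inherits
# train_one_line() from BaseDictionaryBasedTokenizer, as used above).
bidirectional_tokenizer = MaxMatchBidirectionalTokenizer()
bidirectional_tokenizer.train_one_line(["我", "是", "中国人"])
bidirectional_tokenizer.train_one_line(["你", "打", "人"])
bidirectional_tokenizer.do_train()

print(bidirectional_tokenizer.segment("你打人"))  # expected tokens: 你 / 打 / 人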
Example #6
    def from_disk(self, model_path, tokenizer_list, *args, **kwargs):
        # type: (str, List[BaseTokenizer]) -> None

        backward_tokenizer = MaxMatchBackwardTokenizer()
        forward_tokenizer = MaxMatchForwardTokenizer()

        backward_loader = BackwardDictionaryBasedLoader.instance()
        forward_loader = ForwardDictionaryBasedLoader.instance()

        backward_loader.from_disk(None, [backward_tokenizer])
        forward_loader.from_disk(None, [forward_tokenizer])

        for tokenizer in tokenizer_list:
            tokenizer.assign_from_loader(backward_tokenizer=backward_tokenizer,
                                         forward_tokenizer=forward_tokenizer)
Example #7
class TokenizerLoader(object):
    Defaults = BaseDefaults

    factories = {
        'max_match_forward_tokenizer':
        lambda nlp, **cfg: MaxMatchForwardTokenizer(**cfg),
        'max_match_backward_tokenizer':
        lambda nlp, **cfg: MaxMatchBackwardTokenizer(**cfg),
        'max_match_bidirectional_tokenizer':
        lambda nlp, **cfg: MaxMatchBidirectionalTokenizer(**cfg),
        'dag_tokenizer':
        lambda nlp, **cfg: DAGTokenizer(**cfg),
        'hmm_tokenizer':
        lambda nlp, **cfg: HMMTokenizer(**cfg),
        'crf_tokenizer':
        lambda nlp, **cfg: CRFTokenizer(**cfg)
    }

    def __init__(self, meta=None, **kwargs):
        self.meta = {} if meta is None else meta
        self.tokenizers = {}

    def create_tokenizer(self, name, config=None):
        """Create a pipeline component from a factory.

        name (unicode): Factory name to look up in `Language.factories`.
        config (dict): Configuration parameters to initialise component.
        RETURNS (callable): Pipeline component.
        """
        config = config if config is not None else {}
        if name not in self.factories:
            raise KeyError(Errors.E002.format(name=name))
        factory = self.factories[name]
        return factory(self, **config)

    def add_tokenizer(self, component, name=None):
        if not isinstance(component, BaseTokenizer):
            msg = Errors.E003.format(component=repr(component), name=name)
            if isinstance(component,
                          basestring_) and component in self.factories:
                msg += Errors.E004.format(component=component)
            raise ValueError(msg)
        if name is None:
            if hasattr(component, 'name'):
                name = component.name
            elif hasattr(component, '__name__'):
                name = component.__name__
            elif (hasattr(component, '__class__')
                  and hasattr(component.__class__, '__name__')):
                name = component.__class__.__name__
            else:
                name = repr(component)
        if name in self.tokenizers:
            raise ValueError(
                Errors.E007.format(name=name, opts=self.tokenizers.keys()))

        self.tokenizers[name] = component

    def from_disk(self, path, disable=tuple()):
        path = util.ensure_path(path)
        deserializers = OrderedDict()
        loader_name_to_tokenizer = defaultdict(list)
        loader_name_to_class = dict()
        loader_name_to_instance = dict()
        for name, tokenizer in self.tokenizers.items():
            if name in disable:
                continue

            # TODO: why using this in spacy
            # if not hasattr(tokenizer, 'to_disk'):
            #     continue

            loader_class = tokenizer.get_loader()
            loader_name = loader_class.get_name()
            loader_name_to_tokenizer[loader_name].append(tokenizer)

            if name not in loader_name_to_class:
                loader_name_to_class[loader_name] = loader_class

        for loader_name, loader_class in loader_name_to_class.items():
            loader_config = self.meta.get('loader_config',
                                          {}).get(loader_name, {})
            loader_name_to_instance[loader_name] = loader_class.instance(
                **loader_config)

        for loader_name, tokenizer_list in loader_name_to_tokenizer.items():
            loader_instance = loader_name_to_instance[loader_name]

            # if hasattr(loader_instance, 'skip_load_from_disk'):
            #     continue

            # Map the loader name to a (deserializer, model_dir) pair; the
            # lambda's default arguments bind the current loop variables.
            deserializers[loader_name] = (
                lambda p, loader=loader_instance, tokenizers=tokenizer_list:
                loader.from_disk(p, tokenizers),
                loader_instance.get_model_dir(),
            )

        exclude = {p: False for p in disable}
        util.from_disk(path, deserializers, exclude)
        return self

    def get_tokenizer(self):
        def assemble_max_match_bidirectional_tokenizer(forward_tokenizer,
                                                       backward_tokenizer):
            if forward_tokenizer and backward_tokenizer:
                bidirectional_tokenizer = MaxMatchBidirectionalTokenizer()
                bidirectional_tokenizer.forward_tokenizer = forward_tokenizer
                bidirectional_tokenizer.backward_tokenizer = backward_tokenizer

                return bidirectional_tokenizer

            return None

        forward_tokenizer = self.tokenizers.get('max_match_forward_tokenizer')
        backward_tokenizer = self.tokenizers.get(
            'max_match_backward_tokenizer')

        tokenizer = Tokenizer()
        tokenizer.max_match_forward_tokenizer = forward_tokenizer
        tokenizer.max_match_backward_tokenizer = backward_tokenizer
        tokenizer.max_match_bidirectional_tokenizer = assemble_max_match_bidirectional_tokenizer(
            forward_tokenizer, backward_tokenizer)
        tokenizer.hmm_tokenizer = self.tokenizers.get('hmm_tokenizer')
        tokenizer.dag_tokenizer = self.tokenizers.get('dag_tokenizer')
        tokenizer.crf_tokenizer = self.tokenizers.get('crf_tokenizer')

        return tokenizer
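One possible way to wire the loader together, sketched under the assumption that a suitable model directory exists (the path below is only a placeholder):

# Usage sketch for TokenizerLoader; '/path/to/model_dir' is hypothetical.
loader = TokenizerLoader()

# Build tokenizers from the factory table and register them by name.
for factory_name in ('max_match_forward_tokenizer',
                     'max_match_backward_tokenizer',
                     'dag_tokenizer'):
    loader.add_tokenizer(loader.create_tokenizer(factory_name),
                         name=factory_name)

# Restore their models from disk, then assemble the user-facing facade.
loader.from_disk('/path/to/model_dir')
tokenizer = loader.get_tokenizer()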
Example #8
    def init_max_match_forward_tokenizer(self):
        if self.max_match_forward_tokenizer is None:
            self.max_match_forward_tokenizer = MaxMatchForwardTokenizer()
            self.max_match_forward_tokenizer.load_model()
Example #9
class Tokenizer(object):
    def __init__(self, model_dir=None):
        if model_dir is None:
            model_dir = default_model_dir

        self.model_dir = model_dir

        self.dag_tokenizer = None  # type: DAGTokenizer
        self.hmm_tokenizer = None  # type: HMMTokenizer
        self.max_match_forward_tokenizer = None  # type: MaxMatchForwardTokenizer
        self.max_match_backward_tokenizer = None  # type: MaxMatchBackwardTokenizer
        self.max_match_bidirectional_tokenizer = None  # type: MaxMatchBidirectionalTokenizer
        self.crf_tokenizer = None  # type: CRFTokenizer

    def init_dag_tokenizer(self):
        if self.dag_tokenizer is None:
            self.dag_tokenizer = DAGTokenizer(self.model_dir)
            self.dag_tokenizer.load_model()

    def cut_by_DAG(self, message):
        self.init_dag_tokenizer()
        return self.dag_tokenizer.segment(message)

    def init_hmm_tokenizer(self):
        if self.hmm_tokenizer is None:
            self.hmm_tokenizer = HMMTokenizer(self.model_dir)
            self.hmm_tokenizer.load_model()

    def cut_by_HMM(self, message):
        self.init_hmm_tokenizer()
        return self.hmm_tokenizer.segment(message)

    def cut_by_joint_model(self, message):
        solutions = [self.cut_by_DAG(message), self.cut_by_HMM(message)]
        merge_solutions = MergeSolutions()
        best_solution = merge_solutions.merge(solutions)

        return best_solution

    cut = cut_by_DAG

    def init_max_match_forward_tokenizer(self):
        if self.max_match_forward_tokenizer is None:
            self.max_match_forward_tokenizer = MaxMatchForwardTokenizer()
            self.max_match_forward_tokenizer.load_model()

    def cut_by_max_match_forward(self, message):
        self.init_max_match_forward_tokenizer()
        return self.max_match_forward_tokenizer.segment(message)

    def init_max_match_backward_tokenizer(self):
        if self.max_match_backward_tokenizer is None:
            self.max_match_backward_tokenizer = MaxMatchBackwardTokenizer()
            self.max_match_backward_tokenizer.load_model()

    def cut_by_max_match_backward(self, message):
        self.init_max_match_backward_tokenizer()
        return self.max_match_backward_tokenizer.segment(message)

    def init_max_match_bidirectional_tokenizer(self):
        if self.max_match_bidirectional_tokenizer is None:
            self.max_match_bidirectional_tokenizer = MaxMatchBidirectionalTokenizer(
            )
            self.max_match_bidirectional_tokenizer.load_model()

    def cut_by_max_match_bidirectional(self, message):
        self.init_max_match_bidirectional_tokenizer()
        return self.max_match_bidirectional_tokenizer.segment(message)

    def init_crf_tokenizer(self):
        if self.crf_tokenizer is None:
            self.crf_tokenizer = CRFTokenizer()
            self.crf_tokenizer.load_model()

    def cut_by_CRF(self, message):
        self.init_crf_tokenizer()
        return self.crf_tokenizer.segment(message)

    def load_custom_dict(self, dict_file):
        # TODO: not implemented yet
        pass

    def add_word(self, word, freq=None):
        # TODO: not implemented yet
        pass

    def del_word(self, word):
        # TODO: not implemented yet
        pass

    def load_user_dict(self, dict_file):
        self.init_dag_tokenizer()
        return self.dag_tokenizer.dict_data.load_user_dict(dict_file)

    @property
    def mini_log_freq(self):
        # TODO: not implemented yet
        pass

    @property
    def average_log_freq(self):
        # TODO: not implemented yet
        pass
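A minimal usage sketch of this facade; each cut_by_* method lazily builds and loads the tokenizer it needs on first use, and the sample sentence and outputs are illustrative only:

# Usage sketch for the Tokenizer facade; outputs depend on the loaded model.
tokenizer = Tokenizer()

print(tokenizer.cut("我是中国人"))                 # DAG-based (cut is an alias of cut_by_DAG)
print(tokenizer.cut_by_HMM("我是中国人"))          # HMM-based
print(tokenizer.cut_by_max_match_bidirectional("我是中国人"))
print(tokenizer.cut_by_joint_model("我是中国人"))  # merged DAG + HMM solution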
Example #10
class MaxMatchBidirectionalTokenizer(BaseDictionaryBasedTokenizer):
    def __init__(self, *args, **kwargs):
        super(MaxMatchBidirectionalTokenizer, self).__init__(*args, **kwargs)

        self.forward_tokenizer = None
        self.backward_tokenizer = None

    def do_train(self):
        super(MaxMatchBidirectionalTokenizer, self).do_train()

        dict_data = TrieAlgorithm(raw_dict_data=self.raw_dict_data)

        reverse_dict_data = TrieAlgorithm(raw_dict_data=self.raw_dict_data,
                                          reverse=True)

        self.forward_tokenizer = MaxMatchForwardTokenizer(dict_data=dict_data)

        self.backward_tokenizer = MaxMatchBackwardTokenizer(
            dict_data=reverse_dict_data)

    def load_model(self):
        super(MaxMatchBidirectionalTokenizer, self).load_model()

        self.forward_tokenizer = MaxMatchForwardTokenizer(self.model_dir)
        self.forward_tokenizer.load_model()

        self.backward_tokenizer = MaxMatchBackwardTokenizer(self.model_dir)
        self.backward_tokenizer.load_model()

    def segment(self, message):
        forward_token = self.forward_tokenizer.segment(message)
        backward_token = self.backward_tokenizer.segment(message)

        token_result = [forward_token, backward_token]

        # Each criterion casts one boolean vote for the forward result: no
        # more tokens, no finer granularity (average token length) and no
        # higher variance of token lengths than the backward result.
        token_count = operator.le(*map(self.compute_token_count, token_result))

        token_granularity = operator.ge(
            *map(self.compute_token_granularity, token_result))

        token_len_variability = operator.le(
            *map(self.compute_token_len_variability, token_result))

        # The forward result wins when it gets at least two of the three votes.
        if token_count + token_granularity + token_len_variability >= 2:
            return forward_token
        else:
            return backward_token

    @staticmethod
    def compute_token_granularity(token_list):
        return sum(map(lambda x: len(x), token_list)) / len(token_list)

    @staticmethod
    def compute_token_oov_rate(token_list):
        # FIXME: tokens do not have an is_oov attribute yet
        return sum(map(lambda x: x.is_oov, token_list)) / len(token_list)

    @staticmethod
    def compute_token_count(token_list):
        return len(token_list)

    @staticmethod
    def compute_token_len_variability(token_list):
        mean_length = sum(map(lambda x: len(x), token_list)) / len(token_list)
        return sum(map(lambda x: abs(len(x) - mean_length)**2,
                       token_list)) / len(token_list)

    def get_loader(self):
        return BidirectionalDictionaryBasedLoader

    def assign_from_loader(self, *args, **kwargs):
        self.forward_tokenizer = kwargs['forward_tokenizer']
        self.backward_tokenizer = kwargs['backward_tokenizer']
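The 2-of-3 vote in segment() can be traced by hand; the two token lists below are hypothetical forward/backward results for an ambiguous sentence, not the output of a real dictionary:

# Illustration of the voting rule in segment(); both candidate token lists
# are hypothetical.
import operator

forward_token = ["研究", "生命", "的", "起源"]
backward_token = ["研究生", "命", "的", "起源"]
token_result = [forward_token, backward_token]

cls = MaxMatchBidirectionalTokenizer
token_count = operator.le(*map(cls.compute_token_count, token_result))
token_granularity = operator.ge(*map(cls.compute_token_granularity, token_result))
token_len_variability = operator.le(*map(cls.compute_token_len_variability, token_result))

# The forward candidate has no more tokens, no shorter average token length
# and no higher length variance, so it collects all three votes and is kept.
print(forward_token
      if token_count + token_granularity + token_len_variability >= 2
      else backward_token)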