Example #1
# NOTE: imports assumed by this snippet; module paths may differ between
# pkuseg versions, and Preprocesser/Postprocesser are expected to be defined
# elsewhere in the same module.
import os
from typing import List

import pkuseg.inference as _inf
from pkuseg.config import config
from pkuseg.download import download_model
from pkuseg.feature_extractor import FeatureExtractor
from pkuseg.model import Model
from pkuseg.postag import Postag


class pkuseg:
    def __init__(self,
                 model_name="default",
                 user_dict="default",
                 postag=False):
        """初始化函数,加载模型及用户词典"""
        # print("loading model")
        # config = Config()
        # self.config = config
        self.postag = postag
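        # Resolve the model directory: "default" uses the model bundled with
        # the package, a known name in config.available_models is downloaded
        # into config.pkuseg_home, and anything else is treated as a path to
        # a local model directory.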
        if model_name in ["default"]:
            config.modelDir = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "models",
                model_name,
            )
        elif model_name in config.available_models:
            config.modelDir = os.path.join(
                config.pkuseg_home,
                model_name,
            )
            download_model(config.model_urls[model_name], config.pkuseg_home,
                           config.model_hash[model_name])
        else:
            config.modelDir = model_name
        # config.fModel = os.path.join(config.modelDir, "model.txt")
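        # Resolve the user dictionary: None disables it entirely, a name in
        # config.available_models loads no extra user file, and any other
        # value is treated as the path to a user-supplied dictionary file.
        # Packaged dictionaries (other_names) are applied in post-processing.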
        if user_dict is None:
            file_name = None
            other_names = None
        else:
            if user_dict not in config.available_models:
                file_name = user_dict
            else:
                file_name = None
            if model_name in config.models_with_dict:
                other_name = os.path.join(
                    config.pkuseg_home,
                    model_name,
                    model_name + "_dict.pkl",
                )
                default_name = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "dicts",
                    "default.pkl",
                )
                other_names = [other_name, default_name]
            else:
                default_name = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "dicts",
                    "default.pkl",
                )
                other_names = [default_name]

        self.preprocesser = Preprocesser(file_name)
        # self.preprocesser = Preprocesser([])
        self.postprocesser = Postprocesser(None, other_names)

        self.feature_extractor = FeatureExtractor.load()
        self.model = Model.load()

        self.idx_to_tag = {
            idx: tag
            for tag, idx in self.feature_extractor.tag_to_idx.items()
        }

        self.n_feature = len(self.feature_extractor.feature_to_idx)
        self.n_tag = len(self.feature_extractor.tag_to_idx)

        if postag:
            # Download and load the part-of-speech tagging model; the checksum
            # should be the one registered for the "postag" model itself.
            download_model(config.model_urls["postag"], config.pkuseg_home,
                           config.model_hash["postag"])
            postag_dir = os.path.join(
                config.pkuseg_home,
                "postag",
            )
            self.tagger = Postag(postag_dir)

        # print("finish")

    def _cut(self, text):
        """
        直接对文本分词
        """

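        # Normalize the text, then collect the node-feature indices for each
        # character position.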
        examples = list(self.feature_extractor.normalize_text(text))
        length = len(examples)

        all_feature = []  # type: List[List[int]]
        for idx in range(length):
            node_feature_idx = self.feature_extractor.get_node_features_idx(
                idx, examples)
            # node_feature = self.feature_extractor.get_node_features(
            #     idx, examples
            # )

            # node_feature_idx = []
            # for feature in node_feature:
            #     feature_idx = self.feature_extractor.feature_to_idx.get(feature)
            #     if feature_idx is not None:
            #         node_feature_idx.append(feature_idx)
            # if not node_feature_idx:
            #     node_feature_idx.append(0)

            all_feature.append(node_feature_idx)

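        # Viterbi-decode the most likely tag sequence under the loaded model.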
        _, tags = _inf.decodeViterbi_fast(all_feature, self.model)

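        # Rebuild words from the predicted tags: a tag containing "B" starts
        # a new word, any other tag extends the current one.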
        words = []
        current_word = None
        is_start = True
        for tag, char in zip(tags, text):
            if is_start:
                current_word = char
                is_start = False
            elif "B" in self.idx_to_tag[tag]:
                words.append(current_word)
                current_word = char
            else:
                current_word += char
        if current_word:
            words.append(current_word)

        return words

    def cut(self, txt):
        """分词,结果返回一个list"""

        txt = txt.strip()

        ret = []

        if not txt:
            return ret

        imary = txt.split()  # split on whitespace into fragments

        # segment each fragment
        for w0 in imary:
            if not w0:
                continue

            # split further into pieces according to the user dictionary
            lst, isword = self.preprocesser.solve(w0)

            for w, isw in zip(lst, isword):
                if isw:
                    ret.append(w)
                    continue

                output = self._cut(w)
                ret.extend(self.postprocesser(output))

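        # Optionally attach part-of-speech tags to the segmented words.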
        if self.postag:
            tags = self.tagger.tag(ret)
            ret = list(zip(ret, tags))
        return ret
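
A minimal usage sketch for the class above, assuming it is exposed as pkuseg.pkuseg when the package is installed; the sample sentence is illustrative, postag=True downloads the part-of-speech model on first use, and the paths in the last example are hypothetical:

import pkuseg

# Plain segmentation with the bundled default model and default user dictionary.
seg = pkuseg.pkuseg()
print(seg.cut("我爱北京天安门"))  # a list of words

# Segmentation plus part-of-speech tagging: cut() then returns (word, tag) pairs.
seg_pos = pkuseg.pkuseg(postag=True)
print(seg_pos.cut("我爱北京天安门"))  # a list of (word, tag) tuples

# Any other string passed as model_name is treated as a local model directory,
# and any other user_dict value as a path to a user dictionary file
# (hypothetical paths):
# seg_local = pkuseg.pkuseg(model_name="./my_model_dir", user_dict="./my_dict.txt")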