Example #1
from hanlp.common.trie import Trie


def split_sents(text: str, trie: Trie):
    """Split text around the longest dictionary matches found by the trie.

    Returns the uncovered text spans, their start offsets, and the matches
    as (word, value, start, end) tuples.
    """
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for word, value, start, end in words:
        if pre_start != start:
            sents.append(text[pre_start:start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words
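
A brief usage sketch; the dictionary entry and the sample sentence are made up for illustration:

trie = Trie({'自然语言处理': 'nlp'})
sents, offsets, words = split_sents('我爱自然语言处理技术', trie)
# Expected: sents   -> spans not covered by a match, ['我爱', '技术']
#           offsets -> start offset of each such span, [0, 8]
#           words   -> longest matches as (word, value, start, end) tuples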
Example #2
    def split_sents(text: str, trie: Trie):
        words = trie.parse_longest(text)

        # https://github.com/hankcs/HanLP/blob/master/tests/demo/zh/demo_cws_trie.py
        # A tweak to the official custom-dictionary demo to handle the following issue:
        # '千米/时' should be kept as one complete match instead of also splitting out
        # '米/时' (see the sketch after this example)
        # Record the (start, end) span of every match nested inside a longer match
        # that shares a start or end offset, so only the longest match survives.
        keys_rm_list = []
        for i, key_i in enumerate(words):
            for j, key_j in enumerate(words):
                if (i != j) and (key_i[3]
                                 == key_j[3]) and (key_i[2] < key_j[2]):
                    keys_rm_list.append((key_j[2], key_j[3]))
                elif (i != j) and (key_i[3]
                                   == key_j[3]) and (key_i[2] > key_j[2]):
                    keys_rm_list.append((key_i[2], key_i[3]))
                elif (i != j) and (key_i[2]
                                   == key_j[2]) and (key_i[3] < key_j[3]):
                    keys_rm_list.append((key_i[2], key_i[3]))
                elif (i != j) and (key_i[2]
                                   == key_j[2]) and (key_i[3] > key_j[3]):
                    keys_rm_list.append((key_j[2], key_j[3]))

        words = list(filter(lambda x: (x[2], x[3]) not in keys_rm_list, words))

        sents = []
        pre_start = 0
        offsets = []
        for word, value, start, end in words:
            if pre_start != start:
                sents.append(text[pre_start:start])
                offsets.append(pre_start)
            pre_start = end
        if pre_start != len(text):
            sents.append(text[pre_start:])
            offsets.append(pre_start)
        return sents, offsets, words
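
A minimal sketch of what the overlap filter above does; the match tuples are built by hand for illustration and are not actual parse_longest output:

# Hand-built matches: '米/时' is nested inside '千米/时' and shares its end offset.
words = [('千米/时', ' ', 4, 8), ('米/时', ' ', 5, 8)]
keys_rm_list = [(5, 8)]  # the shorter span sharing the end offset is marked for removal
words = list(filter(lambda x: (x[2], x[3]) not in keys_rm_list, words))
# words is now [('千米/时', ' ', 4, 8)]: only the longest match survives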
Example #3
import hanlp
from hanlp.common.trie import Trie
from nltk.corpus import stopwords


class hanLPTokenizer:
    def __init__(self,
                 hanlp_tokenizer,
                 hanlp_tagger,
                 user_dict_path,
                 stop_words_path,
                 consider_tags_path,
                 ignore_tag='-'):
        self.hanlp_tokenizer = hanlp_tokenizer
        self.tagger = hanlp_tagger
        self.ignore_tag = ignore_tag
        self.stop_words = self.load_stop_words(stop_words_path)
        self.considered_tags = self.load_consider_tags(consider_tags_path)
        self.user_dict = self.load_user_dict(user_dict_path)
        self.trie = Trie()
        self.trie.update(self.user_dict)
        self.tokenizer = hanlp.pipeline() \
            .append(self.split_sentences, output_key=('parts', 'offsets', 'words')) \
            .append(self.hanlp_tokenizer, input_key='parts', output_key='tokens') \
            .append(self.merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')

    def split_sentences(self, text: str):
        words = self.trie.parse_longest(text)
        sentences = []
        pre_start = 0
        offsets = []
        for word, value, start, end in words:
            if pre_start != start:
                sentences.append(text[pre_start:start])
                offsets.append(pre_start)
            pre_start = end
        if pre_start != len(text):
            sentences.append(text[pre_start:])
            offsets.append(pre_start)
        return sentences, offsets, words

    @staticmethod
    def merge_parts(parts, offsets, words):
        # Interleave the tokens of each non-dictionary span with the dictionary
        # matches, restoring the original text order by sorting on start offset.
        items = [(i, p) for (i, p) in zip(offsets, parts)]
        items += [(start, [word]) for (word, value, start, end) in words]
        # In case you need the tag, use the following line instead
        # items += [(start, [(word, value)]) for (word, value, start, end) in words]
        return [each for x in sorted(items) for each in x[1]]

    def tokenize(self, text):
        """

        :param text: str
        :return:
        """
        return self.tokenizer(text)['merged']

    def tag(self, tokens):
        """

        :param tokens: list
        :return:
        """
        return self.tagger(tokens)

    def tag_stop_words(self, tokens, tags):
        new_tags = []
        for i in range(len(tokens)):
            if tokens[i] in self.stop_words:
                new_tags.append(self.ignore_tag)
            else:
                new_tags.append(tags[i])
        return new_tags

    def tag_unconsidered_tags(self, tags):
        new_tags = []
        for tag in tags:
            if tag.lower() in self.considered_tags:
                new_tags.append(tag)
            else:
                new_tags.append(self.ignore_tag)
        return new_tags

    def tokenize_filter(self, text):
        tokens = self.tokenize(text)
        tags = self.tag(tokens)
        tags = self.tag_stop_words(tokens, tags)  # remove stop words
        tags = self.tag_unconsidered_tags(tags)  # tag filter
        tagged_tokens = []
        for i in range(len(tags)):
            tagged_tokens.append((tokens[i], tags[i]))
        return tagged_tokens

    @staticmethod
    def load_txt_data(path, mode='utf-8-sig', origin=False):
        """
        This func is used to reading txt file
        :param origin:
        :param path: path where file stored
        :param mode:
        :type path: str
        :return: string lines in file in a list
        :rtype: list
        """
        if type(path) != str:
            raise TypeError
        res = []

        file = open(path, 'rb')
        lines = file.read().decode(mode, errors='ignore')
        for line in lines.split('\n'):
            line = line.strip()
            if origin:
                res.append(line)
            else:
                if line:
                    res.append(line)
        file.close()
        return res

    def load_user_dict(self, path):
        raw = self.load_txt_data(path)
        user_word_dict = {}
        for i in range(len(raw)):
            word = raw[i].split('\t')[0]
            if word not in user_word_dict:
                user_word_dict[word] = ' '
        return user_word_dict

    def load_stop_words(self, path):
        return set(self.load_txt_data(path) + stopwords.words('english'))

    def load_consider_tags(self, path):
        return set(
            [x.split('\t')[0].lower() for x in self.load_txt_data(path)])
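
A hedged construction sketch; the pretrained model identifiers and the three resource file paths below are assumptions, and NLTK's stopwords corpus must be available since load_stop_words adds stopwords.words('english'):

import hanlp

# Placeholder HanLP 2.x models and resource files; substitute your own.
seg = hanlp.load(hanlp.pretrained.cws.CTB6_CONVSEG)
pos = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN)
tok = hanLPTokenizer(seg, pos,
                     user_dict_path='user_dict.txt',          # one entry per line, word in the first tab-separated column
                     stop_words_path='stop_words.txt',        # one stop word per line
                     consider_tags_path='consider_tags.txt')  # one POS tag per line, tag in the first column
print(tok.tokenize_filter('千米/时是速度单位'))  # -> list of (token, tag) pairs, ignored tokens tagged '-'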
Example #4
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-11 11:08
from hanlp.common.trie import Trie

trie = Trie({'密码', '码'})
print(trie.parse_longest('密码设置'))
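# Expected behavior: with both '密码' and '码' in the trie, the longest match
# '密码' is reported starting at offset 0; each match is a (word, value, start, end)
# tuple, as unpacked in the examples above.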