Example 1
import json

import hanlp
from hanlp.common.trie import Trie


def tokenizer_pku():
    # Load a user dictionary (word -> value) from JSON and build a trie from it.
    with open('dict.json', 'r', encoding='utf-8') as f:
        user_dict = json.load(f)  # renamed from `dict` to avoid shadowing the builtin
    trie = Trie()
    trie.update(user_dict)
    print(type(trie))
    text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
    print(split_sents(text, trie))
    tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')
    tokenizer = hanlp.pipeline() \
        .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
        .append(tokenizer, input_key='parts', output_key='tokens') \
        .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
    print(tokenizer(text))
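    # split_sents and merge_parts are the helper functions shown in Examples 4
    # and 5. The pipeline result is keyed by each step's output_key, so the
    # dictionary-aware token list can also be read directly (hedged addition,
    # mirroring Example 5's tokenize()):
    print(tokenizer(text)['merged'])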
Example 2
def create_trie():
    trie = Trie()
    trie['自然'] = 'nature'
    trie['自然人'] = 'human'
    trie['自然语言'] = 'language'
    trie['自语'] = 'talk to oneself'
    trie['入门'] = 'introduction'
    return trie
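
A small usage sketch for this trie (not part of the original snippet); the four-way tuple unpacking follows the pattern used throughout these examples:

trie = create_trie()
# parse_longest returns (word, value, start, end) tuples for the longest
# dictionary entry matched while scanning the text.
for word, value, start, end in trie.parse_longest('自然语言处理入门'):
    print(word, value, start, end)
# Expected to prefer '自然语言' over the shorter '自然', and also match '入门'.
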
# __init__ of the hanLPTokenizer class, shown in full in Example 5.
def __init__(self,
             hanlp_tokenizer,
             hanlp_tagger,
             user_dict_path,
             stop_words_path,
             consider_tags_path,
             ignore_tag='-'):
    self.hanlp_tokenizer = hanlp_tokenizer
    self.tagger = hanlp_tagger
    self.ignore_tag = ignore_tag
    self.stop_words = self.load_stop_words(stop_words_path)
    self.considered_tags = self.load_consider_tags(consider_tags_path)
    self.user_dict = self.load_user_dict(user_dict_path)
    self.trie = Trie()
    self.trie.update(self.user_dict)
    self.tokenizer = hanlp.pipeline() \
        .append(self.split_sentences, output_key=('parts', 'offsets', 'words')) \
        .append(self.hanlp_tokenizer, input_key='parts', output_key='tokens') \
        .append(self.merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
Example 4
def split_sents(text: str, trie: Trie):
    # Find the longest dictionary matches, then cut the text into the spans
    # *between* those matches so the statistical tokenizer only sees the rest.
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for word, value, start, end in words:
        if pre_start != start:
            sents.append(text[pre_start:start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words
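
A hedged call example for this helper, reusing the trie and text from Example 7 below:

trie = Trie()
trie.update({'自定义词典': 'custom_dict', '聪明人': 'smart'})
text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
parts, offsets, words = split_sents(text, trie)
# `parts` holds the text spans between dictionary hits, `offsets` their start
# positions, and `words` the (word, value, start, end) matches from the trie.
print(parts, offsets, words)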
Example 5
    def split_sents(text: str, trie: Trie):
        words = trie.parse_longest(text)

        # https://github.com/hankcs/HanLP/blob/master/tests/demo/zh/demo_cws_trie.py
        # A tweak to the official custom-dictionary demo to handle the following issue:
        # '千米/时' (km/h) should be kept whole instead of additionally matching '米/时' (m/h).
        # The pairwise comparison below drops any match that shares a start or an end
        # with a longer match, so only the longest, non-nested dictionary hits survive.
        keys_rm_list = []
        for i, key_i in enumerate(words):
            for j, key_j in enumerate(words):
                if (i != j) and (key_i[3]
                                 == key_j[3]) and (key_i[2] < key_j[2]):
                    keys_rm_list.append((key_j[2], key_j[3]))
                elif (i != j) and (key_i[3]
                                   == key_j[3]) and (key_i[2] > key_j[2]):
                    keys_rm_list.append((key_i[2], key_i[3]))
                elif (i != j) and (key_i[2]
                                   == key_j[2]) and (key_i[3] < key_j[3]):
                    keys_rm_list.append((key_i[2], key_i[3]))
                elif (i != j) and (key_i[2]
                                   == key_j[2]) and (key_i[3] > key_j[3]):
                    keys_rm_list.append((key_j[2], key_j[3]))

        words = list(filter(lambda x: (x[2], x[3]) not in keys_rm_list, words))

        sents = []
        pre_start = 0
        offsets = []
        for word, value, start, end in words:
            if pre_start != start:
                sents.append(text[pre_start:start])
                offsets.append(pre_start)
            pre_start = end
        if pre_start != len(text):
            sents.append(text[pre_start:])
            offsets.append(pre_start)
        return sents, offsets, words
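
To see the effect of the pairwise filtering, a small trie with nested entries can be fed through it (a sketch assuming split_sents is accessible at module level; the entries and text are illustrative, '千米/时' being the case named in the comment):

trie = Trie()
trie.update({'千米/时': 'km/h', '米/时': 'm/h', '千米': 'km'})
parts, offsets, words = split_sents('速度为80千米/时', trie)
# Any match sharing a start or an end with a longer match is dropped, so only
# the full '千米/时' span is expected to remain in `words`.
print(words)
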
class hanLPTokenizer:
    def __init__(self,
                 hanlp_tokenizer,
                 hanlp_tagger,
                 user_dict_path,
                 stop_words_path,
                 consider_tags_path,
                 ignore_tag='-'):
        self.hanlp_tokenizer = hanlp_tokenizer
        self.tagger = hanlp_tagger
        self.ignore_tag = ignore_tag
        self.stop_words = self.load_stop_words(stop_words_path)
        self.considered_tags = self.load_consider_tags(consider_tags_path)
        self.user_dict = self.load_user_dict(user_dict_path)
        self.trie = Trie()
        self.trie.update(self.user_dict)
        self.tokenizer = hanlp.pipeline() \
            .append(self.split_sentences, output_key=('parts', 'offsets', 'words')) \
            .append(self.hanlp_tokenizer, input_key='parts', output_key='tokens') \
            .append(self.merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')

    def split_sentences(self, text: str):
        words = self.trie.parse_longest(text)
        sentences = []
        pre_start = 0
        offsets = []
        for word, value, start, end in words:
            if pre_start != start:
                sentences.append(text[pre_start:start])
                offsets.append(pre_start)
            pre_start = end
        if pre_start != len(text):
            sentences.append(text[pre_start:])
            offsets.append(pre_start)
        return sentences, offsets, words

    @staticmethod
    def merge_parts(parts, offsets, words):
        # Interleave the model-segmented parts and the dictionary hits by their
        # character offsets so the final token list follows the original text order.
        items = [(i, p) for (i, p) in zip(offsets, parts)]
        items += [(start, [word]) for (word, value, start, end) in words]
        # In case you need the tag, use the following line instead
        # items += [(start, [(word, value)]) for (word, value, start, end) in words]
        return [each for x in sorted(items) for each in x[1]]

    def tokenize(self, text):
        """

        :param text: str
        :return:
        """
        return self.tokenizer(text)['merged']

    def tag(self, tokens):
        """

        :param tokens: list
        :return:
        """
        return self.tagger(tokens)

    def tag_stop_words(self, tokens, tags):
        new_tags = []
        for i in range(len(tokens)):
            if tokens[i] in self.stop_words:
                new_tags.append(self.ignore_tag)
            else:
                new_tags.append(tags[i])
        return new_tags

    def tag_unconsidered_tags(self, tags):
        new_tags = []
        for tag in tags:
            if tag.lower() in self.considered_tags:
                new_tags.append(tag)
            else:
                new_tags.append(self.ignore_tag)
        return new_tags

    def tokenize_filter(self, text):
        tokens = self.tokenize(text)
        tags = self.tag(tokens)
        tags = self.tag_stop_words(tokens, tags)  # mark stop words with the ignore tag
        tags = self.tag_unconsidered_tags(tags)  # mark tags outside the considered set
        tagged_tokens = []
        for i in range(len(tags)):
            tagged_tokens.append((tokens[i], tags[i]))
        return tagged_tokens

    @staticmethod
    def load_txt_data(path, mode='utf-8-sig', origin=False):
        """
        This func is used to reading txt file
        :param origin:
        :param path: path where file stored
        :param mode:
        :type path: str
        :return: string lines in file in a list
        :rtype: list
        """
        if type(path) != str:
            raise TypeError
        res = []

        file = open(path, 'rb')
        lines = file.read().decode(mode, errors='ignore')
        for line in lines.split('\n'):
            line = line.strip()
            if origin:
                res.append(line)
            else:
                if line:
                    res.append(line)
        file.close()
        return res

    def load_user_dict(self, path):
        raw = self.load_txt_data(path)
        user_word_dict = {}
        for i in range(len(raw)):
            word = raw[i].split('\t')[0]
            if word not in user_word_dict:
                user_word_dict[word] = ' '
        return user_word_dict

    def load_stop_words(self, path):
        return set(self.load_txt_data(path) + stopwords.words('english'))

    def load_consider_tags(self, path):
        return set(
            [x.split('\t')[0].lower() for x in self.load_txt_data(path)])
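
A hedged construction sketch for this class; the model identifiers and file paths below are placeholders, and the class additionally assumes `from nltk.corpus import stopwords` plus the hanlp/Trie imports used elsewhere in these examples:

import hanlp

tok = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')   # segmenter, as in Example 1
tagger = hanlp.load('CTB5_POS_RNN_FASTTEXT_ZH')          # assumed POS model identifier
ht = hanLPTokenizer(tok, tagger,
                    user_dict_path='user_dict.txt',          # "word<TAB>..." per line
                    stop_words_path='stop_words.txt',        # one stop word per line
                    consider_tags_path='consider_tags.txt')  # "tag<TAB>..." per line
print(ht.tokenize_filter('NLP统计模型没有加规则,聪明人知道自己加。'))
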
Example 7
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
from hanlp.common.trie import Trie

import hanlp

tokenizer = hanlp.load('LARGE_ALBERT_BASE')
text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

trie = Trie()
trie.update({'自定义词典': 'custom_dict', '聪明人': 'smart'})


def split_sents(text: str, trie: Trie):
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for word, value, start, end in words:
        if pre_start != start:
            sents.append(text[pre_start:start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words
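
This appears to be the official demo_cws_trie.py demo (see the URL in Example 5), which continues by wiring split_sents into a hanlp.pipeline as in Example 1; a hedged continuation sketch (merge_parts mirrors the method in Example 5):

def merge_parts(parts, offsets, words):
    # Put the model-segmented parts and the dictionary hits back in text order.
    items = [(i, p) for (i, p) in zip(offsets, parts)]
    items += [(start, [word]) for (word, value, start, end) in words]
    return [token for _, part in sorted(items) for token in part]


pipeline = hanlp.pipeline() \
    .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
    .append(tokenizer, input_key='parts', output_key='tokens') \
    .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
print(pipeline(text)['merged'])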

Example 8
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-11 11:08
from hanlp.common.trie import Trie

trie = Trie({'密码', '码'})
print(trie.parse_longest('密码设置'))
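# Hedged expectation: parse_longest prefers the longest entry, so '密码' is
# matched over the nested '码' at position 0; the exact tuple layout of the
# result depends on the HanLP version.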
Example 9
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
from hanlp.common.trie import Trie

import hanlp

tokenizer = hanlp.load('LARGE_ALBERT_BASE')
text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

trie = Trie()
trie.update({'自定义': 'custom', '词典': 'dict', '聪明人': 'smart'})


def split_sents(text: str, trie: Trie):
    words = trie.parse_longest(text)
    sents = []
    pre_start = 0
    offsets = []
    for word, value, start, end in words:
        if pre_start != start:
            sents.append(text[pre_start: start])
            offsets.append(pre_start)
        pre_start = end
    if pre_start != len(text):
        sents.append(text[pre_start:])
        offsets.append(pre_start)
    return sents, offsets, words
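
Compared with Example 7, the dictionary here splits '自定义词典' into '自定义' and '词典', so those stay as two separate tokens in the merged output. A hedged check using the helper above:

parts, offsets, words = split_sents(text, trie)
# `words` is expected to contain separate hits for '自定义', '词典' and '聪明人'.
print(words)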

Example 10
    # pipeline = hanlp.pipeline() \
    #     .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    #     .append(tokenizer, output_key='tokens') \
    #     .append(tagger, output_key='part_of_speech_tags') \
    #     .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies') \
    #     .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies')

    # Add the custom dictionary entries, i.e. num_keys_list + units_list from above.
    custom_dict = {}
    keys_list = num_keys_list + units_list

    keys_list.sort(key=lambda x: len(x), reverse=True)
    for key in keys_list:
        custom_dict[key] = key

    trie = Trie()
    # trie.update({'自定义词典': 'custom_dict', '聪明人': 'smart'})
    trie.update(custom_dict)

    def split_sents(text: str, trie: Trie):
        words = trie.parse_longest(text)

        # https://github.com/hankcs/HanLP/blob/master/tests/demo/zh/demo_cws_trie.py
        # A tweak to the official custom-dictionary demo to handle the following issue:
        # '千米/时' (km/h) should be kept whole instead of additionally matching '米/时' (m/h).
        keys_rm_list = []
        for i, key_i in enumerate(words):
            for j, key_j in enumerate(words):
                if (i != j) and (key_i[3]
                                 == key_j[3]) and (key_i[2] < key_j[2]):
                    keys_rm_list.append((key_j[2], key_j[3]))