Esempio n. 1
0
 def load_user_dict(self, path_user=path_dict_user, type_user="******"):
     """Load a user dictionary file and merge it into the global word-frequency dict.

     :param path_user: str, path of the user dict file, like '/home/user.dict'
     :param type_user: str, file format, one of "json", "txt", "csv"
     :return: None
     :raises RuntimeError: if path_user does not exist
     :raises EOFError: if type_user is not a supported format
     """
     if not os.path.exists(path_user):
         raise RuntimeError("your path_user is not exist!")
     if type_user == "json":
         # json file holds a ready-made {word: freq} mapping
         self.dict_user = load_json(path_user)[0]
         for k, v in self.dict_user.items():
             # merge into the global dict, words_freq
             self.dict_words_freq[k] = self.dict_words_freq.get(k, 0) + v
     elif type_user in ("txt", "csv"):
         # txt lines are "word freq" (space-separated); csv lines are "word,freq".
         sep = " " if type_user == "txt" else ","
         for word_freq in txt_read(path_user):
             wf = word_freq.split(sep)  # a line may or may not carry a frequency
             word = wf[0]
             # BUG FIX: the frequency parsed from the file was kept as a str,
             # which broke the addition below and the sum() afterwards.
             # 132 is the default frequency for entries without one.
             freq = int(wf[1]) if len(wf) == 2 else 132
             # merge into the global dict, words_freq
             self.dict_words_freq[word] = self.dict_words_freq.get(word, 0) + freq
     else:
         raise EOFError
     # keep the running total of all word frequencies in sync
     self.num_words = sum(self.dict_words_freq.values())
Esempio n. 2
0
def evulate_file(path_file):
    """Evaluate word-segmentation quality against a gold-standard corpus file.

    Each line of the file is a sentence whose gold tokens are separated by
    single spaces; the line is re-joined and re-segmented with macropodus,
    then compared token-by-token against the gold segmentation.

    :param path_file: str, path of the corpus, like '/train.txt'
    :return: tuple of float, (precision, recall, f1);
             (0.0, 0.0, 0.0) when nothing was segmented or matched
             (avoids ZeroDivisionError on empty input)
    """
    # read the corpus
    sents = txt_read(path_file)
    # running counters
    count_macropodus = 0  # tokens produced by macropodus
    count_real = 0        # gold tokens
    count_true = 0        # correctly segmented tokens
    count = 0             # sentences processed
    for sent in sents:
        sent_sp = sent.strip()
        res_real = sent_sp.split(' ')        # gold segmentation
        sentence = sent_sp.replace(' ', '')  # raw sentence without boundaries
        res_macropodus = macropodus.cut(sentence)
        print(res_macropodus)
        count += 1
        count_real += len(res_real)
        count_macropodus += len(res_macropodus)
        for cm in res_macropodus:
            if cm in res_real:
                count_true += 1
                res_real.remove(cm)  # consume the matched gold token once
    # BUG FIX: guard the divisions — an empty file (or zero matches) used to
    # raise ZeroDivisionError here.
    if count_macropodus == 0 or count_real == 0 or count_true == 0:
        return 0.0, 0.0, 0.0
    # precision, recall, f1
    precision = count_true / count_macropodus
    recall = count_true / count_real
    f1 = (precision * recall * 2) / (precision + recall)

    return precision, recall, f1
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time    : 2019/12/21 23:11
# @author  : Mo
# @function:

from macropodus.preprocess.tools_common import load_json, save_json
from macropodus.preprocess.tools_common import txt_write, txt_read
import json

# Convert the PKU segmentation corpus into char-level BMES-tagged json records.
pku_training = txt_read("pku_training.utf8")
pku_ = []
# BUG FIX: the output file was opened with open() and never closed — use a
# context manager so it is flushed and closed even on error.
with open("pku_train.json", "w", encoding="utf-8") as file:
    for pku in pku_training:
        pkus = pku.split("  ")  # PKU corpus separates words with two spaces
        label_pkus = ""
        for pku_sig in pkus:
            # BMES tagging: S = single char, B/M/E = begin/middle/end of a word
            len_pku = len(pku_sig)
            if len_pku == 1:
                label_pkus += "S"
            elif len_pku == 2:
                label_pkus += "BE"
            else:
                label_pkus += "B" + "M" * (len_pku - 2) + "E"
        pku_res = {}
        pku_res["question"] = list("".join(pkus))  # characters of the sentence
        pku_res["label"] = list(label_pkus)        # one tag per character
        p_json = json.dumps(pku_res, ensure_ascii=False)
        file.write(p_json + "\n")
Esempio n. 4
0

if __name__ == '__main__':
    # Quick demo plus a micro-benchmark of the DAG-based word segmenter.
    sd = SegDAG()
    sd.add_word(str('知识图谱'))  # register a custom word before segmenting

    # for i in range(50000):
    sd_enum = sd.cut(sentence='apple_pir大漠帝国我再也找不到了')
    print(list(sd_enum))

    # benchmark segmentation throughput
    from macropodus.preprocess.tools_common import txt_read, txt_write
    from macropodus.conf.path_config import path_root
    import time
    # NOTE(review): assumes path_root contains ".../macropodus" so that
    # stripping it yields the project root — confirm against path_config
    path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt"
    sentences = txt_read(path_wordseg_a)

    time_start = time.time()
    count = 0
    # segment the whole test file 10000 times to get a stable timing
    for i in range(10000):
        for sen in sentences:
            # print("original sentence:"+sen)
            count += 1
            res = sd.cut(sen)
            # print(list(res))
    time_end = time.time()
    print(time_end-time_start)  # total elapsed seconds
    print(count/(time_end - time_start))  # sentences segmented per second

# NOTE(review): this loop appears truncated by the snippet extraction — as
# written it prints the prompt ("please input:") forever without ever reading
# input; confirm the missing body against the original script.
while True:
    print("请输入:")
Esempio n. 5
0
# @author  : Mo
# @function:

# Reference POS tag set (upper-cased below for case-insensitive comparison).
tags_res = [
    'm', 'vn', 'v', 'Yg', 'Tg', 'l', 'p', 'nt', 'y', 'Rg', 'e', 'i', 'an', 'q',
    'k', 'nr', 'Ag', 'n', 'vvn', 'd', 'f', 'ad', 'vd', 'z', 'Mg', 'nx', 'a',
    'h', 's', 'u', 'na', 'Bg', 'j', 'w', 'Ng', 'o', 'nz', 'ns', 'b', 'Vg',
    'Dg', 'r', 't', 'c'
]

tags_res = list(map(str.upper, tags_res))

from macropodus.preprocess.tools_common import txt_read

tag_jiagus = txt_read("data/tag_jiagu.txt")
tag_jiebas = txt_read("data/tag_jieba.txt")

# jiagu tag file: first field of each line, separated by an ideographic space
tgu = [line.split("\u3000")[0].strip().upper() for line in tag_jiagus]

# jieba tag file: first field of each line, tab-separated
tga = [line.split("\t")[0].strip().upper() for line in tag_jiebas]

tgus = []