import os

from test_utility import test_data_path


def my_cws_corpus():
    data_root = test_data_path()
    corpus_path = os.path.join(data_root, 'my_cws_corpus.txt')
    if not os.path.isfile(corpus_path):  # write a tiny segmented corpus on first use
        with open(corpus_path, 'w') as out:
            out.write('''商品 和 服务
商品 和服 物美价廉
服务 和 货币''')
    return corpus_path
            # (continuation of the word-lattice builder: fill blank positions so the lattice stays connected)
            wordnet.add(i, Vertex.newPunctuationInstance(sent[i - 1:j - 1]))  # fill the blank rows in [i, j)
            i = j
        else:
            i += len(vertexes[i][-1].realWord)
    return wordnet


def viterbi(wordnet):
    nodes = wordnet.getVertexes()
    # forward pass
    for i in range(0, len(nodes) - 1):
        for node in nodes[i]:
            for to in nodes[i + len(node.realWord)]:
                to.updateFrom(node)  # compute the node distance from the distance formula and maintain the predecessor pointer "from" on the shortest path
    # backward traceback
    path = []  # shortest path
    f = nodes[len(nodes) - 1].getFirst()  # trace back from the end node
    while f:
        path.insert(0, f)
        f = f.getFrom()  # follow the predecessor pointer "from"
    return [v.realWord for v in path]


if __name__ == '__main__':
    corpus_path = my_cws_corpus()
    model_path = os.path.join(test_data_path(), 'my_cws_model')
    train_bigram(corpus_path, model_path)
    load_bigram(model_path)
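# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the same forward-pass /
# shortest-path idea as viterbi() above, on a plain-Python toy lattice, so the
# algorithm can be run without pyhanlp. Instead of predecessor pointers plus a
# traceback, this sketch simply carries the partial word list; the candidate
# words and costs below are made up for demonstration only.
# ---------------------------------------------------------------------------
def toy_viterbi(lattice, length):
    best = {0: (0.0, [])}  # best[i] = (cost, words) of the cheapest path reaching offset i
    for i in range(length):  # forward pass over character offsets
        if i not in best:
            continue
        cost_i, words_i = best[i]
        for word, cost in lattice.get(i, []):
            j = i + len(word)
            candidate = (cost_i + cost, words_i + [word])
            if j not in best or candidate[0] < best[j][0]:
                best[j] = candidate  # keep the cheaper way of reaching offset j
    return best[length][1]


if __name__ == '__main__':
    sent = '商品和服务'
    toy_lattice = {  # candidate words starting at each offset, with made-up costs
        0: [('商品', 1.0), ('商', 2.0)],
        1: [('品', 2.0)],
        2: [('和', 1.0), ('和服', 3.0)],
        3: [('服务', 1.5)],
        4: [('务', 2.0)],
    }
    print(toy_viterbi(toy_lattice, len(sent)))  # ['商品', '和', '服务'] under these costs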
# 《自然语言处理入门》 8.4.3 Organization name recognition based on role tagging
import os
import sys

o_path = os.getcwd()  # current working directory, used as the project root
sys.path.append(o_path)  # add it to the module search path

from pyhanlp import *
from ngram_segment import DijkstraSegment
import pku
from test_utility import test_data_path

EasyDictionary = JClass('com.hankcs.hanlp.corpus.dictionary.EasyDictionary')
NTDictionaryMaker = JClass(
    'com.hankcs.hanlp.corpus.dictionary.NTDictionaryMaker')
Sentence = JClass('com.hankcs.hanlp.corpus.document.sentence.Sentence')

MODEL = test_data_path() + "/nt"


def train(corpus, model):
    dictionary = EasyDictionary.create(HanLP.Config.CoreDictionaryPath)  # core dictionary
    maker = NTDictionaryMaker(dictionary)  # training module for organization-name role tagging
    maker.train(corpus)  # train on the corpus
    maker.saveTxtTo(model)  # save the HMM as txt


def load(model):
    # point the organization (nt) dictionary at the trained model so that
    # enableOrganizationRecognize below actually uses it
    HanLP.Config.OrganizationDictionaryPath = model + ".txt"  # data/test/nt.txt
    HanLP.Config.OrganizationDictionaryTrPath = model + ".tr.txt"  # data/test/nt.tr.txt
    segment = DijkstraSegment().enableOrganizationRecognize(
        True).enableCustomDictionary(False)  # this segmenter is easier to debug
    return segment
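# Usage sketch (illustrative, not part of the original file): train the organization-name
# role-tagging dictionaries and segment a sentence with the resulting recognizer.
# pku.PKU199801_TRAIN is an assumed corpus-path constant from the pku helper module
# imported above; any PKU-format corpus path works in its place.
if __name__ == '__main__':
    train(pku.PKU199801_TRAIN, MODEL)  # assumed corpus constant; adjust to your corpus path
    segment = load(MODEL)
    print(segment.seg('我在上海林原科技有限公司兼职工作'))  # sample sentence containing an organization name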
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-06-08 15:35
# 3.2.2 Microsoft Research Asia (MSR) corpus
import os

from test_utility import ensure_data, test_data_path

sighan05 = ensure_data(
    'icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')      # training vocabulary
msr_train = os.path.join(sighan05, 'training', 'msr_training.utf8')       # segmented training set
msr_model = os.path.join(test_data_path(), 'msr_cws')                     # where the trained model is saved
msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')             # raw test set
msr_output = os.path.join(sighan05, 'testing', 'msr_bigram_output.txt')   # segmentation output
msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')           # gold-standard test segmentation
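# Quick sanity check (illustrative, not part of the original file): confirm that
# ensure_data() fetched the SIGHAN-2005 bakeoff files these constants point at.
if __name__ == '__main__':
    for name, path in [('dict', msr_dict), ('train', msr_train),
                       ('test', msr_test), ('gold', msr_gold)]:
        print('%-5s %s exists=%s' % (name, path, os.path.isfile(path)))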
# David 2020.6.18 test fails: "Ambiguous overloads found for constructor"
import sys
import os

o_path = os.getcwd()  # current working directory, used as the project root
sys.path.append(o_path)  # add it to the module search path

from pyhanlp import *
from book.ch03.E_331_demo_corpus_loader import my_cws_corpus
from test_utility import test_data_path

CRFLexicalAnalyzer = JClass('com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer')

TXT_CORPUS_PATH = my_cws_corpus()
TSV_CORPUS_PATH = TXT_CORPUS_PATH + ".tsv"
TEMPLATE_PATH = test_data_path() + "/cws-template.txt"
CRF_MODEL_PATH = test_data_path() + "/crf-cws-model"
CRF_MODEL_TXT_PATH = test_data_path() + "/crf-cws-model.txt"


def train_or_load(corpus_txt_path=TXT_CORPUS_PATH, model_txt_path=CRF_MODEL_TXT_PATH):
    if os.path.isfile(model_txt_path):  # already trained: load directly
        segmenter = CRFLexicalAnalyzer(model_txt_path)
        return segmenter
    else:
        segmenter = CRFLexicalAnalyzer(None)  # create a blank analyzer
        segmenter.convertCorpus(corpus_txt_path, TSV_CORPUS_PATH)  # convert the corpus to TSV
        segmenter.dumpTemplate(TEMPLATE_PATH)  # dump the feature template
        # hand the actual training over to CRF++
        print("Corpus converted to %s , feature template dumped to %s" % (TSV_CORPUS_PATH, TEMPLATE_PATH))
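# Usage sketch (illustrative, not part of the original file): the remainder of
# train_or_load() (invoking CRF++ and loading its output) is omitted in this excerpt,
# so the sketch only covers the case where a trained model text file already exists.
# CRFLexicalAnalyzer.analyze() is assumed to be available as in standard pyhanlp.
if __name__ == '__main__':
    segmenter = train_or_load()
    if segmenter is not None:  # None means the corpus was only converted, not trained yet
        print(segmenter.analyze('商品和服务'))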