def my_cws_corpus():
    """Create (once) and return the path of a tiny word-segmentation corpus.

    The corpus holds three whitespace-tokenized Chinese sentences used by the
    book's segmentation demos. If the file already exists it is reused.

    Returns:
        str: path to ``my_cws_corpus.txt`` under the test data root.
    """
    data_root = test_data_path()
    corpus_path = os.path.join(data_root, 'my_cws_corpus.txt')
    if not os.path.isfile(corpus_path):
        # Explicit encoding: the corpus is Chinese text, and the platform's
        # default locale encoding (e.g. cp936/cp1252) may fail or corrupt it.
        with open(corpus_path, 'w', encoding='utf-8') as out:
            out.write('''商品 和 服务
商品 和服 物美价廉
服务 和 货币''')
    return corpus_path
# -*- coding:utf-8 -*- # Author:hankcs # Date: 2018-07-24 22:04 # 《自然语言处理入门》8.4.1 基于角色标注的中国人名识别 # 配套书籍:http://nlp.hankcs.com/book.php # 讨论答疑:https://bbs.hankcs.com/ from pyhanlp import * from tests.book.ch03.ngram_segment import DijkstraSegment from tests.book.ch07 import pku from tests.test_utility import test_data_path EasyDictionary = JClass('com.hankcs.hanlp.corpus.dictionary.EasyDictionary') NRDictionaryMaker = JClass( 'com.hankcs.hanlp.corpus.dictionary.NRDictionaryMaker') Sentence = JClass('com.hankcs.hanlp.corpus.document.sentence.Sentence') MODEL = test_data_path() + "/nr" def demoNR(): HanLP.Config.enableDebug() segment = DijkstraSegment() print(segment.seg("王国维和服务员")) def train_one_sent(): dictionary = EasyDictionary.create(HanLP.Config.CoreDictionaryPath) # 核心词典 maker = NRDictionaryMaker(dictionary) # 训练模块 maker.verbose = True # 调试输出 maker.learn([Sentence.create("这里/r 有/v 关天培/nr 的/u 有关/vn 事迹/n 。/w") ]) # 学习一个句子 maker.saveTxtTo(MODEL) # 输出HMM到txt
# -*- coding:utf-8 -*- # Author:hankcs # Date: 2018-07-01 19:15 # 《自然语言处理入门》6.4 HanLP 中的 CRF++ API # 配套书籍:http://nlp.hankcs.com/book.php # 讨论答疑:https://bbs.hankcs.com/ from pyhanlp import * from pyhanlp.static import HANLP_JAR_PATH from tests.book.ch03.demo_corpus_loader import my_cws_corpus from tests.test_utility import test_data_path CRFSegmenter = JClass('com.hankcs.hanlp.model.crf.CRFSegmenter') TXT_CORPUS_PATH = my_cws_corpus() TSV_CORPUS_PATH = TXT_CORPUS_PATH + ".tsv" TEMPLATE_PATH = test_data_path() + "/cws-template.txt" CRF_MODEL_PATH = test_data_path() + "/crf-cws-model" CRF_MODEL_TXT_PATH = test_data_path() + "/crf-cws-model.txt" def train_or_load(corpus_txt_path=TXT_CORPUS_PATH, model_txt_path=CRF_MODEL_TXT_PATH): if os.path.isfile(model_txt_path): # 已训练,直接加载 segmenter = CRFSegmenter(model_txt_path) return segmenter else: segmenter = CRFSegmenter() # 创建空白分词器 segmenter.convertCorpus(corpus_txt_path, TSV_CORPUS_PATH) # 执行转换 segmenter.dumpTemplate(TEMPLATE_PATH) # 导出特征模板 # 交给CRF++训练 print("语料已转换为 %s ,特征模板已导出为 %s" % (TSV_CORPUS_PATH, TEMPLATE_PATH))
                    break
            # Fill the blank rows in [i, j) with a punctuation vertex so the
            # word graph stays connected.
            wordnet.add(i, Vertex.newPunctuationInstance(sent[i - 1: j - 1]))
            i = j
        else:
            i += len(vertexes[i][-1].realWord)
    return wordnet


def viterbi(wordnet):
    """Find the lowest-cost path through *wordnet* and return it as a word list.

    NOTE(review): assumes wordnet is the word graph built by the (truncated)
    function above — each row i holds vertices whose realWord starts at
    position i; confirm against the full source.
    """
    nodes = wordnet.getVertexes()
    # Forward pass
    for i in range(0, len(nodes) - 1):
        for node in nodes[i]:
            for to in nodes[i + len(node.realWord)]:
                # Compute the node distance per the scoring formula and keep
                # the shortest-path predecessor pointer ('from').
                to.updateFrom(node)
    # Backward pass
    path = []  # the shortest path
    f = nodes[len(nodes) - 1].getFirst()  # backtrack from the terminal node
    while f:
        path.insert(0, f)
        f = f.getFrom()  # follow the predecessor pointer 'from'
    return [v.realWord for v in path]


if __name__ == '__main__':
    corpus_path = my_cws_corpus()
    model_path = os.path.join(test_data_path(), 'my_cws_model')
    train_bigram(corpus_path, model_path)
    load_bigram(model_path)
# -*- coding:utf-8 -*- # Author:hankcs # Date: 2018-06-08 15:35 # 3.2.2 微软亚洲研究院语料库 MSR # 配套书籍:http://nlp.hankcs.com/book.php # 讨论答疑:https://bbs.hankcs.com/ import os from tests.test_utility import ensure_data, test_data_path sighan05 = ensure_data( 'icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip') msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8') msr_train = os.path.join(sighan05, 'training', 'msr_training.utf8') msr_model = os.path.join(test_data_path(), 'msr_cws') msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8') msr_output = os.path.join(sighan05, 'testing', 'msr_bigram_output.txt') msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')
# 《自然语言处理入门》13.3 word2vec # 配套书籍:http://nlp.hankcs.com/book.php # 讨论答疑:https://bbs.hankcs.com/ from pyhanlp import * from tests.book.ch03.msr import msr_train from tests.test_utility import test_data_path IOUtil = JClass('com.hankcs.hanlp.corpus.io.IOUtil') DocVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.DocVectorModel') Word2VecTrainer = JClass('com.hankcs.hanlp.mining.word2vec.Word2VecTrainer') WordVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.WordVectorModel') # 演示词向量的训练与应用 TRAIN_FILE_NAME = msr_train MODEL_FILE_NAME = os.path.join(test_data_path(), "word2vec.txt") def print_nearest(word, model): print( "\n Word " "Cosine\n------------------------------------------------------------------------" ) for entry in model.nearest(word): print("%50s\t\t%f" % (entry.getKey(), entry.getValue())) def print_nearest_document(document, documents, model): print_header(document) for entry in model.nearest(document): print("%50s\t\t%f" % (documents[entry.getKey()], entry.getValue()))