# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-06-07 18:37
# 《自然语言处理入门》 3.6.1: a Japanese word segmentation corpus
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
import os

from tests.book.ch03.ngram_segment import train_bigram, load_bigram
from tests.test_utility import ensure_data

jp_corpus = ensure_data('jpcorpus', 'http://file.hankcs.com/corpus/jpcorpus.zip')
jp_bigram = os.path.join(jp_corpus, 'jp_bigram')
jp_corpus = os.path.join(jp_corpus, 'ja_gsd-ud-train.txt')

if __name__ == '__main__':
    train_bigram(jp_corpus, jp_bigram)  # train the bigram model
    segment = load_bigram(jp_bigram, verbose=False)  # load it back
    print(segment.seg('自然言語処理入門という本が面白いぞ!'))  # segment Japanese text
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-07-30 21:03
# 《自然语言处理入门》 9.1: new word extraction
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
import os

from pyhanlp import *
from tests.test_utility import ensure_data

HLM_PATH = ensure_data("红楼梦.txt", "http://file.hankcs.com/corpus/红楼梦.zip")
XYJ_PATH = ensure_data("西游记.txt", "http://file.hankcs.com/corpus/西游记.zip")
SHZ_PATH = ensure_data("水浒传.txt", "http://file.hankcs.com/corpus/水浒传.zip")
SAN_PATH = ensure_data("三国演义.txt", "http://file.hankcs.com/corpus/三国演义.zip")
WEIBO_PATH = ensure_data("weibo-classification",
                         "http://file.hankcs.com/corpus/weibo-classification.zip")


def test_weibo():
    for folder in os.listdir(WEIBO_PATH):
        print(folder)
        big_text = ""
        for file in os.listdir(os.path.join(WEIBO_PATH, folder)):
            with open(os.path.join(WEIBO_PATH, folder, file), encoding='utf-8') as src:
                big_text += "".join(src.readlines())
        word_info_list = HanLP.extractWords(big_text, 100)
        print(word_info_list)


def extract(corpus):
    # The body was truncated in the excerpt; a minimal sketch following the
    # pattern of test_weibo above: read the corpus, extract the top 100 terms.
    with open(corpus, encoding='utf-8') as src:
        word_info_list = HanLP.extractWords(src.read(), 100)
    print(word_info_list)
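# The demo's main block is missing from the excerpt; a plausible sketch that
# exercises both helpers on the downloaded corpora (the call order is an assumption):
if __name__ == '__main__':
    test_weibo()
    for path in (HLM_PATH, XYJ_PATH, SHZ_PATH, SAN_PATH):
        extract(path)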
# -*- coding:utf-8 -*-
# Segmentation evaluation on the SIGHAN 2005 MSR corpus. The excerpt began
# mid-function; the imports, to_region() and the opening of prf() are reconstructed.
import os
import re

from pyhanlp import *
from tests.test_utility import ensure_data


def to_region(segmentation: str) -> list:
    # Reconstructed helper: turn '商品 和 服务' into [(0, 2), (2, 3), (3, 5)].
    region, start = [], 0
    for word in re.compile('\\s+').split(segmentation.strip()):
        end = start + len(word)
        region.append((start, end))
        start = end
    return region


def prf(gold: str, pred: str, dic) -> tuple:
    # P/R/F1 plus OOV and IV recall; the excerpt resumes at the `A & B` loop below.
    A_size, B_size, A_cap_B_size, OOV, IV, OOV_R, IV_R = 0, 0, 0, 0, 0, 0, 0
    with open(gold, encoding='utf-8') as gd, open(pred, encoding='utf-8') as pd:
        for g, p in zip(gd, pd):
            A, B = set(to_region(g)), set(to_region(p))
            A_size += len(A)
            B_size += len(B)
            A_cap_B_size += len(A & B)
            text = re.sub('\\s+', '', g)
            for (start, end) in A:  # reconstructed: tally gold words in/out of the dictionary
                if dic.containsKey(text[start:end]):
                    IV += 1
                else:
                    OOV += 1
            for (start, end) in A & B:
                word = text[start:end]
                if dic.containsKey(word):
                    IV_R += 1
                else:
                    OOV_R += 1
    p, r = A_cap_B_size / B_size * 100, A_cap_B_size / A_size * 100
    return p, r, 2 * p * r / (p + r), OOV_R / OOV * 100, IV_R / IV * 100


if __name__ == '__main__':
    print(to_region('商品 和 服务'))
    sighan05 = ensure_data(
        'icwb2-data',
        'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
    msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
    msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
    msr_output = os.path.join(sighan05, 'testing', 'msr_output.txt')
    msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')
    DoubleArrayTrieSegment = JClass(
        'com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment')
    segment = DoubleArrayTrieSegment([msr_dict]).enablePartOfSpeechTagging(True)
    with open(msr_gold, encoding='utf-8') as test, \
            open(msr_output, 'w', encoding='utf-8') as output:
        for line in test:  # loop body reconstructed: re-segment the de-spaced gold line
            output.write('  '.join(t.word for t in segment.seg(re.sub('\\s+', '', line))))
            output.write('\n')
    print('P:%.2f R:%.2f F1:%.2f OOV-R:%.2f IV-R:%.2f' % prf(msr_gold, msr_output, msr_dict))
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-05-23 17:26
import os

from pyhanlp import SafeJClass
from tests.test_utility import ensure_data

NaiveBayesClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
sogou_corpus_path = ensure_data(
    '搜狗文本分类语料库迷你版',
    'http://file.hankcs.com/corpus/sogou-text-classification-corpus-mini.zip')


def train_or_load_classifier():
    model_path = sogou_corpus_path + '.ser'
    if os.path.isfile(model_path):  # reuse a previously trained model if one exists
        return NaiveBayesClassifier(IOUtil.readObjectFrom(model_path))
    classifier = NaiveBayesClassifier()
    classifier.train(sogou_corpus_path)
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, model_path)
    return NaiveBayesClassifier(model)


def predict(classifier, text):
    print("《%16s》\tbelongs to category\t【%s】" % (text, classifier.classify(text)))
    # To obtain the full categorical distribution, use the predict interface:
    # print("《%16s》\tbelongs to category\t【%s】" % (text, classifier.predict(text)))
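# The file's main block is absent from the excerpt; a minimal usage sketch
# (the sample headline is an illustrative assumption):
if __name__ == '__main__':
    classifier = train_or_load_classifier()
    predict(classifier, "奥运会女排夺冠")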
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-07 13:53
from pyhanlp import *
from tests.test_utility import ensure_data

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# ChnSentiCorp, a Chinese sentiment corpus of hotel reviews by Tan Songbo
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论",
                             "http://file.hankcs.com/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    print("《%s》 sentiment polarity is 【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    # Create the classifier; see the IClassifier interface for more advanced features.
    classifier = NaiveBayesClassifier()
    # The trained model can be persisted so that later runs need not retrain.
    classifier.train(chn_senti_corp)
    predict(classifier, "前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!")
    predict(classifier, "结果大失所望,灯光昏暗,空间极其狭小,床垫质量恶劣,房间还伴着一股霉味。")
    predict(classifier, "可利用文本分类实现情感分析,效果不是不行")
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-04-28 10:07
from pyhanlp import *
from tests.test_utility import ensure_data

WordVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.WordVectorModel')
DocVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.DocVectorModel')
model_path = os.path.join(
    ensure_data('hanlp-wiki-vec-zh', 'http://file.hankcs.com/model/hanlp-wiki-vec-zh.zip'),
    'hanlp-wiki-vec-zh.txt')

word2vec = WordVectorModel(model_path)
doc2vec = DocVectorModel(word2vec)
docs = ["山东苹果丰收", "农民在江苏种水稻", "奥运会女排夺冠", "世界锦标赛胜出", "中国足球失败"]
for idx, doc in enumerate(docs):
    doc2vec.addDocument(idx, doc)

print(word2vec.nearest('语言'))
for res in doc2vec.nearest('我要看比赛'):
    print('%s = %.2f' % (docs[res.getKey().intValue()], res.getValue().floatValue()))
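# WordVectorModel also exposes pairwise similarity; a small added example
# following the book's word2vec demos (the word pair is illustrative):
print(word2vec.similarity('山东', '江苏'))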
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-07-29 23:24
# 《自然语言处理入门》 8.6: named entity recognition in a custom domain
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
import os
import sys

sys.path.append(os.getcwd())  # put the project root on the import path

from tests.book.ch05.perceptron_cws import CWSTrainer
from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter
from tests.book.ch07.demo_perceptron_pos import PerceptronPOSTagger
from tests.book.ch08.demo_sp_ner import NERTrainer, PerceptronNERecognizer
from tests.test_utility import ensure_data

PLANE_ROOT = ensure_data("plane-re", "http://file.hankcs.com/corpus/plane-re.zip")
PLANE_CORPUS = os.path.join(PLANE_ROOT, 'train.txt')
PLANE_MODEL = os.path.join(PLANE_ROOT, 'model.bin')

if __name__ == '__main__':
    trainer = NERTrainer()
    trainer.tagSet.nerLabels.clear()  # do not recognize nr, ns or nt
    trainer.tagSet.nerLabels.add("np")  # the target is the np label
    recognizer = PerceptronNERecognizer(
        trainer.train(PLANE_CORPUS, PLANE_MODEL).getModel())
    # NER needs an upstream segmenter, ideally trained on the same corpus.
    CWS_MODEL = CWSTrainer().train(
        PLANE_CORPUS, PLANE_MODEL.replace('model.bin', 'cws.bin')).getModel()
    analyzer = AbstractLexicalAnalyzer(
        PerceptronSegmenter(CWS_MODEL), PerceptronPOSTagger(), recognizer)
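    # The excerpt ends after building the analyzer; a minimal usage line.
    # The sample sentence is an illustrative assumption, not from the original:
    print(analyzer.analyze("米格-17PF是一种战斗机"))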
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-07-06 13:54
# 《自然语言处理入门》 7.4.2: an annotated corpus
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter
from tests.book.ch07.demo_perceptron_pos import train_perceptron_pos
from tests.test_utility import ensure_data

ZHUXIAN = ensure_data("zhuxian", "http://file.hankcs.com/corpus/zhuxian.zip") + "/train.txt"

posTagger = train_perceptron_pos(ZHUXIAN)  # train the tagger
analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), posTagger)  # wrap segmenter + tagger
print(analyzer.analyze("陆雪琪的天琊神剑不做丝毫退避,直冲而上,瞬间,这两道奇光异宝撞到了一起。"))  # segmentation + POS tagging
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-06-21 19:46
# 《自然语言处理入门》 5.3: perceptron-based gender classification of personal names
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
import os

from pyhanlp import *
from tests.test_utility import ensure_data

PerceptronNameGenderClassifier = JClass(
    'com.hankcs.hanlp.model.perceptron.PerceptronNameGenderClassifier')
cnname = ensure_data('cnname', 'http://file.hankcs.com/corpus/cnname.zip')
TRAINING_SET = os.path.join(cnname, 'train.csv')
TESTING_SET = os.path.join(cnname, 'test.csv')
MODEL = cnname + ".bin"


def run_classifier(averaged_perceptron):
    print('=====%s=====' % ('averaged perceptron' if averaged_perceptron else 'vanilla perceptron'))
    classifier = PerceptronNameGenderClassifier()
    print('training-set accuracy:', classifier.train(TRAINING_SET, 10, averaged_perceptron))
    model = classifier.getModel()
    print('number of features:', len(model.parameter))
    # model.save(MODEL, model.featureMap.entrySet(), 0, True)
    # classifier = PerceptronNameGenderClassifier(MODEL)
    for name in "赵建军", "沈雁冰", "陆雪琪", "李冰冰":
        print('%s=%s' % (name, classifier.predict(name)))
    print('test-set accuracy:', classifier.evaluate(TESTING_SET))
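if __name__ == '__main__':
    # The driver block is missing from the excerpt; presumably it compares both variants:
    run_classifier(False)  # vanilla perceptron
    run_classifier(True)   # averaged perceptron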
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-05-23 17:26
import os

from pyhanlp import SafeJClass
from tests.test_utility import ensure_data

# The naive Bayes classifier class
NaiveBayesClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# The file-IO utility class
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# Your own Sina Weibo training corpus. If pyhanlp was installed as a Python
# package, place the training folder under a path such as
# C:\ProgramData\Anaconda3\Lib\site-packages\pyhanlp\static\data\test
sentiment_corpus_path = ensure_data('新浪微博/train', '')


def train_or_load_classifier():
    # Name of the serialized naive Bayes model file
    model_path = sentiment_corpus_path + '.ser'
    # If the model file exists, load it and return a classifier wrapping it
    if os.path.isfile(model_path):
        return NaiveBayesClassifier(IOUtil.readObjectFrom(model_path))
    # Otherwise create a fresh classifier instance
    classifier = NaiveBayesClassifier()
    # Train it on the corpus directory
    classifier.train(sentiment_corpus_path)
    # Fetch the trained model
    model = classifier.getModel()
    # Persist the model and return a classifier wrapping it (the tail was
    # truncated; completed to match the identical function in the Sogou demo above)
    IOUtil.saveObjectTo(model, model_path)
    return NaiveBayesClassifier(model)
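# The excerpt provides no driver; a hypothetical one mirroring the Sogou demo
# above (the sample review text is illustrative):
def predict(classifier, text):
    print("《%s》\tbelongs to category\t【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    classifier = train_or_load_classifier()
    predict(classifier, "这家酒店的服务真不错")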
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-07-04 17:41
# 《自然语言处理入门》 7.2.1: the People's Daily corpus and the PKU tagset
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
import os

from tests.test_utility import ensure_data

PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-02-11 23:18
# 《自然语言处理入门》 12.5.1: training the model
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
from pyhanlp import *
from tests.test_utility import ensure_data

KBeamArcEagerDependencyParser = JClass(
    'com.hankcs.hanlp.dependency.perceptron.parser.KBeamArcEagerDependencyParser')
CTB_ROOT = ensure_data("ctb8.0-dep", "http://file.hankcs.com/corpus/ctb8.0-dep.zip")
CTB_TRAIN = CTB_ROOT + "/train.conll"
CTB_DEV = CTB_ROOT + "/dev.conll"
CTB_TEST = CTB_ROOT + "/test.conll"
CTB_MODEL = CTB_ROOT + "/ctb.bin"
BROWN_CLUSTER = ensure_data("wiki-cn-cluster.txt",
                            "http://file.hankcs.com/corpus/wiki-cn-cluster.zip")

if __name__ == '__main__':
    parser = KBeamArcEagerDependencyParser.train(
        CTB_TRAIN, CTB_DEV, BROWN_CLUSTER, CTB_MODEL)
    print(parser.parse("人吃鱼"))
    score = parser.evaluate(CTB_TEST)
    print("UAS=%.1f LAS=%.1f\n" % (score[0], score[1]))