def named_entity_recognition(self, sent, standard_name=False):
    """Use pyhanlp's named entity recognition to find (person, place, organization,
    other proper-noun) entities in a sentence. harvesttext links known entities beforehand.

    :param sent: string, the input text
    :param standard_name: bool, whether to map linked known entities to their standard names
    :return: discovered named entities, a dict {entity: entity type}
    """
    from pyhanlp import HanLP, JClass
    if not self.hanlp_prepared:
        self.hanlp_prepare()
    self.standard_name = standard_name
    entities_info = self.entity_linking(sent)
    sent2 = self.decoref(sent, entities_info)
    StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer")
    StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True)
    entity_type_dict = {}
    try:
        for x in StandardTokenizer.segment(sent2):
            # POS prefixes: person (nr), place (ns), organization (nt), other proper noun (nz)
            tag0 = str(x.nature)
            if tag0.startswith("nr"):
                entity_type_dict[x.word] = "人名"
            elif tag0.startswith("ns"):
                entity_type_dict[x.word] = "地名"
            elif tag0.startswith("nt"):
                entity_type_dict[x.word] = "机构名"
            elif tag0.startswith("nz"):
                entity_type_dict[x.word] = "其他专名"
    except Exception:
        pass
    return entity_type_dict
import os

from pyhanlp import JClass, SafeJClass


def divisionTrainData(trainDataPath, classificationPath):
    # create the category directories
    positivePath = os.path.join(classificationPath, 'positive')
    negativePath = os.path.join(classificationPath, 'negative')
    if not os.path.isdir(classificationPath):
        os.mkdir(classificationPath)
    if not os.path.isdir(positivePath):
        os.mkdir(positivePath)
    if not os.path.isdir(negativePath):
        os.mkdir(negativePath)
    # split the texts into the two categories by label and save them in separate folders
    with open(trainDataPath, 'r', encoding='utf-8') as fin:
        fin.readline()  # skip the header line
        for sentence in fin.readlines():
            sentence = sentence.strip('\n').split('\t')
            if sentence[2] == '0':
                with open(os.path.join(positivePath, sentence[0] + '.txt'),
                          'a+', encoding='utf-8') as pf:
                    pf.write(sentence[1])
            else:
                with open(os.path.join(negativePath, sentence[0] + '.txt'),
                          'a+', encoding='utf-8') as nf:
                    nf.write(sentence[1])
    print('Training set loaded successfully.')


##########################################################################################
# load the classifiers
IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# load the tokenizers
ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')
HanLPTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
##########################################################################################

if __name__ == '__main__':
    divisionTrainData(TRAIN_DATA_PATH, CLASSIFICATION_DATA_PATH)
    classifier = NaiveBayesClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    print(classifier.classify("我去挂机了"))
def set_user_dicts(tokenizer, path_user_dicts):
    from pyhanlp import JClass
    CustomDictionary = JClass(
        "com.hankcs.hanlp.dictionary.CustomDictionary")
    if len(path_user_dicts) > 0:
        for path_user_dict in path_user_dicts:
            logger.info("Loading Hanlp User Dictionary at " + str(path_user_dict))
            with open(path_user_dict, 'r', encoding='utf-8') as f:
                for word in f.readlines():
                    CustomDictionary.add(word.strip())  # add the entry dynamically
    else:
        logger.info("No Hanlp User Dictionary found")
    return tokenizer
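# A minimal sketch driving set_user_dicts above; the module-level `logger` it
# expects and the dictionary file path are illustrative assumptions. The file
# holds one custom word per line.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    from pyhanlp import HanLP
    # the tokenizer argument is passed through unchanged, so None works here
    set_user_dicts(None, ["my_user_dict.txt"])
    print(HanLP.segment("词典里的自定义词条现在可以参与分词"))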
def hanlp_prepare(self):
    from pyhanlp import HanLP, JClass
    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
    NLPTokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    self.hanlp_prepared = True
    for type0 in self.entity_types:
        tag0 = "n"
        if "人名" in type0:
            tag0 = "nr"
        elif "地名" in type0:
            tag0 = "ns"
        elif "机构" in type0:
            tag0 = "nt"
        elif "其他专名" in type0:
            tag0 = "nz"
        CustomDictionary.insert(type0, "%s 1000" % tag0)  # add the entity dynamically
    NLPTokenizer.ANALYZER.enableCustomDictionaryForcing(True)
def named_entity_recognition(self, sent, standard_name=False, return_posseg=False):
    '''Use pyhanlp's named entity recognition to find (person, place, organization,
    other proper-noun) entities in a sentence. harvesttext links known entities beforehand.

    :param sent: string, the input text
    :param standard_name: bool, whether to map linked known entities to their standard names
    :param return_posseg: bool, whether to also return the POS-tagged segmentation,
        including named entities
    :return: entity_type_dict: discovered named entities, a dict {entity: entity type}
        (when return_posseg=True) possegs: list of (word, POS tag)
    '''
    from pyhanlp import HanLP, JClass
    if not self.hanlp_prepared:
        self.hanlp_prepare()
    self.standard_name = standard_name
    entities_info = self.entity_linking(sent)
    sent2 = self.decoref(sent, entities_info)
    StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer")
    StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True)
    entity_type_dict = {}
    possegs = []
    try:
        for x in StandardTokenizer.segment(sent2):
            # POS prefixes: person (nr), place (ns), organization (nt), other proper noun (nz)
            tag0 = str(x.nature)
            if tag0.startswith("nr"):
                entity_type_dict[x.word] = "人名"
            elif tag0.startswith("ns"):
                entity_type_dict[x.word] = "地名"
            elif tag0.startswith("nt"):
                entity_type_dict[x.word] = "机构名"
            elif tag0.startswith("nz"):
                entity_type_dict[x.word] = "其他专名"
            possegs.append((x.word, tag0))
    except Exception:
        pass
    if return_posseg:
        return entity_type_dict, possegs
    else:
        return entity_type_dict
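# Usage sketch for the method above, assuming the enclosing HarvestText class
# from the harvesttext package; the sentence is an illustrative input.
from harvesttext import HarvestText

ht = HarvestText()
# returns {entity: entity type}, e.g. {'谭旭光': '人名'}
print(ht.named_entity_recognition("谭旭光考察了山东省潍坊市"))
# with return_posseg=True, also returns the (word, POS tag) list
print(ht.named_entity_recognition("谭旭光考察了山东省潍坊市", return_posseg=True))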
def install_jar(name, filepath, url):
    dst = os.path.join(filepath, name)
    if os.path.isfile(dst):
        return dst
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')

##########################################################################################
# load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# utility for persisting the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
##########################################################################################

if __name__ == '__main__':
    divisionTrainData(TRAIN_DATA_PATH, CLASSIFICATION_DATA_PATH)
    classifier = LinearSVMClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    # save the model; SAVE_MODEL_PATH is the save-path constant defined elsewhere in the script
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, SAVE_MODEL_PATH)
    print(classifier.classify("我去挂机了"))
def install_jar(name, filepath, url):
    dst = os.path.join(filepath, name)
    if os.path.isfile(dst):
        return dst
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')

##########################################################################################
# load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
# load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# utility for persisting the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')
Evaluator = JClass(
    'com.hankcs.hanlp.classification.statistics.evaluations.Evaluator')
##########################################################################################
##########################################################################################
import shutil  # needed to delete a directory tree


def endDeleteDataSet(dataSetPath):
    # remove the categorized training data once we are done with it
    if os.path.isdir(dataSetPath):
        shutil.rmtree(dataSetPath)  # os.remove cannot delete a directory
        print('Categorized training set deleted.')
    else:
        print('Training set has not been categorized.')


##########################################################################################
# load the classifiers
IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# load the tokenizers
ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')
HanLPTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
##########################################################################################

if __name__ == '__main__':
def install_jar(name, filepath, url):
    dst = os.path.join(filepath, name)
    if os.path.isfile(dst):
        return dst
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')

##########################################################################################
# load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
# load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# utility for persisting the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')


# load an existing model, or train and persist a new one
def train_or_load_classifier():
    model_path = SAVE_MODEL_PATH
    if os.path.isfile(model_path):
        # reload the persisted model (body completed along pyhanlp's save/load pattern)
        return LinearSVMClassifier(IOUtil.readObjectFrom(model_path))
    classifier = LinearSVMClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, model_path)
    return LinearSVMClassifier(model)
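# Sketch of driving the function above; reuses the test sentence from the
# sibling scripts in this collection.
if __name__ == '__main__':
    classifier = train_or_load_classifier()
    print(classifier.classify("我去挂机了"))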
import operator

from pyhanlp import JClass


class DependencyExtraction(object):
    def __init__(self):
        self.hanlp = JClass('com.hankcs.hanlp.HanLP')
        self.jump_relation = set(['定中关系', '状中结构', '主谓关系'])
        self.reverse_relation = set(['动补结构', '动宾关系', '介宾关系'])
        self.main_relation = set(['核心关系'])
        self.remove_relate = set(['标点符号'])
        self.include = set()
        self.group = {}

    # Opinion extraction from a sentence with a single root. Starting from the root:
    # 1. find the nearest preceding modifier; 2. find reverse_relation words at distance 1 after it.
    def parseSentence(self, sentence):
        reverse_target = {}
        parse_result = str(
            self.hanlp.parseDependency(sentence)).strip().split('\n')
        for p in parse_result:
            print(p)  # debug: dump the raw CoNLL parse rows
        for i in range(len(parse_result)):
            parse_result[i] = parse_result[i].split('\t')
            self_index = int(parse_result[i][0])
            target_index = int(parse_result[i][6])
            relation = parse_result[i][7]
            if relation in self.remove_relate:
                continue
            if target_index > self_index:
                reverse_target[target_index] = self_index
        result = {}
        checked = set()
        related_words = set()
        for item in parse_result:
            relation = item[7]
            target = int(item[6])
            index = int(item[0])
            if index in checked:
                continue
            while relation in self.jump_relation:
                checked.add(index)
                next_item = parse_result[target - 1]
                relation = next_item[7]
                target = int(next_item[6])
                index = int(next_item[0])
            if relation in self.reverse_relation and target in result and target not in related_words:
                result[index] = parse_result[index - 1][1]
                if index in reverse_target:
                    reverse_target_index = reverse_target[index]
                    if abs(index - reverse_target_index) <= 1:
                        result[reverse_target_index] = parse_result[
                            reverse_target_index - 1][1]
                        related_words.add(reverse_target_index)
            if relation in self.main_relation:
                result[index] = parse_result[index - 1][1]
                if index in reverse_target:
                    reverse_target_index = reverse_target[index]
                    if abs(index - reverse_target_index) <= 1:
                        result[reverse_target_index] = parse_result[
                            reverse_target_index - 1][1]
                        related_words.add(reverse_target_index)
            checked.add(index)
        for item in parse_result:
            word = item[1]
            if word in self.include:
                result[int(item[0])] = word
        sorted_keys = sorted(result.items(), key=operator.itemgetter(0))
        selected_words = [w[1] for w in sorted_keys]
        return selected_words

    def parseSentWithKey(self, sentence, key=None):
        '''Keyword-based opinion extraction: locate the root path containing the
        keyword `key` and extract the opinion under that root, in essentially the
        same way as parseSentence. Supports extracting opinions from multiple roots.
        '''
        if key:
            keyIndex = 0
            if key not in sentence:
                return []
        rootList = []
        parse_result = str(
            self.hanlp.parseDependency(sentence)).strip().split('\n')
        # shift the 1-based CoNLL indices to 0-based
        for i in range(len(parse_result)):
            parse_result[i] = parse_result[i].split('\t')
            parse_result[i][0] = int(parse_result[i][0]) - 1
            parse_result[i][6] = int(parse_result[i][6]) - 1
            if key and parse_result[i][1] == key:
                keyIndex = i
        for i in range(len(parse_result)):
            self_index = int(parse_result[i][0])
            target_index = int(parse_result[i][6])
            relation = parse_result[i][7]
            if relation in self.main_relation:
                if self_index not in rootList:
                    rootList.append(self_index)
            elif relation == "并列关系" and target_index in rootList:
                if self_index not in rootList:
                    rootList.append(self_index)
            if len(parse_result[target_index]) == 10:
                parse_result[target_index].append([])
            if target_index != -1 and not (relation == "并列关系" and target_index in rootList):
                parse_result[target_index][10].append(self_index)
        if key:
            rootIndex = 0
            if len(rootList) > 1:
                # walk up from the keyword until we hit one of the roots
                target = keyIndex
                while True:
                    if target in rootList:
                        rootIndex = rootList.index(target)
                        break
                    next_item = parse_result[target]
                    target = int(next_item[6])
            loopRoot = [rootList[rootIndex]]
        else:
            loopRoot = rootList
        result = {}
        related_words = set()
        for root in loopRoot:
            if key:
                self.addToResult(parse_result, keyIndex, result, related_words)
            self.addToResult(parse_result, root, result, related_words)
        for item in parse_result:
            relation = item[7]
            target = int(item[6])
            index = int(item[0])
            if relation in self.reverse_relation and target in result and target not in related_words:
                self.addToResult(parse_result, index, result, related_words)
        for item in parse_result:
            word = item[1]
            if word == key:
                result[int(item[0])] = word
        sorted_keys = sorted(result.items(), key=operator.itemgetter(0))
        selected_words = [w[1] for w in sorted_keys]
        return selected_words

    def addToResult(self, parse_result, index, result, related_words):
        result[index] = parse_result[index][1]
        if len(parse_result[index]) == 10:
            # this node has no recorded children
            return
        reverse_target_index = 0
        for i in parse_result[index][10]:
            if i < index and i > reverse_target_index:
                reverse_target_index = i
        if abs(index - reverse_target_index) <= 1:
            result[reverse_target_index] = parse_result[reverse_target_index][1]
            related_words.add(reverse_target_index)
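# Usage sketch for DependencyExtraction (the sentence and keyword are
# illustrative inputs, not from the original source):
if __name__ == '__main__':
    extractor = DependencyExtraction()
    print(extractor.parseSentence("房间很干净"))
    print(extractor.parseSentWithKey("房间很干净", key="房间"))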
from pyhanlp import JClass


def hanlp_cut(text):
    tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    return " ".join([term.word for term in tokenizer.segment(text)])
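# Example call (illustrative input); NLPTokenizer's tokens are joined with spaces:
print(hanlp_cut("商品和服务"))  # e.g. "商品 和 服务"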
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-04 20:28
# 《自然语言处理入门》 11.6 Standardized evaluation
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
from pyhanlp import JClass
from tests.demos.demo_text_classification import sogou_corpus_path

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
IDataSet = JClass('com.hankcs.hanlp.classification.corpus.IDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')
Evaluator = JClass(
    'com.hankcs.hanlp.classification.statistics.evaluations.Evaluator')
FMeasure = JClass(
    'com.hankcs.hanlp.classification.statistics.evaluations.FMeasure')
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
HanLPTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')
ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')


def evaluate(classifier, tokenizer):
    # first 90% of the corpus as the training set
    training_corpus = FileDataSet().setTokenizer(tokenizer).load(
        sogou_corpus_path, "UTF-8", 0.9)
    classifier.train(training_corpus)
    # remaining 10% as the test set, reusing the trained model's feature mapping
    testing_corpus = MemoryDataSet(classifier.getModel()).load(
        sogou_corpus_path, "UTF-8", -0.1)
    result = Evaluator.evaluate(classifier, testing_corpus)
    print(result)
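# Usage sketch: drive evaluate() over classifier/tokenizer pairings to compare
# them on the same corpus (the particular pairings are illustrative):
if __name__ == '__main__':
    evaluate(NaiveBayesClassifier(), HanLPTokenizer())
    evaluate(NaiveBayesClassifier(), BigramTokenizer())
    evaluate(LinearSVMClassifier(), HanLPTokenizer())
    evaluate(LinearSVMClassifier(), BigramTokenizer())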
import re

from pyhanlp import JClass


class Process(object):
    # counts bigram frequencies ("w1@w2") over segmented sentences
    # (class scaffold reconstructed; the original fragment began inside process())
    def __init__(self):
        self.condition_data = {}

    def process(self, seg_list):
        for i in range(1, len(seg_list)):
            bigram = seg_list[i - 1] + '@' + seg_list[i]
            if bigram in self.condition_data:
                self.condition_data[bigram] += 1
            else:
                self.condition_data[bigram] = 1

    def write(self, path):
        # dump bigrams by descending frequency, one "w1@w2 count" per line
        with open(path, 'w', encoding='utf8') as f:
            for key, frequency in sorted(self.condition_data.items(),
                                         key=lambda x: x[1], reverse=True):
                string = key + ' ' + str(frequency) + '\n'
                f.write(string)


if __name__ == '__main__':
    p = Process()
    NShortSegment = JClass("com.hankcs.hanlp.seg.NShort.NShortSegment")
    nshort_segment = NShortSegment().enableCustomDictionary(
        False).enablePlaceRecognize(True).enableOrganizationRecognize(True)
    with open('data/test_data', encoding='utf8') as f:
        for line in f:
            line = re.sub(r'\s', '', line)
            if line:
                res = nshort_segment.seg(line)
                # drop the POS tag from each term, e.g. "商品/n" -> "商品"
                res = list(map(lambda x: re.sub('/.*', '', str(x)), res))
                print(res)
                p.process(res)
    p.write('./data/self.ngram.txt')
import re

from pyhanlp import HanLP, JClass

PerceptronLexicalAnalyzer = JClass(
    'com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
analyzer = PerceptronLexicalAnalyzer()

# patterns stripping trailing credits/markers from weibo-style text
clean_re = [
    re.compile(r'via(.*?)$'),
    re.compile(r'([^(]+)$'),  # full-width parenthesized tail
    re.compile(r'「[^「]+」$'),
    re.compile(r'\([^\(]+\)$')
]


def clean(string):
    for regex in clean_re:
        string = regex.sub('', string)
    string = string.replace('(转)', '')\
        .replace('「转」', '')\
        .replace('图转', '')\
        .replace('(转', '')\
        .replace('(转', '')\
        .replace('【全文】', '')\
        .replace('9GAG', '')
    return string
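# Example calls for clean() with illustrative weibo-style inputs:
if __name__ == '__main__':
    print(clean("看到一个好玩的视频 via@某博主"))  # strips the trailing credit
    print(clean("今日份的快乐(转)"))              # strips the repost marker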
def ensure_data(data_name, data_url):
    # download `data_url` into the local corpus directory unless it is already there;
    # `root_path`, `download` and `remove_file` are defined earlier in the script
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path
    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path


########################################################################################################################

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')

# ChnSentiCorp: Chinese sentiment corpus of hotel reviews (Tan Songbo)
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论",
                             "http://file.hankcs.com/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    print("《%s》 sentiment polarity is 【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    classifier = NaiveBayesClassifier()
    classifier.train(chn_senti_corp)
    # illustrative review, not from the original script
    predict(classifier, "前台服务态度很好,房间也很干净")
import os
import re
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, urlparse, quote

from pyhanlp import HanLP, JClass
from pyhanlp.static import INDEX_HTML

SENTENCE = 'sentence'
TEMPLATE = 'Error'
HANLP_GOOGLE_UA = 'UA-XXXXX-X'
ENVIRON = os.environ.copy()
if "HANLP_GOOGLE_UA" in ENVIRON:
    HANLP_GOOGLE_UA = ENVIRON["HANLP_GOOGLE_UA"]
with open(INDEX_HTML, encoding='utf-8') as src:
    TEMPLATE = src.read()
lexical_analyzer = JClass('com.hankcs.hanlp.tokenizer.NLPTokenizer').ANALYZER


class S(BaseHTTPRequestHandler):
    def _set_headers(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()

    def write(self, text: str):
        self.wfile.write(text.encode())

    def do_GET(self):
        params = parse_qs(urlparse(self.path).query)
        self._set_headers()
        # e.g. {'text': ['I looove iparser!']}