Example #1
 def named_entity_recognition(self, sent, standard_name=False):
     """
     利用pyhanlp的命名实体识别,找到句子中的(人名,地名,机构名)三种实体。harvesttext会预先链接已知实体
     :param sent:
     :param standard_name:
     :return: 发现的命名实体信息,字典 {实体名: 实体类型}
     """
     from pyhanlp import HanLP, JClass
     if not self.hanlp_prepared:
         self.hanlp_prepare()
     self.standard_name = standard_name
     entities_info = self.entity_linking(sent)
     sent2 = self.decoref(sent, entities_info)
     StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer")
     StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True)
     entity_type_dict = {}
     try:
         for x in StandardTokenizer.segment(sent2):
              # prefixes mark the entity type: person (nr), place (ns), organization (nt), other proper noun (nz)
             tag0 = str(x.nature)
             if tag0.startswith("nr"):
                 entity_type_dict[x.word] = "人名"
             elif tag0.startswith("ns"):
                 entity_type_dict[x.word] = "地名"
             elif tag0.startswith("nt"):
                 entity_type_dict[x.word] = "机构名"
             elif tag0.startswith("nz"):
                 entity_type_dict[x.word] = "其他专名"
     except:
         pass
     return entity_type_dict
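
A minimal usage sketch for the method above, assuming it belongs to harvesttext's HarvestText class and that pyhanlp (plus its data) is installed; the sample sentence is made up:

from harvesttext import HarvestText
ht = HarvestText()
print(ht.named_entity_recognition("上海交通大学的张华考察了北京。"))
# expected shape: {实体名: 实体类型}, e.g. {"张华": "人名", "北京": "地名"}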
Example #2
 def __init__(self):
     self.hanlp = JClass('com.hankcs.hanlp.HanLP')
     self.jump_relation = set(['定中关系', '状中结构', '主谓关系'])
     self.reverse_relation = set(['动补结构', '动宾关系', '介宾关系'])
     self.main_relation = set(['核心关系'])
     self.remove_relate = set(['标点符号'])
     self.include = set()
     self.group = {}
Example #3
def divisionTrainData(trainDataPath, classificationPath):
    # create the category directories
    positivePath = os.path.join(classificationPath, 'positive')
    negativePath = os.path.join(classificationPath, 'negative')
    if not os.path.isdir(classificationPath):
        os.mkdir(classificationPath)
    if not os.path.isdir(positivePath):
        os.mkdir(positivePath)
    if not os.path.isdir(negativePath):
        os.mkdir(negativePath)

    # split the texts into the two categories by label and save each category in its own folder
    with open(trainDataPath, 'r', encoding='utf-8') as fin:
        fin.readline()
        for sentence in fin.readlines():
            sentence = sentence.strip('\n')
            sentence = sentence.split('\t')
            if sentence[2] == '0':
                pf = open(os.path.join(positivePath, sentence[0] + '.txt'),
                          'a+',
                          encoding='utf-8')
                pf.write(sentence[1])
                pf.close()
            else:
                nf = open(os.path.join(negativePath, sentence[0] + '.txt'),
                          'a+',
                          encoding='utf-8')
                nf.write(sentence[1])
                nf.close()
    print('成功加载训练集。')


##########################################################################################
# load the classifiers
IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')

# load the tokenizers
ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')
HanLPTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

##########################################################################################

if __name__ == '__main__':
    divisionTrainData(TRAIN_DATA_PATH, CLASSIFICATION_DATA_PATH)
    classifier = NaiveBayesClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    print(classifier.classify("我去挂机了"))
Example #4
    def set_user_dicts(tokenizer, path_user_dicts):
        from pyhanlp import JClass
        CustomDictionary = JClass(
            "com.hankcs.hanlp.dictionary.CustomDictionary")

        if len(path_user_dicts) > 0:
            for path_user_dict in path_user_dicts:
                logger.info("Loading Hanlp User Dictionary at " +
                            str(path_user_dict))
                with open(path_user_dict, 'r', encoding='utf-8') as f:
                    for word in f.readlines():
                        CustomDictionary.add(word.strip())  # add the word to HanLP's custom dictionary at runtime
        else:
            logger.info("No Hanlp User Dictionary found")

        return tokenizer
Example #5
    def hanlp_prepare(self):
        from pyhanlp import HanLP, JClass
        CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
        StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")

        self.hanlp_prepared = True
        for type0 in self.entity_types:
            tag0 = "n"
            if "人名" in type0:
                tag0 = "nr"
            elif "地名" in type0:
                tag0 = "ns"
            elif "机构" in type0:
                tag0 = "nt"
            elif "其他专名" in type0:
                tag0 = "nz"
            CustomDictionary.insert(type0, "%s 1000" % (tag0))  # register the entity in the custom dictionary at runtime
        StandardTokenizer.ANALYZER.enableCustomDictionaryForcing(True)
Example #6
    def named_entity_recognition(self, sent, standard_name=False, return_posseg=False):
        '''Use pyhanlp's named entity recognition to find the (person, place, organization, other proper noun) entities in a sentence. harvesttext links known entities beforehand.

        :param sent: string, the text
        :param standard_name: bool, whether to convert linked registered entities to their standard names
        :param return_posseg: bool, whether to also return the POS-tagged segmentation, including named entities
        :return: entity_type_dict: the named entities found, as a dict {entity name: entity type}
            (when return_posseg=True) possegs: list of (word, POS tag)
        '''
        from pyhanlp import HanLP, JClass
        if not self.hanlp_prepared:
            self.hanlp_prepare()
        self.standard_name = standard_name
        entities_info = self.entity_linking(sent)
        sent2 = self.decoref(sent, entities_info)
        StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer")
        StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True)
        entity_type_dict = {}
        try:
            possegs = []
            for x in StandardTokenizer.segment(sent2):
                # prefixes mark the entity type: person (nr), place (ns), organization (nt), other proper noun (nz)
                tag0 = str(x.nature)
                if tag0.startswith("nr"):
                    entity_type_dict[x.word] = "人名"
                elif tag0.startswith("ns"):
                    entity_type_dict[x.word] = "地名"
                elif tag0.startswith("nt"):
                    entity_type_dict[x.word] = "机构名"
                elif tag0.startswith("nz"):
                    entity_type_dict[x.word] = "其他专名"
                possegs.append((x.word, tag0))
        except:
            pass
        if return_posseg:
            return entity_type_dict, possegs
        else:
            return entity_type_dict
Example #7
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')
##########################################################################################

# load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# utility for saving the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')

# load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

##########################################################################################

if __name__ == '__main__':
    divisionTrainData(TRAIN_DATA_PATH, CLASSIFICATION_DATA_PATH)
    classifier = LinearSVMClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    # save the model
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, SAVE_MODEL_PATH)  # SAVE_MODEL_PATH: save path, assumed to be defined elsewhere in this project
    print(classifier.classify("我去挂机了"))
Example #8
def install_jar(name, filepath, url):
    dst = os.path.join(filepath, name)
    if os.path.isfile(dst):
        return dst
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')
##########################################################################################

# load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

# load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')

# utility for saving the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')

FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')
Evaluator = JClass(
    'com.hankcs.hanlp.classification.statistics.evaluations.Evaluator')


##########################################################################################
Example #9

##########################################################################################


def endDeleteDataSet(dataSetPath):
    if os.path.isdir(dataSetPath):
        shutil.rmtree(dataSetPath)  # os.remove cannot delete a directory; requires `import shutil`
        print('已删除分类后的训练集。')
    else:
        print('训练集未分类。')


##########################################################################################
# load the classifiers
IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')

# load the tokenizers
ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')
HanLPTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

##########################################################################################

if __name__ == '__main__':
Example #10
def install_jar(name, filepath, url):
    dst = os.path.join(filepath, name)
    if os.path.isfile(dst):
        return dst
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')
##########################################################################################

# load the tokenizer
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')

# load the classifier
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')

# utility for saving the model
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')

FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')

# load an existing model or train a new one
def train_or_load_classifier():
    model_path = SAVE_MODEL_PATH
    if os.path.isfile(model_path):
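        # a hedged sketch of how this truncated helper likely continues, following the
        # load-or-train pattern in the pyhanlp classification demos; CLASSIFICATION_DATA_PATH
        # is assumed to be defined elsewhere in this project, as in the earlier examples
        return LinearSVMClassifier(IOUtil.readObjectFrom(model_path))
    classifier = LinearSVMClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, model_path)
    return LinearSVMClassifier(model)
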
Example #11
class DependencyExtraction(object):
    def __init__(self):
        self.hanlp = JClass('com.hankcs.hanlp.HanLP')
        self.jump_relation = set(['定中关系', '状中结构', '主谓关系'])
        self.reverse_relation = set(['动补结构', '动宾关系', '介宾关系'])
        self.main_relation = set(['核心关系'])
        self.remove_relate = set(['标点符号'])
        self.include = set()
        self.group = {}

    # Opinion extraction for a sentence with a single root: starting from the root,
    # 1. find the nearest preceding modifier; 2. find a reverse_relation at distance 1 after it
    def parseSentence(self, sentence):
        reverse_target = {}
        parse_result = str(
            self.hanlp.parseDependency(sentence)).strip().split('\n')
        for p in parse_result:
            print(p)
        for i in range(len(parse_result)):
            parse_result[i] = parse_result[i].split('\t')
            self_index = int(parse_result[i][0])
            target_index = int(parse_result[i][6])
            relation = parse_result[i][7]
            if relation in self.remove_relate:
                continue
            if target_index > self_index:
                reverse_target[target_index] = self_index
        result = {}
        checked = set()
        related_words = set()
        for item in parse_result:
            relation = item[7]
            target = int(item[6])
            index = int(item[0])
            if index in checked:
                continue
            while relation in self.jump_relation:
                checked.add(index)
                next_item = parse_result[target - 1]
                relation = next_item[7]
                target = int(next_item[6])
                index = int(next_item[0])

            if relation in self.reverse_relation and target in result and target not in related_words:
                result[index] = parse_result[index - 1][1]
                if index in reverse_target:
                    reverse_target_index = reverse_target[index]
                    if abs(index - reverse_target[index]) <= 1:
                        result[reverse_target_index] = parse_result[
                            reverse_target_index - 1][1]
                        related_words.add(reverse_target_index)

            if relation in self.main_relation:
                result[index] = parse_result[index - 1][1]
                if index in reverse_target:
                    reverse_target_index = reverse_target[index]
                    if abs(index - reverse_target_index) <= 1:
                        result[reverse_target_index] = parse_result[
                            reverse_target_index - 1][1]
                        related_words.add(reverse_target_index)
            checked.add(index)

        for item in parse_result:
            word = item[1]
            if word in self.include:
                result[int(item[0])] = word

        sorted_keys = sorted(result.items(), key=operator.itemgetter(0))
        selected_words = [w[1] for w in sorted_keys]
        return selected_words

    '''
    Keyword-based opinion extraction: given the keyword `key`, find the root path at the keyword and
    extract the opinion under that root, in essentially the same way as parseSentence.
    Supports extracting opinions for multiple roots.
    '''

    def parseSentWithKey(self, sentence, key=None):
        if key:
            keyIndex = 0
            if key not in sentence:
                return []
        rootList = []
        parse_result = str(
            self.hanlp.parseDependency(sentence)).strip().split('\n')
        # shift indices by -1 so they become 0-based
        for i in range(len(parse_result)):
            parse_result[i] = parse_result[i].split('\t')
            parse_result[i][0] = int(parse_result[i][0]) - 1
            parse_result[i][6] = int(parse_result[i][6]) - 1
            if key and parse_result[i][1] == key:
                keyIndex = i

        for i in range(len(parse_result)):
            self_index = int(parse_result[i][0])
            target_index = int(parse_result[i][6])
            relation = parse_result[i][7]
            if relation in self.main_relation:
                if self_index not in rootList:
                    rootList.append(self_index)
            elif relation == "并列关系" and target_index in rootList:
                if self_index not in rootList:
                    rootList.append(self_index)

            if len(parse_result[target_index]) == 10:
                parse_result[target_index].append([])

            if target_index != -1 and not (relation == "并列关系"
                                           and target_index in rootList):
                parse_result[target_index][10].append(self_index)

        if key:
            rootIndex = 0
            if len(rootList) > 1:
                target = keyIndex
                while True:
                    if target in rootList:
                        rootIndex = rootList.index(target)
                        break
                    next_item = parse_result[target]
                    target = int(next_item[6])
            loopRoot = [rootList[rootIndex]]
        else:
            loopRoot = rootList

        result = {}
        related_words = set()
        for root in loopRoot:
            if key:
                self.addToResult(parse_result, keyIndex, result, related_words)
            self.addToResult(parse_result, root, result, related_words)

        for item in parse_result:
            relation = item[7]
            target = int(item[6])
            index = int(item[0])
            if relation in self.reverse_relation and target in result and target not in related_words:
                self.addToResult(parse_result, index, result, related_words)

        for item in parse_result:
            word = item[1]
            if word == key:
                result[int(item[0])] = word

        sorted_keys = sorted(result.items(), key=operator.itemgetter(0))
        selected_words = [w[1] for w in sorted_keys]
        return selected_words

    def addToResult(self, parse_result, index, result, related_words):
        result[index] = parse_result[index][1]
        if len(parse_result[index]) == 10:
            return
        reverse_target_index = 0
        for i in parse_result[index][10]:
            if i < index and i > reverse_target_index:
                reverse_target_index = i
        if abs(index - reverse_target_index) <= 1:
            result[reverse_target_index] = parse_result[reverse_target_index][
                1]
            related_words.add(reverse_target_index)
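
A minimal usage sketch for DependencyExtraction (the sentences are made-up examples, and the extracted words depend on HanLP's dependency parser):

if __name__ == '__main__':
    extractor = DependencyExtraction()
    print(extractor.parseSentence("这家餐厅的服务态度非常好"))
    print(extractor.parseSentWithKey("这家餐厅的服务态度非常好,但是价格有点贵", key="价格"))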
Example #12
def hanlp_cut(text):
    tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    return " ".join([term.word for term in tokenizer.segment(text)])
Example #13
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-04 20:28
# 《自然语言处理入门》 (Introduction to Natural Language Processing), 11.6 Standardized evaluation
# Companion book: http://nlp.hankcs.com/book.php
# Q&A forum: https://bbs.hankcs.com/
from pyhanlp import JClass
from tests.demos.demo_text_classification import sogou_corpus_path

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
IDataSet = JClass('com.hankcs.hanlp.classification.corpus.IDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')
Evaluator = JClass(
    'com.hankcs.hanlp.classification.statistics.evaluations.Evaluator')
FMeasure = JClass(
    'com.hankcs.hanlp.classification.statistics.evaluations.FMeasure')
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
HanLPTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')
ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')


def evaluate(classifier, tokenizer):
    training_corpus = FileDataSet().setTokenizer(tokenizer).load(
        sogou_corpus_path, "UTF-8", 0.9)
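    # a hedged sketch of how this evaluation routine likely continues, following HanLP's
    # standardized-evaluation demo: train on 90% of the corpus, test on the held-out 10%
    classifier.train(training_corpus)
    testing_corpus = MemoryDataSet(classifier.getModel()).load(
        sogou_corpus_path, "UTF-8", -0.1)
    result = Evaluator.evaluate(classifier, testing_corpus)
    print(result)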
Example #14
        for i in range(1, len(seg_list)):
            bigram = seg_list[i - 1] + '@' + seg_list[i]
            if bigram in self.condition_data:
                self.condition_data[bigram] += 1
            else:
                self.condition_data[bigram] = 1

    def write(self, path):
        with open(path, 'w', encoding='utf8') as f:
            for key, frequency in sorted(self.condition_data.items(),
                                         key=lambda x: x[1],
                                         reverse=True):
                string = key + ' ' + str(frequency) + '\n'
                f.write(string)


if __name__ == '__main__':
    p = Process()
    NShortSegment = JClass("com.hankcs.hanlp.seg.NShort.NShortSegment")
    nshort_segment = NShortSegment().enableCustomDictionary(
        False).enablePlaceRecognize(True).enableOrganizationRecognize(True)
    with open('data/test_data', encoding='utf8') as f:
        for line in f:
            line = re.sub(r'\s', '', line)
            if line:
                res = nshort_segment.seg(line)
                res = list(map(lambda x: re.sub('/.*', '', str(x)), res))
                print(res)
                p.process(res)
    p.write('./data/self.ngram.txt')
Example #15
import re
from pyhanlp import HanLP, JClass

PerceptronLexicalAnalyzer = JClass(
    'com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
analyzer = PerceptronLexicalAnalyzer()

clean_re = [
    re.compile(r'via(.*?)$'),
    re.compile(r'([^(]+)$'),
    re.compile(r'「[^「]+」$'),
    re.compile(r'\([^\(]+\)$')
]


def clean(string):

    for regex in clean_re:
        string = regex.sub('', string)

    string = string.replace('(转)', '')\
             .replace('「转」', '')\
             .replace('图转', '')\
             .replace('(转', '')\
             .replace('(转', '')\
             .replace('【全文】', '')\
             .replace('9GAG', '')

    return string
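
A minimal usage sketch (the input string is a made-up example): clean() strips attribution suffixes before the perceptron analyzer runs:

if __name__ == '__main__':
    text = clean("今天遇到一件有趣的事(转) via 某微博")
    print(analyzer.analyze(text))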

Example #16
    if os.path.exists(dest_path):
        return dest_path
    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path


########################################################################################################################

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')

# Chinese sentiment-mining corpus: ChnSentiCorp (Tan Songbo)
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论",
                             "http://file.hankcs.com/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    print("《%s》 情感极性是 【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    classifier = NaiveBayesClassifier()
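    # a hedged sketch of how this truncated demo likely continues: train the Naive Bayes
    # classifier on the extracted ChnSentiCorp folder, then predict the polarity of a few
    # reviews (the review texts below are made-up examples)
    classifier.train(chn_senti_corp)
    predict(classifier, "房间很干净,服务态度也很好,下次还会再来")
    predict(classifier, "设施陈旧,隔音很差,性价比不高")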
Example #17
import os
import re
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, urlparse, quote

from pyhanlp import HanLP, JClass
from pyhanlp.static import INDEX_HTML

SENTENCE = 'sentence'
TEMPLATE = 'Error'
HANLP_GOOGLE_UA = 'UA-XXXXX-X'
ENVIRON = os.environ.copy()
if "HANLP_GOOGLE_UA" in ENVIRON:
    HANLP_GOOGLE_UA = ENVIRON["HANLP_GOOGLE_UA"]
with open(INDEX_HTML, encoding='utf-8') as src:
    TEMPLATE = src.read()
lexical_analyzer = JClass('com.hankcs.hanlp.tokenizer.NLPTokenizer').ANALYZER


class S(BaseHTTPRequestHandler):
    def _set_headers(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()

    def write(self, text: str):
        self.wfile.write(text.encode())

    def do_GET(self):
        params = parse_qs(urlparse(self.path).query)
        self._set_headers()
        # {'text': ['I looove iparser!']}