def namedEntityRecognize(sentence):
    '''
        Named-entity recognition over one sentence with pyltp.
        Returns: 1) list of (word, ner-tag) tuples, 2) plain list of ner tags.
    '''
    # Word segmentation, with the user lexicon layered on the CWS model.
    segmentor = Segmentor()
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()

    # POS tags are required input for the NER model.
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()

    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # Pair each word with its tag; the join/split round-trip converts the
    # pyltp string vector into an ordinary Python list.
    namedEntityTagTupleList = [(word, tag) for word, tag in zip(words, netags)]
    neTagList = '\t'.join(netags).split('\t')

    return namedEntityTagTupleList, neTagList
def getRelationShipDic():
    asymmetricInFilePath = inout.getResourcePath('asymmetricRelationShip.txt')
    symmetricInFilePath = inout.getResourcePath('symmetricRelationShip.txt')

    infoList = inout.readListFromTxt(asymmetricInFilePath)
    infoList.extend(inout.readListFromTxt(symmetricInFilePath))
    print '归一化总关系数量:', len(infoList)

    # 初始化持久化对象字典
    initDic = dict()

    for lineItem in infoList:
        lineList = lineItem.strip().split('\t')
        key = lineList[0].strip()
        valueList = lineList[-1].strip()[1:-1].replace(' ', '').split(',')
        ## 这是处理的第一种方法
        initDic[key] = valueList
        ## 还可以有第二中方法

    return initDic
def changeLabelWordWeightList2Str(labelWordWeightList):
    '''
        Serialize a list of (label, weight) pairs into the string
        "(label,weight),(label,weight)," (note the trailing comma).

        NOTE(review): the original `def` line of this block was destroyed by
        copy/paste residue ("Esempio n. 3" / "0"); name and signature are
        reconstructed from the parameter name — confirm against real callers.
    '''
    parts = []
    for item in labelWordWeightList:
        parts.append('(' + str(item[0]) + ',' + str(item[1]) + ')')
    # join + trailing ',' reproduces the per-item concatenation, and is
    # linear instead of quadratic in the number of items.
    return ','.join(parts) + ',' if parts else ''


if __name__ == '__main__':

    # Destination file for analysis output.
    analysisPath = inout.getDataAnalysisPath('analysis.txt')

    ## Load the stop-word list.
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary de-duplication here; proper stop-word list merging can be
    # done upstream later.
    stopWordList = list(set(stopWordList))

    ## Load the relation dictionary.
    relationDic = persistent_relation_object.getRelationShipDic()

    # Widen pandas / numpy console output so wide frames print on one line.
    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)

    # corpusPath = inout.getDataOriginPath('special_corpus_copy.txt')
    corpusPath = inout.getDataOriginPath('special_corpus.txt')

    ## 1 Complex raw text must be cleaned first.
    # Sentence splitting can be applied during the initial cleaning pass.
# -*- coding:utf-8 -*-

from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
from utils import inout
import index

if __name__ == '__main__':

    # Load the segmenter (with user lexicon) and POS tagger once, outside
    # the loop, so each model is read from disk a single time.
    segmenter = Segmentor()
    segmenter.load_with_lexicon(inout.getLTPPath(index.CWS), inout.getResourcePath('userDic.txt'))
    tagger = Postagger()
    tagger.load(inout.getLTPPath(index.POS))

    sentences = inout.readListFromTxt('./dn_test.txt')

    for line in sentences:

        tokens = segmenter.segment(line)
        tags = tagger.postag(tokens)
        # result = zip(tokens, tags)
        # inout.printEscapeStr(result)

    segmenter.release()
    tagger.release()

    # recognizer = NamedEntityRecognizer()
    # recognizer.load(inout.getLTPPath(index.NER))
    # 检测ltp是否识别出命名实体 (check whether LTP recognized any named entities)
    # NOTE(review): this line and a stray closing `"""` were the tail of a
    # module docstring whose opening was lost when fragments were pasted
    # together; converted to comments so the file stays parseable.

if __name__ == '__main__':

    # Interactive NER smoke test: read a sentence from stdin, run the full
    # pyltp pipeline (segment -> POS-tag -> NER) and collect the tags.
    # testLine = '著名相声家成龙的师傅是马季。'

    # NOTE(review): the prompt advertises "-1" as an exit command, but no
    # break on testLine == '-1' is visible in this view — confirm that the
    # loop actually terminates further down.
    while True:
        testLine = raw_input('请输入字符串:(-1退出)')

        namedEntityTagTupleList = []

        # All three models are loaded and released on every iteration;
        # hoisting the loads out of the loop would be cheaper.
        segmentor = Segmentor()
        # segmentor.load(inout.getLTPPath(index.CWS))
        segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                    inout.getResourcePath('userDic.txt'))
        words = segmentor.segment(testLine)
        segmentor.release()
        postagger = Postagger()
        postagger.load(inout.getLTPPath(index.POS))
        postags = postagger.postag(words)
        postagger.release()
        recognizer = NamedEntityRecognizer()
        recognizer.load(inout.getLTPPath(index.NER))
        netags = recognizer.recognize(words, postags)
        recognizer.release()

        # Pair each word with its NER tag.
        for word, netag in zip(words, netags):
            namedEntityTagTupleList.append((word, netag))

        # The join/split round-trip converts pyltp's string vector into a
        # plain Python list.
        neTagList = '\t'.join(netags).split('\t')