def namedEntityRecognize(sentence):
    """Run pyltp named-entity recognition over a single sentence.

    Returns a 2-tuple:
      1) list of (word, netag) pairs, one per segmented token;
      2) flat list of the entity tags alone.
    """
    # Word segmentation, using a user lexicon for domain-specific terms.
    seg = Segmentor()
    seg.load_with_lexicon(inout.getLTPPath(index.CWS),
                          inout.getResourcePath('userDic.txt'))
    words = seg.segment(sentence)
    seg.release()

    # Part-of-speech tagging over the segmented words.
    pos_tagger = Postagger()
    pos_tagger.load(inout.getLTPPath(index.POS))
    postags = pos_tagger.postag(words)
    pos_tagger.release()

    # Named-entity recognition needs both words and their POS tags.
    ner = NamedEntityRecognizer()
    ner.load(inout.getLTPPath(index.NER))
    netags = ner.recognize(words, postags)
    ner.release()

    # Pair every word with its entity tag.
    namedEntityTagTupleList = [(word, netag) for word, netag in zip(words, netags)]
    # Join/split round-trip converts the pyltp tag container into a plain
    # Python list of tag strings.
    neTagList = '\t'.join(netags).split('\t')
    return namedEntityTagTupleList, neTagList
def getRelationShipDic(): asymmetricInFilePath = inout.getResourcePath('asymmetricRelationShip.txt') symmetricInFilePath = inout.getResourcePath('symmetricRelationShip.txt') infoList = inout.readListFromTxt(asymmetricInFilePath) infoList.extend(inout.readListFromTxt(symmetricInFilePath)) print '归一化总关系数量:', len(infoList) # 初始化持久化对象字典 initDic = dict() for lineItem in infoList: lineList = lineItem.strip().split('\t') key = lineList[0].strip() valueList = lineList[-1].strip()[1:-1].replace(' ', '').split(',') ## 这是处理的第一种方法 initDic[key] = valueList ## 还可以有第二中方法 return initDic
    # NOTE(review): this chunk begins mid-function -- the enclosing `def` line
    # is not visible here, so indentation is restored on a best-effort basis.
    # The fragment appears to format (label, weight) pairs into one string.
    ''' '''
    resultStr = ''
    # Append each pair as "(label,weight)," -- note the trailing comma after
    # every pair, including the last one.
    for item in labelWordWeightList:
        resultStr = resultStr + '(' + str(item[0]) + ',' + str(
            item[1]) + ')' + ','
    return resultStr


if __name__ == '__main__':
    # Output path for the analysis results.
    analysisPath = inout.getDataAnalysisPath('analysis.txt')
    ## Load the stop-word list.
    stopWordPath = inout.getResourcePath('stopWordList.txt')
    stopWordList = inout.readListFromTxt(stopWordPath)
    # Temporary logic: de-duplicate here; proper stop-word merging can be
    # added later.
    stopWordList = list(set(stopWordList))
    ## Load the relation dictionary.
    relationDic = persistent_relation_object.getRelationShipDic()
    # Widen pandas/numpy console output for easier inspection.
    pd.set_option('display.width', 300)
    np.set_printoptions(linewidth=300, suppress=True)
    # corpusPath = inout.getDataOriginPath('special_corpus_copy.txt')
    corpusPath = inout.getDataOriginPath('special_corpus.txt')
    ## 1 Complex text data must be cleaned first.
    # Sentence splitting can be used during the initial text-cleaning pass.
    # NOTE(review): the script continues beyond this visible chunk.
# -*- coding:utf-8 -*- from pyltp import Segmentor from pyltp import Postagger from pyltp import NamedEntityRecognizer from utils import inout import index if __name__ == '__main__': segmentor = Segmentor() segmentor.load_with_lexicon(inout.getLTPPath(index.CWS), inout.getResourcePath('userDic.txt')) postagger = Postagger() postagger.load(inout.getLTPPath(index.POS)) infoList = inout.readListFromTxt('./dn_test.txt') for sentence in infoList: # segmentor.load(inout.getLTPPath(index.CWS)) words = segmentor.segment(sentence) postags = postagger.postag(words) # result = zip(words,postags) # inout.printEscapeStr(result) segmentor.release() postagger.release() # recognizer = NamedEntityRecognizer() # recognizer.load(inout.getLTPPath(index.NER))
检测ltp是否识别出命名实体
"""

if __name__ == '__main__':
    # testLine = '著名相声家成龙的师傅是马季。'
    while True:
        # Python 2 raw_input; the prompt advertises "-1 to quit" but no exit
        # check on '-1' is visible in this chunk -- presumably it follows
        # beyond the visible span; TODO confirm.
        testLine = raw_input('请输入字符串:(-1退出)')
        namedEntityTagTupleList = []
        # Models are loaded and released on every iteration; kept as-is
        # (doc-only edit).
        segmentor = Segmentor()
        # segmentor.load(inout.getLTPPath(index.CWS))
        segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                    inout.getResourcePath('userDic.txt'))
        words = segmentor.segment(testLine)
        segmentor.release()
        postagger = Postagger()
        postagger.load(inout.getLTPPath(index.POS))
        postags = postagger.postag(words)
        postagger.release()
        recognizer = NamedEntityRecognizer()
        recognizer.load(inout.getLTPPath(index.NER))
        netags = recognizer.recognize(words, postags)
        recognizer.release()
        # Pair every word with its entity tag.
        for word, netag in zip(words, netags):
            namedEntityTagTupleList.append((word, netag))
        # Convert the pyltp tag container into a plain list of tag strings.
        neTagList = '\t'.join(netags).split('\t')