def ConfirmMuseum(text, museum, textid):
    # nlp = BosonNLP('SeJUopMY.24669.6kCKU4ruI3ss')
    # nlp = BosonNLP('lMdMTyuV.24544.0VHv6klp6Pk6')
    nlp = BosonNLP('sjWBhf9i.24699.rQmsCad9c3Jv')
    try:
        flag = 0
        text = text[0:1000]  # only the first 1000 characters are sent to the API
        result = nlp.ner(text)[0]
        words = result['word']
        entities = result['entity']
        for entity in entities:
            if entity[2] == 'org_name':
                org_name = ''.join(words[entity[0]:entity[1]])
                if museum in org_name:
                    flag = 1
                    break
            elif entity[2] == 'location':
                location = ''.join(words[entity[0]:entity[1]])
                if museum in location:
                    flag = 1
                    break
        if flag:
            print('Confirm!')
            return 1
        else:
            print('Not!')
            return 0
    except KeyError as e:
        print('exit in %s' % textid)
        print(e)
        return 0  # treat a malformed API response as a non-match
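# A minimal usage sketch for ConfirmMuseum. The text, museum name, and id below
# are made-up examples; the token hard-coded inside the function is assumed valid.
matched = ConfirmMuseum('故宫博物院今日宣布推出新的特展。', '故宫博物院', 'doc_001')
print(matched)  # 1 if the museum shows up as an org_name/location entity, else 0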
def Entity_extraction(text):
    nlp = BosonNLP("x-gOGutn.27554.G6_6QvdJafES")
    rest = nlp.ner(text)[0]
    print(rest)
    words = rest['word']
    entities = rest['entity']
    tags = rest['tag']  # part-of-speech tags, currently unused
    for entity in entities:
        # join the words covered by the entity span and print them with the type
        print(" ".join(words[entity[0]:entity[1]]), entity[2])
import collections
import datetime

from bosonnlp import BosonNLP

# bosonnlp_token is expected to be defined in the project's configuration
class QueryParser(object):
    def __init__(self):
        self.nlp = BosonNLP(bosonnlp_token)

    def parse(self, query_string):
        """
        input: 7月22号 北京到上海的高铁票
        output:
        [{'entity': [[0, 3, 'time'], [3, 4, 'location'], [5, 6, 'location']],
          # understanding which patterns entities appear in requires context
          'tag': ['t', 'm', 'q', 'ns', 'p', 'ns', 'ude', 'n', 'n'],
          'word': ['7月', '22', '号', '北京', '到', '上海', '的', '高铁', '票']}]
        """
        result = self.nlp.ner(query_string)[0]
        words = result['word']
        tags = result['tag']
        entities = result['entity']
        return (words, entities, tags)

    def get_entity(self, parsed_words, index_tuple):
        """
        Fetch an already-recognized entity as a word span.
        Uses slicing in the spirit of the filter recipes in the Python Cookbook.
        input:
            index_tuple : (begin, end) index pair
            parsed_words : the parsed word list
        """
        return parsed_words[index_tuple[0]:index_tuple[1]]

    def format_entities(self, entities):
        """Give the entity triples named fields."""
        namedentity = collections.namedtuple('namedentity', 'index_begin index_end entity_name')
        return [namedentity(entity[0], entity[1], entity[2]) for entity in entities]

    def get_format_time(self, time_entity):
        """
        output {'timestamp': '2013-02-28 16:30:29', 'type': 'timestamp'}
        """
        basetime = datetime.datetime.today()
        result = self.nlp.convert_time(time_entity, basetime)
        # print(result)
        timestamp = result["timestamp"]
        return timestamp.split(" ")[0]
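# Usage sketch for QueryParser, assuming bosonnlp_token holds a valid token.
parser = QueryParser()
words, entities, tags = parser.parse('7月22号 北京到上海的高铁票')
for ent in parser.format_entities(entities):
    span = parser.get_entity(words, (ent.index_begin, ent.index_end))
    print(''.join(span), ent.entity_name)
print(parser.get_format_time('7月22号'))  # normalized date, e.g. '2016-07-22'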
def bosonnlpNER(news):
    from bosonnlp import BosonNLP
    nlp = BosonNLP('cKWUytiR.34676.f5F2YbS_EyX2')
    ner = nlp.ner(news)[0]
    print(ner)
    words = ner['word']
    entity = ner['entity']
    N = []
    # record where each entity starts and ends, k:v = start : end
    entity_start = {}
    for e in entity:
        if e[2] in {'org_name', 'person_name'}:
            entity_start[e[0]] = e[1]
            N.append([''.join(words[e[0]:e[1]]), e[2]])
    return N, entity_start, words
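# Usage sketch for bosonnlpNER with a made-up headline (the token hard-coded
# inside the function is assumed to still be valid):
names, starts, words = bosonnlpNER('微软创始人比尔·盖茨发表了演讲。')
print(names)   # e.g. [['微软', 'org_name'], ['比尔·盖茨', 'person_name']]
print(starts)  # {start word index: end word index} for each kept entity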
def getAnswerEntities(text_set, api_key, level):
    def f(x):
        # map the answer level onto the entity type(s) to extract
        return {
            '0': 'location',
            '1': 'person_name',
            '2': 'product_name',
            '3': ('org_name', 'company_name'),
            '4': ('product_name', 'org_name', 'company_name'),
        }[str(x)]

    nlp = BosonNLP(api_key)
    result = nlp.ner(text_set)[0]
    words = result['word']
    entities = result['entity']
    ul = []
    for entity in entities:
        if entity[2] == f(level) or entity[2] in f(level):
            ul.append(''.join(words[entity[0]:entity[1]]))
    keys = sortList(ul)  # sortList is an external helper from this project
    return keys
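# Usage sketch for getAnswerEntities. sortList comes from the original project;
# a stand-in that ranks candidates by frequency is assumed here, and the token
# placeholder must be replaced with a real one.
from collections import Counter

def sortList(items):
    return [w for w, _ in Counter(items).most_common()]

people = getAnswerEntities('李白和杜甫都是唐代著名诗人。', 'YOUR_API_TOKEN', 1)
print(people)  # person_name candidates, most frequent first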
class NERProcess(multiprocessing.Process):
    def __init__(self, nername, phrase_list, groupid=0):
        multiprocessing.Process.__init__(self)
        self.nername = nername
        self.phrase_list = phrase_list
        self.numofphrase = len(phrase_list)
        # batch ID, also used for the output file name
        self.group_id = str(groupid)
        # load the NER modules
        self.boson_ner = BosonNLP("bJ0hvqpK.21947.dpf19nyJfNHp")
        # self.conn = self.boson_ner.connect()
        # self.ltp_ner = LTPNer()
        self.jsonData = {}
        print "creating subprocess : " + self.nername + ":" + self.group_id + \
              ", number of phrase: " + str(self.numofphrase)

    def run(self):
        print "subprocess " + self.nername + ":" + self.group_id + " started @ " + time.ctime()
        for i in range(self.numofphrase):
            raw_text = self.phrase_list[i].text
            # "error" is not a valid error handler; "ignore" drops unencodable characters
            raw_text = raw_text.encode("utf-8", "ignore")
            document_id = str(self.phrase_list[i].document_id)
            phrase_id = str(self.phrase_list[i].id)
            boson_json = self.boson_ner.ner(raw_text)
            with open("./boson_result/" + document_id, "w") as f:
                json.dump(boson_json, f, ensure_ascii=False)
        print "subprocess " + self.nername + ":" + self.group_id + " ended @ " + time.ctime()
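# Launch sketch for NERProcess (Python 2, to match the class above). A tiny
# stand-in Phrase type is assumed here; real phrase objects come from the caller,
# and the ./boson_result/ directory must already exist.
class Phrase(object):
    def __init__(self, pid, document_id, text):
        self.id = pid
        self.document_id = document_id
        self.text = text

if __name__ == '__main__':
    phrases = [Phrase(1, 'doc1', u'微软今日发布新版操作系统。')]
    worker = NERProcess("boson", phrases, groupid=0)
    worker.start()
    worker.join()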
def boson_nre_testCase():
    nlp = BosonNLP(API_TOKEN)
    os.chdir(TXT_PATH)
    fd = codecs.open("1.txt", 'r', encoding='utf-8', errors="ignore", buffering=1)
    docText = fd.read()
    result = nlp.ner(docText)[0]
    words = result['word']
    entities = result['entity']
    docEntityList = []
    for i in range(len(entities)):
        mergedToken = u"".join(words[entities[i][0]:entities[i][1]])
        token_candidate = (mergedToken, str(entities[i][2]))
        # check for redundant tokens; only add new tokens to the entity list
        if token_candidate not in docEntityList:
            docEntityList.append(token_candidate)
    # return docEntityList
    print "=============="
    print len(docEntityList)
    for token, entity_type in docEntityList:
        print token, " ", "type:", entity_type
def boson_nre_batch():
    os.chdir("E:\\wordDir")
    fd_out = codecs.open("EntityList.txt", 'a', encoding="utf-8", errors='ignore', buffering=1)
    nlp = BosonNLP(API_TOKEN)
    entityCollection = []
    os.chdir(TXT_PATH)
    for files in os.listdir(os.getcwd()):
        fd = codecs.open(files, "r", encoding="utf-8")
        docText = fd.read()
        fd.close()
        result = nlp.ner(docText)[0]
        words = result['word']
        entities = result['entity']
        docEntityList = []
        seen = set()
        for i in range(len(entities)):
            mergedToken = u"".join(words[entities[i][0]:entities[i][1]])
            key = (mergedToken, str(entities[i][2]))
            # check for redundant tokens; only add new tokens to the entity list
            if key not in seen:
                seen.add(key)
                # keep the file name (decoded from the local gb2312 encoding) with each token
                docEntityList.append((mergedToken, str(entities[i][2]), files.decode('gb2312')))
        print files.decode('gb2312'), " ", len(docEntityList)
        entityCollection.append(docEntityList)
    for docEntityList in entityCollection:
        for token, entity_type, fname in docEntityList:
            fd_out.write(token + "||" + entity_type + "||" + fname + "\n")
    fd_out.close()
# person_name -> PER
# org_name, company_name, product_name -> ORG
from bosonnlp import BosonNLP
import os

api = "avzp5h2G.21940.kBiq3cew8Oct"
# nlp = BosonNLP(os.environ[api])
nlp = BosonNLP(api)

Input_file = './data/original.txt'
Output_file = './data/output_news.txt'

# read the whole text; ner() returns one result dict per input document
Input_file = open(Input_file, 'r', encoding='utf-8').read()
result = nlp.ner(Input_file)[0]
entity, tag, word = result['entity'], result['tag'], result['word']

LOC_type = ['location']
PER_type = ['person_name']
ORG_type = ['org_name', 'company_name', 'product_name']

with open(Output_file, 'w', encoding='utf-8') as f:
    lastindex = 0
    for s, t, entity_type in entity:
        # words between entities get the O (outside) tag
        if s > lastindex:
            f.write(''.join(word[lastindex:s]) + ' O\n')
        # map the fine-grained BosonNLP type onto the coarse tag set above
        if entity_type in PER_type:
            coarse = 'PER'
        elif entity_type in ORG_type:
            coarse = 'ORG'
        elif entity_type in LOC_type:
            coarse = 'LOC'
        else:
            coarse = 'O'
        f.write(''.join(word[s:t]) + ' ' + coarse + '\n')
        lastindex = t
    # anything after the last entity is also outside
    if lastindex < len(word):
        f.write(''.join(word[lastindex:]) + ' O\n')
import networkx as nx
import matplotlib.pyplot as plot
from bosonnlp import BosonNLP

def Entity_extraction(sentence):
    # entity analysis of the text
    nlp = BosonNLP('TPDuivpZ.27572.rVuPCI9-kUlN')
    result = nlp.ner(sentence)
    pairs = []
    for start, end, etype in result[0]['entity']:
        # join the entity's whole word span, not just its first word
        pairs.append([''.join(result[0]['word'][start:end]), etype])
    location_list = []  # place names
    time_list = []      # times
    person_name = []    # person names
    job_list = []       # job titles
    for item, etype in pairs:
        if etype == 'location':
            location_list.append(item)
        if etype == 'time':
            time_list.append(item)
        if etype == 'person_name':
            person_name.append(item)
        if etype == 'job_title':
            job_list.append(item)
    location_list = list(set(location_list))
    time_list = list(set(time_list))
    person_name = list(set(person_name))
    job_list = list(set(job_list))
    location = {'地名': location_list}
    time = {'时间': time_list}
    person = {'人名': person_name}
    job = {'工作': job_list}
    print('地名:{}\n\n时间:{}\n\n人名:{}\n\n工作:{}'.format(
        location['地名'], time['时间'], person['人名'], job['工作']))
    # draw the entity analysis as a graph (node labels stay in Chinese)
    plot.rcParams['font.sans-serif'] = ['SimHei']  # needed to render Chinese labels
    plot.rcParams['axes.unicode_minus'] = False
    DG = nx.DiGraph()
    plot.figure(figsize=(8, 8))
    plot.subplot(1, 1, 1)
    DG.add_nodes_from(['文本', '地名', '时间', '人名', '工作'])
    DG.add_edges_from([('文本', '地名'), ('文本', '人名'), ('文本', '工作'), ('文本', '时间')])
    location_next = location['地名']
    DG.add_nodes_from(location_next)
    for node in location_next:
        DG.add_edge('地名', node)
    time_next = time['时间']
    DG.add_nodes_from(time_next)
    for node in time_next:
        DG.add_edge('时间', node)
    person_next = person['人名']
    DG.add_nodes_from(person_next)
    for node in person_next:
        DG.add_edge('人名', node)
    job_next = job['工作']
    DG.add_nodes_from(job_next)
    for node in job_next:
        DG.add_edge('工作', node)
    # one color per category hub, then one per leaf node, in insertion order
    colors = ['red', 'deepskyblue', 'magenta', 'limegreen', 'dimgrey']
    colors += ['lightblue'] * len(location_next)
    colors += ['plum'] * len(time_next)
    colors += ['lightgreen'] * len(person_next)
    colors += ['darkgray'] * len(job_next)
    nx.draw(DG, with_labels=True, node_size=700, node_color=colors)
    plot.title('文本实体分析', color='red', fontsize=20)
    plot.show()
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals, division
from bosonnlp import BosonNLP

# Test the Boson entity recognition API.
# Note: replace this with your own API token when testing.
nlp = BosonNLP('Reg0KvHM.17970.YFwdM3sID8xa')
test = open("地标.txt")
try:
    correct = 0
    list_of_test = []
    for line in test.readlines()[100:104]:
        list_of_test.append(line.strip('\n'))
    for sample in list_of_test:
        result = nlp.ner(sample)[0]
        words = result['word']
        entities = result['entity']
        for entity in entities:
            if entity[2] in ('location', 'org_name', 'company_name'):
                correct = correct + 1
            else:
                print(''.join(words[entity[0]:entity[1]]), entity[2])
    # true division also on Python 2, thanks to the __future__ import above
    precision = correct / len(list_of_test)
    print(correct, precision)
finally:
    test.close()
mytoken = 'wHXup4Wh.13586.tkz9YxHxkO_o'
nlp = BosonNLP(mytoken)

# initialize the text
i = 77
id = rlaq_u2.loc[i, 'id']
ajbh = rlaq_u2.loc[i, 'ajbh']
fssj = pd.to_datetime(rlaq_u2.loc[i, 'time'])
txt = rlaq_u2.loc[i, 'jyaq']
txt0 = txt
place = rlaq_u2.loc[i, 'place']
print(txt0)

# extract the times, locations, and people
result = nlp.ner(txt)[0]
words = result['word']
entities = result['entity']
Btime = []
Bplace = []
Bpeople = []
for entity in entities:
    if entity[2] == 'time':
        Btime.append(''.join(words[entity[0]:entity[1]]))
    if entity[2] == 'location':
        Bplace.append(''.join(words[entity[0]:entity[1]]))
    if entity[2] == 'person_name':
        Bpeople.append(''.join(words[entity[0]:entity[1]]))
print('时间:', ' | '.join(Btime))
print('地点:', ' | '.join(Bplace))
print('人物:', ' | '.join(Bpeople))
# -*- encoding: utf-8 -*-
from bosonnlp import BosonNLP
import os

# reference: http://bosonnlp-py.readthedocs.io/#bosonnlp-py
nlp = BosonNLP('YOUR_BOSONNLP_API_TOKEN')
# or nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])

nlp.ner('你好啊', sensitivity=2)
nlp.ner(['成都商报记者 姚永忠', '微软XP操作系统今日正式退休'])

result = nlp.tag('成都商报记者 姚永忠')
format_tag_result(result[0])  # helper from the bosonnlp-py docs, assumed defined elsewhere

result = nlp.tag(['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽'], oov_level=0)
result = nlp.tag("成都商报记者 姚永忠", space_mode=2)
# Input_file is assumed to already hold the input lines (e.g. from readlines());
# batch them 78 lines at a time so each NER request stays small.
index = 0
cnt = 0
flag = 0
text = []
entities, tags, words = [], [], []
for line in Input_file:
    cnt += 1
    print(len(Input_file), cnt)
    if index < 78:
        text.append(line)
        index += 1
    if cnt == len(Input_file):
        flag = 1  # last line of the input reached
    # print(index, line)
    if (index >= 78) or (flag == 1):
        index = 0
        text_str = '\n'.join(text)
        print(text_str)
        ner_dict = nlp.ner(text_str)
        print(ner_dict)
        entity = ner_dict[0]['entity']
        word = ner_dict[0]['word']
        tag = ner_dict[0]['tag']
        print(entity, word)
        entities.append(entity)
        tags.append(tag)
        words.append(word)
        text = []
        # text.append(line)

with open(Output_file, 'w', encoding='utf-8') as f:
    for entity, word in zip(entities, words):
        print(entity, word)
        lastindex = 0
        # write each batch: entity spans keep their BosonNLP type, gaps are tagged O
        for s, t, entity_type in entity:
            if s > lastindex:
                f.write(''.join(word[lastindex:s]) + ' O\n')
            f.write(''.join(word[s:t]) + ' ' + entity_type + '\n')
            lastindex = t
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP

# Note: replace this with your own API token when testing.
nlp = BosonNLP('VaUKhf7X.7870.xbHiGWB_gx49')
s = ['中新网周口9月15日电(刘鹏) 15日,针对媒体报道的河南省太康县女子在当地一家KTV遭3名协警暴力殴打一事,太康县警方向记者回复称,3名打人者中两名为协警身份,其中一名协警未参与打架,但目前两名协警均被辞退。而当晚一同前往KTV娱乐的一名正式女民警被关禁闭。 据之前媒体报道,今年9月4日晚11时左右,太康县一家KTV内,一名姜姓女士在送走一位朋友后正返回KTV时,在门口碰到正从里面出来的三名男子。其中一名男子对姜女士动手动脚,另一男子则说姜女士为“小姐”。 受到羞辱的姜女士要求对方赔礼道歉。没想到竟遭到了三名男子拳脚相加。据姜女士反映,事发当晚黑衣男子对她一番推搡致其头部撞到门上;绿衣男子则直接拽着她的头发将其摁倒在地,随后又遭到了拳头打脸、脚踹并拉着衣服将其头往门上撞。姜女士试图报警,结果三名男子将其手机夺走摔到地上。为了阻止围观群众报警,白衣男子直接拿出“警官证”,称自己是刑警队人员,若是报警,不把录像删了,就把KTV店给砸了。 15日上午,太康县公安局发布对此事件的调查处理通报。通报称,9月4日晚,葛某(太康县人,无业)、师某(协警)等人到盛世年华夜总会唱歌,当晚23时结束后,师某、葛某与姜某发生争执吵骂,并引起厮打,致使姜某轻微伤。目前双方已达成调解协议,姜某对师某、葛某达成谅解。 太康县公安局负责处理此事的王姓警官透露,事发当晚,和打人者葛某、师某一同前往KTV娱乐的还有该局一名刚入职不久的女民警李某某及协警司某等人,但他们并未参与打架。后经太康县公安局党委研究决定,对违规进入娱乐场所的民警李某某先行禁闭,待调查结束后再做处理;对违规进入娱乐场所的协警师某、司某予以辞退。'
     '纪检部门仍在调查之中。成都商报记者 姚永']
result = nlp.ner(s)[0]
words = result['word']
entities = result['entity']
for entity in entities:
    print(''.join(words[entity[0]:entity[1]]), entity[2])
print(s)
result = nlp.sentiment(s)
print(result)
class BosonNlpp:
    def __init__(self):
        self.bonlp = BosonNLP('IKBIoANy.14545.A7GCYBnT9jIB')

    # sentiment analysis
    def testSentiment(self, s):
        result = self.bonlp.sentiment(s)
        return result

    # named entity recognition
    def lexicalAnalysis(self, s):
        result = self.bonlp.ner(s)[0]
        return result

    # dependency parsing
    def textDependency(self, s):
        result = self.bonlp.depparser(s)
        return result

    # keyword extraction
    def testKeywords(self, s):
        result = self.bonlp.extract_keywords(s, top_k=10)
        return result

    # news classification
    def textClassify(self, s):
        resultlist = self.bonlp.classify(s)
        classifys = {0: '体育', 1: '教育', 2: '财经', 3: '社会', 4: '娱乐',
                     5: '军事', 6: '国内', 7: '科技', 8: '互联网', 9: '房产',
                     10: '国际', 11: '女人', 12: '汽车', 13: '游戏'}
        return classifys[resultlist[0]]

    # semantic word suggestion
    def lexicalSynonym(self, term):
        result = self.bonlp.suggest(term, top_k=10)
        return result

    # segmentation and part-of-speech tagging
    def fenci(self, s):
        result = self.bonlp.tag(s)
        return result

    # news summarization
    def newssubstract(self, s):
        # on Python 3, s is already a str; no encode/decode step is needed
        result = self.bonlp.summary('', s)
        return result
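# Usage sketch for BosonNlpp (the hard-coded token is assumed to still be
# valid; the sample sentences are made up):
app = BosonNlpp()
print(app.testSentiment('这家餐厅的服务非常好。'))  # [[positive, negative]] probabilities
print(app.lexicalAnalysis('成都商报记者 姚永忠'))  # one NER result dict
print(app.textClassify('亚投行意向创始成员国确定为57个'))  # a category name such as '财经'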
def bosonNer(text, sensitivity):
    nlp = BosonNLP('O8M_j1Nd.4200.wIlhsL46w9-C')
    return nlp.ner(text, sensitivity=sensitivity)
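# Usage sketch for bosonNer: sensitivity runs from 1 (more entities found)
# to 5 (higher precision); the hard-coded token is assumed to still be valid.
result = bosonNer('成都商报记者 姚永忠', 3)[0]
print(result['entity'])  # [[start, end, type], ...] spans over result['word']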
def bosonNer(text, sensitivity):
    nlp = BosonNLP('qJWJc-f3.4334.MamzfHZ-9wUL')
    return nlp.ner(text, sensitivity=sensitivity)
### 4. Named entity recognition
from bosonnlp import BosonNLP

words_list = list()
nlp = BosonNLP('g8lQg9Mv.25818.fAbbwt6TYhh8')  # use your own token

result = nlp.tag('承德市长江大桥')
print(result)
print(result[0]['word'])
print(result[0]['tag'])
for i in range(len(result[0]['word'])):
    print(result[0]['word'][i] + '/' + result[0]['tag'][i], end=' ')
print()
print(' '.join([a + '/' + b for a, b in zip(result[0]['word'], result[0]['tag'])]))

# sensitivity (int, default 3) – the trade-off between precision and recall:
# 1 finds more entities, 5 finds entities with higher precision.
sentence = '美国国防部发言人威廉斯说,伊拉克持续将五艘共约载有万桶原油的超级油轮,与距科威特海岸五公里处的海岛石油转运站的原油倾入北波斯湾。'
result = nlp.ner(sentence, sensitivity=2)
print(result[0]['word'])
print(result[0]['tag'])
print(result[0]['entity'])
for s, e, entity in result[0]['entity']:
    print('%-14s\t%s' % (''.join(result[0]['word'][s:e]), entity))