def seg(content):
    # Set your own model path
    MODELDIR = "/home/liuqi/ltp/pyltp/ltp_data/"
    segmentor = Segmentor()
    segmentor.load(MODELDIR + "cws.model")
    tWords = segmentor.segment(content)
    return tWords
def split_words(sentence="中国进出口银行与中国银行加强合作", type_list=0):
    """Segment a sentence; if type_list is truthy, return the segmentation as a list."""
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment(sentence)
    if type_list:
        return list(words)
    return words
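# A minimal usage sketch for split_words above, assuming `os` and Segmentor are already
# imported and that MODELDIR (a hypothetical path here) points at an unpacked ltp_data
# directory containing cws.model.
if __name__ == "__main__":
    MODELDIR = "/path/to/ltp_data"   # hypothetical model directory
    print(split_words("中国进出口银行与中国银行加强合作", type_list=1))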
class pnn_count():
    def __init__(self):
        self.mydict = {}
        self.segmentor = Segmentor()
        self.segmentor.load('cws.model')
        self.hash_dict()
        self.ltp_process()

    def ltp_process(self):
        sentence_num = 0
        right_num = 0
        f = open('pnn_annotated.txt', 'r')
        for line in f:
            sentence_num += 1
            #print line
            line_array = line.split('\t')
            line = line_array[1]
            count = 0
            words = self.segmentor.segment(line)
            for i in words:
                if self.mydict.has_key(i):
                    count = count + self.mydict[i]
            if count > 0:
                answer = "positive"
                if line_array[0] == '1':
                    right_num += 1
            elif count == 0:
                answer = "neuter"
                if line_array[0] == '0':
                    right_num += 1
            else:
                answer = "negative"
                if line_array[0] == '-1':
                    right_num += 1
            #print "My guess is %s" % answer
            #print "The right answer is %s" % line_array[0]
            #print "result %d" % count
        f.close()
        print "total sentence is %d, right answer is %d" % (sentence_num, right_num)

    def hash_dict(self):
        f = open('negative.txt', 'r')
        for line in f:
            line = line.strip('\n')
            line = line.strip('\r')
            self.mydict[line] = -1
        f.close()
        f = open('positive.txt', 'r')
        for line in f:
            line = line.strip('\n')
            line = line.strip('\r')
            self.mydict[line] = 1
        f.close()
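# A hedged usage sketch for the dictionary-based polarity counter above: it assumes
# cws.model, pnn_annotated.txt, positive.txt and negative.txt all sit in the working
# directory (file names taken from the class itself); instantiation runs everything.
if __name__ == '__main__':
    pnn_count()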
def process(index):
    ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
    sys.path.append(os.path.join(ROOTDIR, "lib"))
    # Set your own model path
    MODELDIR = os.path.join(ROOTDIR, "ltp_data")
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    finname = "o_" + str(index) + ".txt"
    foutname = "p_" + str(index) + ".txt"
    print finname
    count = 0
    fin = codecs.open(finname, encoding='utf-8')
    with codecs.open(foutname, 'w', encoding="utf-8") as fout:
        while 1:
            line = fin.readline()
            if not line:
                break
            tmp = line.split(" ^ {")[1]  # Get JSON
            tmp = "{" + tmp
            data = json.loads(tmp)
            content = data['content']
            # error_correction(content)
            content = content.strip()
            segmentation = ""
            for line in content.split("\n"):
                line = line.encode("utf-8")
                words = segmentor.segment(line)
                segmentation += "/".join(words)
                segmentation += "/"
            # Return type of segment() is str, not unicode, so convert to unicode.
            segmentation = unicode(segmentation, "utf-8")
            pinyin = add_pinyin(segmentation)
            obj = {}
            obj['flavor'] = data['flavor']
            obj['environment'] = data['environment']
            obj['service'] = data['service']
            obj['content'] = data['content']
            obj['segmentation'] = segmentation
            obj['pinyin'] = pinyin
            tmpstr = json.dumps(obj, ensure_ascii=False)
            fout.write(tmpstr)
            fout.write('\n')
            count += 1
    print count
    segmentor.release()
def segmentation(filename, output_filename):
    print "segmenting '%s' to '%s'" % (filename, output_filename)
    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"
    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))

    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0] + "_ner.txt", "w")
    for _line in lines:
        line = _line[:-1]
        if line[-1] in "\n\r":
            line = line[:-1]
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        # netags = recognizer.recognize(words, postags)
        # arcs = parser.parse(words, postags)
        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
            # if netags[i] != 'O':
            #     fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
        # fner.write("\n")
    f.close()
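# A hedged usage sketch for segmentation() above; the input and output file names
# are illustrative assumptions, and ./ltp_data/ must hold the LTP models.
if __name__ == "__main__":
    segmentation("input.txt", "output_seg.txt")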
def __init__(self, data):
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model path, file name `cws.model`
    self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS-tagging model path, file name `pos.model`
    self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # named-entity recognition model path, file name `ner.model`

    segmentor = Segmentor()
    segmentor.load(self.cws_model_path)
    self.words = segmentor.segment(data)
    # print("|".join(words))
    segmentor.release()

    postagger = Postagger()                      # initialize the instance
    postagger.load(self.pos_model_path)          # load the model
    self.postags = postagger.postag(self.words)  # POS tagging
    # print('\t'.join(postags))
    postagger.release()                          # release the model

    recognizer = NamedEntityRecognizer()         # initialize the instance
    recognizer.load(self.ner_model_path)         # load the model
    self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
    # print('\t'.join(netags))
    recognizer.release()                         # release the model
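# A standalone sketch of the same load -> run -> release pipeline used by the
# constructor above, assuming LTP_DATA_DIR points at an unpacked ltp_data directory;
# the directory path and sample sentence are illustrative assumptions.
import os
from pyltp import Segmentor, Postagger, NamedEntityRecognizer

LTP_DATA_DIR = '/path/to/ltp_data'   # hypothetical model directory

segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))
words = segmentor.segment('欧洲东部的罗马尼亚,首都是布加勒斯特。')
segmentor.release()

postagger = Postagger()
postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))
postags = postagger.postag(words)
postagger.release()

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
netags = recognizer.recognize(words, postags)
recognizer.release()

print('\t'.join('%s/%s/%s' % t for t in zip(words, postags, netags)))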
def pyltp_words():
    from pyltp import Segmentor, Postagger
    segmentor = Segmentor()
    segmentor.load("/home/fredgan/github/pyltp/ltp_data/cws.model")
    # postagger = Postagger()
    # postagger.load("~/github/pyltp/ltp_data/cpos.model")
    for line in open(sys.argv[1], 'r'):
        try:
            style, sentence = line.strip().split('\t')
        except:
            continue
        style_dic.setdefault(style, {})
        words = segmentor.segment(sentence)
        # postags = postagger.postag(words)
        for w in words:
            if w in style_dic[style]:
                style_dic[style][w] += 1
            else:
                style_dic[style][w] = 1
    for k, v in style_dic.iteritems():
        v_list = sorted(v.iteritems(), key=lambda d: d[1], reverse=True)
        print k + "\t" + " ".join(map(lambda i: i[0] + ":" + str(i[1]), v_list[0:min(50, len(v_list))]))
class EventInfoExtract(): def __init__(self,modulePath,outfile): self.MODELDIR = modulePath self.adict = { '·' :'', '的':'', '了':'', '“':'', '”':'', '一次':'' } self.segmentor=None self.postagger=None self.parser=None self.recognizer=None self.out_file=outfile def multiple_replace(self,text): rx = re.compile('|'.join(map(re.escape, self.adict))) def one_xlat(match): return self.adict[match.group(0)] return rx.sub(one_xlat, text) def InitModule(self): #print "正在加载LTP模型... ..." self.segmentor = Segmentor() #print os.path.join(self.MODELDIR, "cws.model") self.segmentor.load(os.path.join(self.MODELDIR, "cws.model")) #分词模型,单文件 self.postagger = Postagger() self.postagger.load(os.path.join(self.MODELDIR, "pos.model")) #词性标注模型,单文件 self.parser = Parser() self.parser.load(os.path.join(self.MODELDIR, "parser.model")) #依存句法分析模型,单文件 self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(self.MODELDIR, "ner.model")) #命名实体识别模型,单文件 #print self.recognizer def release_module(self): ''' release the model ''' self.segmentor.release() self.segmentor=None self.postagger.release() self.postagger=None self.parser.release() self.parser=None self.recognizer.release() self.recognizer=None def Txtextraction_start(self,txt,out_file): """ 事实三元组的控制程序 Args: txt:带抽取的内容 """ txt = txt.strip() out_file = open(self.out_file, 'a') #try: #print "Execute here====-====" self.fact_triple_extract(txt,out_file) out_file.flush() out_file.close() def addresssTime_extract(self,inputtxt): #这个地方先做实体抽取,提取出人物、组织和相关的时间,首先分词,得到分词结果 #words = self.segmentor.segment(inputtxt) sentences = inputtxt.split('。') #print sentences DataAndTime=[] for sentence in sentences: if len(sentence)<=1: continue #sentence = u"北京是中国首都" words = self.segmentor.segment(sentence) #print '\t'.join(words) postags = self.postagger.postag(words) netags = self.recognizer.recognize(words, postags) #print '\t'.join(postags) arcs = self.parser.parse(words, postags) #print "sentence;===========132123123123123" Dt={'date':'','address':''} if (("发生" in sentence or "遭" in sentence) and ("爆炸" in sentence or "事件" in sentence or "袭击" in sentence )) or (("恐怖" in sentence) or ("袭击" in sentence)): Flag=False #print '\t'.join(words) #print '\t'.join(postags) #print '\t'.join(postags) Addressbackups=[] Address ='' for i in range(len(postags)-1): if Flag==True: if postags[i]=='ns'or postags[i]=='nd' or postags[i]=='n': # ns 地理名 nd方向名词 n一般名词 head = arcs[i].head Address=Address+words[i] if postags[head-1]=="n": Address+=words[head-1] head = arcs[head-1].head if(words[head-1]=="在" or words[head-1]=="发生" or words[head-1]=="袭击" or words[head-1]=="遭" or words[head-1]=="遭遇" or words[head-1]=="将"): Dt['address']=Address break else: print "地址,",Address Addressbackups.append(Address) Address='' Flag=False continue if postags[i]=='ns' and Flag == False: #这个地方只会第一次进来。 head = arcs[i].head Address = Address+words[i] if (words[head-1]=="在" or words[head-1]=="发生" or words[head-1]=="遭" or words[head-1]=="遭遇" or words[head-1]=="将"): Dt['address']=Address break #if postags[i+1]!='ns' or postags[i+1]!='nd' or postags[i+1]!='n': # print "wewewerwer====,",Address # Addressbackups.append(Address) Flag = True #print Addressbackups[0] if ("月" in sentence or '日' in sentence) and ("发生" in sentence or "袭击" in sentence): Flag = False Date='' Datebackup=[] for i in range(len(postags)-1): if Flag==True: if postags[i]=='nt': #print words[i] head = arcs[i].head Date=Date+words[i] if words[head-1]=="发生" or words[head-1]=="袭击": Dt['date']=Date break else: Datebackup.append(Date) Date='' Flag=False 
continue if postags[i]=='nt' and Flag == False: Date = Date+words[i] #获取一下head head = arcs[i].head if words[head-1]=="发生" or words[head-1]=="袭击": Dt['date']=Date break if postags[i+1]!='nt': Datebackup.append(Date) #index=i Flag = True if Dt['date']=='' and len(Datebackup): Dt['date']=Datebackup[-1] if Dt['date']!='' or Dt['address']!='': DataAndTime.append(Dt) if len(DataAndTime)>1: for i in DataAndTime: if i['date']=="当天": DataAndTime.remove(i) if len(DataAndTime)==0: Dt['date']='' Dt['address']='' DataAndTime.append(Dt) return DataAndTime def extraction_start(self, input_txt,out_file_name): """ 事实三元组抽取的总控程序 Args: in_file_name: 输入文件的名称 #out_file_name: 输出文件的名称 begin_line: 读文件的起始行 end_line: 读文件的结束行 """ #in_file = open(in_file_name, 'r') out_file = open(out_file_name, 'a') line_index = 1 sentence_number = 0 text_line = input_txt while text_line: if line_index < begin_line: text_line = in_file.readline() line_index += 1 continue if end_line != 0 and line_index > end_line: break sentence = text_line.strip() if sentence == "" or len(sentence) > 1000: text_line = in_file.readline() line_index += 1 continue try: sentence_one = sentence.split(" ")#"。" for num in range(len(sentence_one)-1): self.fact_triple_extract(sentence, out_file) out_file.flush() except: pass sentence_number += 1 if sentence_number % 50 == 0: print "%d done" % (sentence_number) text_line = in_file.readline() line_index += 1 in_file.close() out_file.close() def attribute_define0(self,text,keywords): words = self.segmentor.segment(text) postags = self.postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): if(words[index]==keywords): for i in range(index): if(postags[index-i-2][0]=='n'): continue else: print "事件属性:","".join(words[index-i-1:index+1]) break def attribute_define1(self,text,keywords): words = self.segmentor.segment(text) postags = self.postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): if(words[index]==keywords): for i in range(index): if(postags[index-i-2][0]=='n'): continue else: if(i != 0): print "事件属性:","".join(words[index-i-1:index+1]) break def num_define(self,text): words = self.segmentor.segment(text) postags = self.postagger.postag(words)#词性标注 for index in range(len(words)): if(postags[index]=='m'): return words[index] def attribute_define2(self,text,keywords): words = self.segmentor.segment(text) #postags = postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): if(words[index]==keywords): for i in range(index): if(words[index-i-1]!=('发生' or '是')):#|(words[index-i-1]!='遭遇'): continue else: if(i != 0): attribute = "".join(words[index-i:index+1]) #attribute = multiple_replace(attribute) print '===========' if attribute in '恐怖袭击事件': return return attribute else: return def organization_define(self,text,keywords): words = self.segmentor.segment(text) postags = self.postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): if(words[index]==keywords): for i in range(index): if(postags[index-i-1][0]=='n')&(index-i-1 != 0): continue else: if(words[index-1]=='组织')&(postags[index-2][0]!='n'): continue if(i != 0): print "组织:","".join(words[index-i:index]) return "".join(words[index-i:index]) def organization_define1(self,text,keywords): words = self.segmentor.segment(text) postags = self.postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): if(words[index]==keywords): for i in range(index): if(postags[index-i-1][0]=='n')&(index-i-1 != 0): continue else: 
if(words[index-1]=='组织')&(postags[index-2][0]!='n'): continue if(i != 0): #print "组织:","".join(words[index-i:index]) return "".join(words[index-i:index]) def fact_attribute_from_text(self,text): """ """ text = text.replace(',','。') sentence_one = text.split("。") fact_attribute = [] for num in range(len(sentence_one)-1): if('袭击' in sentence_one[num]): #attribute_define0(sentence_one[num],'事件') #print sentence_one[num] sentence_temp = self.multiple_replace(sentence_one[num]) if('发生' in sentence_temp)|('遭遇' in sentence_temp): #print '---------------',sentence_temp temp_atrribut1 = self.attribute_define2(sentence_temp,'事件') #print temp_atrribut1 if((temp_atrribut1)==None): temp_atrribut2 = self.attribute_define2(sentence_temp,'袭击') #print temp_atrribut2 if temp_atrribut2==None: return fact_attribute.append(str(temp_atrribut2)) else: fact_attribute.append(str(temp_atrribut1)) #print '------------------' if(len(fact_attribute)==0): #print '事件属性:unkown!' return 'None' else: #print '事件属性1:', len(fact_attribute),''.join(fact_attribute) #print '事件属性:',max(fact_attribute, key=len) return max(fact_attribute, key=len) def organization_from_text(self,text): """ 事实三元组抽取的总控程序 Args: in_file_name: 输入文件的名称 #out_file_name: 输出文件的名称 begin_line: 读文件的起始行 end_line: 读文件的结束行 """ sentence_one = text.split("。") #print '---------------------------',sentence_one[0] ogniz = [] for num in range(len(sentence_one)-1): if('负责' in sentence_one[num]): if('宣称' in sentence_one[num]): #print sentence_one[num] sentence_temp = sentence_one[num].replace('“','') sentence_temp = sentence_temp.replace('”','') temp_org = self.organization_define(sentence_temp,'宣称') if(temp_org != None): ogniz.append(temp_org) if(len(ogniz)==0): if('宣称' in sentence_one[num]): #print sentence_one[num] sentence_temp = sentence_one[num].replace('“','') sentence_temp = sentence_temp.replace('”','') temp_org = self.organization_define1(sentence_temp,'宣称') if(temp_org != None): ogniz.append(temp_org) if(len(ogniz)==0): #print '组织:unkown!' 
return 'unknown' else: #print '组织:',max(ogniz, key=len) #print ogniz return max(ogniz, key=len) def death_num_from_text(self,text): """ 事实三元组抽取的总控程序 Args: in_file_name: 输入文件的名称 #out_file_name: 输出文件的名称 begin_line: 读文件的起始行 end_line: 读文件的结束行 """ text = text.replace(',','。') text = text.replace('、','。') sentence_one = text.split("。") death_num = None hurt_num = None total_num = None #print '---------------------------',sentence_one[0] for num in range(len(sentence_one)-1): if('死亡' in sentence_one[num])|('丧生' in sentence_one[num]): #print sentence_one[num] if(death_num == None): death_num = self.num_define(sentence_one[num]) #print '死亡人数:',death_num if('受伤' in sentence_one[num]): #print sentence_one[num] if(hurt_num == None): hurt_num = self.num_define(sentence_one[num]) #print '受伤人数:',hurt_num if('伤亡' in sentence_one[num]): #print sentence_one[num] if(total_num == None): total_num = self.num_define(sentence_one[num]) #print type(death_num),type(hurt_num),type(total_num) return death_num,hurt_num,total_num def fact_triple_extract(self,sentence, out_file): #print sentence """ 对于给定的句子进行事实三元组抽取 Args: sentence: 要处理的语句 """ words = self.segmentor.segment(sentence) postags = self.postagger.postag(words) netags = self.recognizer.recognize(words, postags) arcs = self.parser.parse(words, postags) child_dict_list = self.build_parse_child_dict(words, postags, arcs) Entity_Address=[] Entity_Name = [] for index in range(len(postags)): e1 = '' if netags[index][0] == 'S' or netags[index][0] == 'B': if 'Ns' in netags[index]: ni = index if netags[ni][0] == 'B': while netags[ni][0] != 'E': ni += 1 e1 = ''.join(words[index:ni+1]) else: e1 = words[ni] Entity_Address.append(e1) if "Nh" in netags[index]: ni = index if netags[ni][0]=='B': while netags[ni][0]!='E': ni+=1 e1= ''.join(words[index:ni+1]) else: e1=words[ni] Entity_Name .append(e1) Entity_Address = list(set(Entity_Address)) Entity_Name = list(set(Entity_Name)) for i in Entity_Name: print i AddressTp =[] LocateAddress = [] for index in range(len(postags)): # 抽取以谓词为中心的事实三元组 if postags[index] == 'v': child_dict = child_dict_list[index] # 主谓宾 Flag = False if child_dict.has_key('SBV') and child_dict.has_key('VOB'): e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) r = words[index] e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) out_file.write("主语谓语宾语关系\t(%s, %s, %s)\n" % (e1, r, e2)) for address in Entity_Address: if address in e1 and ( ("袭击" in e1 or "袭击" in e2) or ("事件" in e2 or "事件" in e1)): for name in Entity_Name: if name in e1: Flag == False break else: Flag = True if Flag == True: for i in Entity_Address: if i in e1 or i in e2: AddressTp.append(i) out_file.flush() # 定语后置,动宾关系 if arcs[index].relation == 'ATT': if child_dict.has_key('VOB'): e1 = self.complete_e(words, postags, child_dict_list, arcs[index].head - 1) r = words[index] e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) temp_string = r+e2 if temp_string == e1[:len(temp_string)]: e1 = e1[len(temp_string):] if temp_string not in e1: #print "定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2) out_file.write("定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2)) out_file.flush() # 含有介宾关系的主谓动补关系 if child_dict.has_key('SBV') and child_dict.has_key('CMP'): #e1 = words[child_dict['SBV'][0]] e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) cmp_index = child_dict['CMP'][0] r = words[index] + words[cmp_index] if child_dict_list[cmp_index].has_key('POB'): e2 = self.complete_e(words, postags, child_dict_list, 
child_dict_list[cmp_index]['POB'][0]) #print "介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2) out_file.write("介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2)) out_file.flush() # 尝试抽取命名实体有关的三元组 if netags[index][0] == 'S' or netags[index][0] == 'B': ni = index if netags[ni][0] == 'B': while netags[ni][0] != 'E': ni += 1 e1 = ''.join(words[index:ni+1]) else: e1 = words[ni] if arcs[ni].relation == 'ATT' and postags[arcs[ni].head-1] == 'n' and netags[arcs[ni].head-1] == 'O': r = self.complete_e(words, postags, child_dict_list, arcs[ni].head-1) if e1 in r: r = r[(r.index(e1)+len(e1)):] if arcs[arcs[ni].head-1].relation == 'ATT' and netags[arcs[arcs[ni].head-1].head-1] != 'O': e2 = self.complete_e(words, postags, child_dict_list, arcs[arcs[ni].head-1].head-1) mi = arcs[arcs[ni].head-1].head-1 li = mi if netags[mi][0] == 'B': while netags[mi][0] != 'E': mi += 1 e = ''.join(words[li+1:mi+1]) e2 += e if r in e2: e2 = e2[(e2.index(r)+len(r)):] if r+e2 in sentence: #print "人名//地名//机构\t(%s, %s, %s)\n" % (e1, r, e2) out_file.write("人名//地名//机构\t(%s, %s, %s)\n" % (e1, r, e2)) out_file.flush() AddressTp = list(set(AddressTp)) LocateAddress=AddressTp Tp = LocateAddress for i in LocateAddress: for k in AddressTp: if i!=k and (i in k): Tp.remove(i) address = '' for i in Tp: address+=i print "地点:",address def build_parse_child_dict(self,words, postags, arcs): """ 为句子中的每个词语维护一个保存句法依存儿子节点的字典 Args: words: 分词列表 postags: 词性列表 arcs: 句法依存列表 """ child_dict_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: if child_dict.has_key(arcs[arc_index].relation): child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) #if child_dict.has_key('SBV'): # print words[index],child_dict['SBV'] child_dict_list.append(child_dict) return child_dict_list def complete_e(self,words, postags, child_dict_list, word_index): """ 完善识别的部分实体 """ child_dict = child_dict_list[word_index] prefix = '' if child_dict.has_key('ATT'): for i in range(len(child_dict['ATT'])): prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i]) postfix = '' if postags[word_index] == 'v': if child_dict.has_key('VOB'): postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) if child_dict.has_key('SBV'): prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix return prefix + words[word_index] + postfix def attribute_define0(self,text,keywords): words = self.segmentor.segment(text) postags = self.postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): if(words[index]==keywords): for i in range(index): if(postags[index-i-2][0]=='n'): continue else: print "事件属性:","".join(words[index-i-1:index+1]) break def attribute_define1(self,text,keywords): words = self.segmentor.segment(text) postags = self.postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): if(words[index]==keywords): for i in range(index): if(postags[index-i-2][0]=='n'): continue else: if(i != 0): print "事件属性:","".join(words[index-i-1:index+1]) break def attribute_define2(self,text,keywords): #print text words = self.segmentor.segment(text) print words #print self.segmentor #print '\t'.join(words) #postags = postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): #print words[index] if(words[index]==keywords): for i in range(index): if(words[index-i-1]!=('发生' or 
'是')):#|(words[index-i-1]!='遭遇'): continue else: if(i != 0): attribute = "".join(words[index-i:index+1]) if attribute in '恐怖袭击事件': return return attribute else: return def organization_define(self,text,keywords): words = self.segmentor.segment(text) postags = self.postagger.postag(words)#词性标注 if keywords in text: for index in range(len(words)): if(words[index]==keywords): for i in range(index): if(postags[index-i-1][0]=='n'): continue else: if(words[index-1]=='组织')&(postags[index-2][0]!='n'): continue if(i != 0): print "组织:","".join(words[index-i:index]) return "".join(words[index-i:index]) def fact_attribute(self,in_file_name, out_file_name, begin_line, end_line): """ 事实三元组抽取的总控程序 Args: in_file_name: 输入文件的名称 #out_file_name: 输出文件的名称 begin_line: 读文件的起始行 end_line: 读文件的结束行 """ in_file = open(in_file_name, 'r') out_file = open(out_file_name, 'a') line_index = 1 sentence_number = 0 text_line = in_file.readline() while text_line: #小于起始段的直接跳过 if line_index < begin_line: text_line = in_file.readline() line_index += 1 continue if end_line != 0 and line_index > end_line: break sentence = text_line.strip() #长段(大于1000)直接跳过 if sentence == "" or len(sentence) > 1000: text_line = in_file.readline() line_index += 1 continue sentence_one = sentence.split(" ")#"。" for num in range(len(sentence_one)-1): attribute_define0(sentence_one[num],'事件') attribute_define2(sentence_one[num],'袭击') sentence_number += 1 if sentence_number % 50 == 0: print "%d done" % (sentence_number) text_line = in_file.readline() line_index += 1 in_file.close() out_file.close() ''' def fact_attribute_from_text(text): """ 事实三元组抽取的总控程序 Args: in_file_name: 输入文件的名称 #out_file_name: 输出文件的名称 begin_line: 读文件的起始行 end_line: 读文件的结束行 """ text = text.replace(',','。') sentence_one = text.split("。") fact_attribute = [] for num in range(len(sentence_one)-1): if('袭击' in sentence_one[num]): #attribute_define0(sentence_one[num],'事件') #print sentence_one[num] sentence_temp = multiple_replace(sentence_one[num]) if('发生' in sentence_temp)|('遭遇' in sentence_temp): print '---------------',sentence_temp temp_atrribut1 = self.attribute_define2(sentence_temp,'事件') fact_attribute.append(str(temp_atrribut1)) if((temp_atrribut1)==None): temp_atrribut2 = self.attribute_define2(sentence_temp,'袭击') fact_attribute.append(str(temp_atrribut2)) print '------------------' if(len(fact_attribute)==0): print '事件属性:unkown!' return 'unknown' else: print '事件属性1:', len(fact_attribute),fact_attribute print '事件属性:',max(fact_attribute, key=len) return max(fact_attribute, key=len) ''' '''
class NERTagger(object):
    def __init__(self, model_dir_path, com_blacklist):
        # Initialize the model file paths
        self.model_dir_path = model_dir_path
        self.cws_model_path = os.path.join(self.model_dir_path, 'cws.model')  # segmentation model, file name `cws.model`
        self.pos_model_path = os.path.join(self.model_dir_path, 'pos.model')  # POS-tagging model, file name `pos.model`
        self.ner_model_path = os.path.join(self.model_dir_path, 'ner.model')  # NER model, file name `ner.model`
        # Initialize the segmentation model
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)
        # Initialize the POS-tagging model
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)
        # Initialize the NER model
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)
        # Initialize the company-name blacklist
        self.com_blacklist = set()
        with open(com_blacklist, 'r', encoding='UTF-8') as f_com_blacklist:
            for line in f_com_blacklist:
                if len(line.strip()) > 0:
                    self.com_blacklist.add(line.strip())

    def ner(self, text, entity_dict):
        words = self.segmentor.segment(text)                     # segmentation
        post_tags = self.postagger.postag(words)
        ner_tags = self.recognizer.recognize(words, post_tags)   # named entity recognition
        # print('\t'.join(words))
        # print('\t'.join(post_tags))
        # print('\t'.join(ner_tags))
        # print('-' * 80)
        entity_list = []
        entity = ""
        for word, post_tag, ner_tag in zip(words, post_tags, ner_tags):
            tag = ner_tag[0]
            entity_type = ner_tag[2:]
            if tag == 'S':
                entity_list.append((word, entity_type))
            elif tag in 'BIE':
                entity += word
                if tag == 'E':
                    # check the company-name blacklist
                    if entity in self.com_blacklist:
                        entity_list.append((entity, "n"))
                    else:
                        entity_list.append((entity, entity_type))
                    entity = ""
            elif tag == 'O':
                if post_tag == 'nt':
                    entity += word
                else:
                    if entity != "":
                        entity_list.append((entity, 'nt'))
                        entity = ""
                    # Drop mistaken numeral tags, e.g. "大宗"
                    if post_tag == 'm' and not re.match("[0-9]+.*", word):
                        post_tag = 'n'
                    # Mark percentages among numerals
                    if post_tag == 'm' and re.match("[0-9.]+%", word):
                        post_tag = 'mp'
                    entity_list.append((word, post_tag))
        entity_list = self._ner_tag_by_dict(entity_dict, entity_list)
        return NERTaggedText(text, entity_list)

    def _ner_tag_by_dict(self, entity_dict, entity_list):
        # for item in entity_dict.items():
        #     print("\t".join(item))
        i = 0
        while i < len(entity_list) - 1:
            has_entity = False
            for entity_len in range(4, 1, -1):
                segment = "".join([x[0] for x in entity_list[i:i + entity_len]])
                # segment_uni = segment.decode('utf-8')
                segment_uni = segment
                if segment_uni in entity_dict:
                    has_entity = True
                    entity_list[i] = (segment, entity_dict[segment_uni])
                    del entity_list[i + 1:i + entity_len]
                    i = i + entity_len
                    break
            if not has_entity:
                i += 1
        return entity_list

    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
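# A hedged usage sketch for NERTagger above. The model directory, blacklist file,
# sample text, and empty entity_dict are illustrative assumptions; NERTaggedText is
# whatever result type the surrounding project defines.
if __name__ == '__main__':
    tagger = NERTagger('/path/to/ltp_data', 'com_blacklist.txt')   # hypothetical paths
    tagged = tagger.ner('阿里巴巴集团今日在杭州发布财报。', entity_dict={})
    print(tagged)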
class DSFN: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir:str,用户自定义词典目录 default_model_dir:str,ltp模型文件目录 """ entity_verb_new = entity_verb_new() all_entity = entity_verb_new.readAllEntity( "../../entity_verb//entity_verb_result\\all_entity.json") default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 def __init__(self, model_dir=default_model_dir, all_entity=all_entity): self.default_model_dir = model_dir # 加载ltp模型 # default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 self.segmentor = Segmentor() user_dict = "..\\source\\user.txt" segmentor_flag = self.segmentor.load_with_lexicon( os.path.join(default_model_dir, 'cws.model'), user_dict) # segmentor_flag = self.segmentor.load(os.path.join(default_model_dir, 'cws.model')) # 词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load( os.path.join(self.default_model_dir, 'pos.model')) # 命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load( os.path.join(self.default_model_dir, 'ner.model')) # 依存句法分析模型 self.parser = Parser() parser_flag = self.parser.load( os.path.join(self.default_model_dir, 'parser.model')) if segmentor_flag or postag_flag or ner_flag or parser_flag: # 可能有错误 print('load model failed') def segment(self, sentence, entity_postag=dict()): words = self.segmentor.segment(sentence) lemmas = [] for lemma in words: lemmas.append(lemma) return lemmas def getPostag(self): return self.postagger def postag(self, lemmas): """ Parameters ---------- lemmas : List,分词后的结果 entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns ------- words:WordUnit List,包括分词与词性标注的结果 """ words = [] # 词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): # 存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i + 1, lemmas[i], postags[i]) words.append(word) # self.postagger.release() #释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word:str,单词 Returns: pos_tag:str,该单词的词性标注 """ pos_tag = self.postagger.postag([word]) return pos_tag[0] def netag(self, words): """ 命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Parameters words : WordUnit list,包括分词与词性标注结果 Returns words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果 """ lemmas = [] # 存储分词后的结果 postags = [] # 存储词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 命名实体识别 netags = self.recognizer.recognize(lemmas, postags) print(netags) for netag in netags: print(netag) words_netag = EntityCombine().combine(words, netags) return words_netag def parse(self, words): """ 对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果 Returns *:sentenceUnit 句子单元 """ lemmas = [] # 分词结果 postags = [] # 词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation return SentenceUnit(words) def close(self): """ 关闭与释放 """ # pynlpir.close() self.postagger.release() self.recognizer.release() self.parser.release() def dsfn1_2_3_4COO(self, sentence, item1, item2): allTripes = [] """ 判断两个实体是否属于DSFN1的情况,并输出三元组 """ if (item1.dependency == "ATT"): AttWord = item1.head_word AttWordDict = dict() AttWordStr = "" while AttWord.ID < item2.ID: AttWordDict[AttWord.ID] = AttWord.lemma # AttWordStr += AttWord.lemma if (AttWord.dependency == "ATT"): AttWord = AttWord.head_word else: break if (AttWord.ID == item2.ID): flag = True while flag: len1 = len(AttWordDict) AttList = 
AttWordDict.keys() for id in range(item1.ID + 1, item2.ID): item = sentence.get_word_by_id(id) if item.head_word != None and item.head_word.ID in AttList and ( item.dependency == "ATT"): AttWordDict[item.ID] = item.lemma if len1 == len(AttWordDict): flag = False else: flag = True AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0]) AttWordStr = "" for i in AttWordDict: AttWordStr += i[1] print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, AttWordStr, item2.lemma]) """ 判断两个实体是否属于DSFN1的情况,并输出三元组 """ """ 考虑DSFN2的情况 """ if item1.dependency == "SBV" and item1.head_word.postag == "v": pred1 = item1.head_word predDict = dict() predDict[pred1.ID] = pred1.lemma if item2.dependency == "VOB" and item2.head_word.postag == "v": pred2 = item2.head_word predDict[pred2.ID] = pred2.lemma if (len(predDict) == 1): PredWordStr = "" for i in predDict: PredWordStr += predDict[i] print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, PredWordStr, item2.lemma]) """ 新加,为了考虑“习近平视察和访问上海”的情况 """ if len(predDict) == 2: num = self.get_entity_num_between(pred1, pred2, sentence) flagSBV = True flagVOB = True for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred2.ID: flagSBV = False if word.dependency == "VOB" and word.head_word.ID == pred1.ID: flagVOB = False print("pred1:" + pred1.lemma + ",pred2:" + pred2.lemma + ",num:" + str(num)) if num == 0: if flagVOB == True: print("DSFN2三元组:(" + item1.lemma + "," + pred1.lemma + "," + item2.lemma + ")") allTripes.append( [item1.lemma, pred1.lemma, item2.lemma]) if flagSBV == True: print("DSFN2三元组:(" + item1.lemma + "," + pred2.lemma + "," + item2.lemma + ")") allTripes.append( [item1.lemma, pred2.lemma, item2.lemma]) """ DSFN3.0 """ pred = None prep = None if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word elif item1.dependency == "FOB" and item2.dependency == "POB": # 考虑介词为“被”的情况,如 “小王被小明所陷害” pred = item1.head_word prep = item2.head_word c = item1 item1 = item2 item2 = c if pred != None and prep != None: if prep.dependency == "ADV": if prep.head_word.ID == pred.ID: pred2 = None object = None objectForPred2 = None for i in range(pred.ID + 1, len(sentence.words) + 1): item = sentence.get_word_by_id(i) if item.dependency == "VOB" and item.head_word.ID == pred.ID: object = item objectDict = dict() objectDict[object.ID] = object for word in sentence.words: if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == object.ID: objectDict[word.ID] = word objectDict = sorted(objectDict.items(), key=lambda item: item[0]) objectStr = "" for objectItem in objectDict: objectStr += objectItem[1].lemma print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + objectStr + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred.lemma + "" + objectStr, item2.lemma ]) if object == None: print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") allTripes.append( [item1.lemma, pred.lemma, item2.lemma]) """ DSFN4 """ pred = None prep = None prep1 = None pred2 = None if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word if prep.dependency == "CMP" and prep.head_word.postag == "v": pred2 = prep.head_word if pred2.ID == pred.ID: print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")") 
allTripes.append([ item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma ]) else: num = self.get_entity_num_between(pred1, pred2, sentence) flagSBV = True flagVOB = True for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred2.ID: flagSBV = False if word.dependency == "VOB" and word.head_word.ID == pred.ID: flagVOB = False # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0: flag = True for word in sentence.words: if word.dependency == "CMP" and word.head_word.ID == pred.ID: prep1 = word if prep1 != None: if flagVOB == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred.lemma + "" + prep1.lemma, item2.lemma ]) # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") if flagSBV == True: allTripes.append([ item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma ]) else: if flagVOB == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") allTripes.append( [item1.lemma, pred.lemma, item2.lemma]) if flagSBV == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma ]) """ DSFN5 """ # self.dsfn5and6(rawSentence,sentence,item1,item2) return allTripes def get_entity_num_between(self, verb1, verb2, sentence): """ 获得两个动词之间的实体数量 Parameters ---------- entity1 : WordUnit,动词1 entity2 : WordUnit,动词2 Returns: num:int,两动词间的实体数量 """ if verb1.ID > verb2.ID: c = verb1 verb1 = verb2 verb2 = c num = 0 i = verb1.ID while i < verb2.ID - 1: if self.is_entity(sentence.words[i]): num += 1 i += 1 return num def is_entity(self, entry): """判断词单元是否是实体 Args: entry:WordUnit,词单元 Returns: *:bool,实体(True),非实体(False) """ #候选实体词性列表 entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'i'] print(entry.lemma + " : " + entry.postag) if entry.postag in entity_postags: return True else: return False def dsfnAttCOO(self, sentence, item1, item2): item1Att = item1 item2Att = item2 while item1Att.dependency == "ATT": item1Att = item1Att.head_word allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2) if allTripe == None or len(allTripe) == 0: while item2Att.dependency == "ATT": item2Att = item2Att.head_word allTripe = self.dsfn1_2_3_4COO(sentence, item1, item2Att) if allTripe == None or len(allTripe) == 0: allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2Att) for tripe in allTripe: if tripe[0] == item1Att.lemma: tripe[0] = item1.lemma if tripe[2] == item2Att.lemma: tripe[2] = item2.lemma return allTripe def dsfn5COO(self, sentence, item1, item2): if item1.dependency == "COO": item1COO = item1.head_word allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2) # print(allTripes1) for tripe in allTripes1: if tripe[0] == item1COO.lemma: tripe[0] = item1.lemma elif tripe[2] == item1COO.lemma: tripe[2] = item1.lemma return allTripes1 # print("allTripes1"+str(allTripes1)) def dsfn6COO(self, sentence, item1, item2): if item2.dependency == "COO": item2COO = item2.head_word allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO) for tripe in allTripes2: if tripe[2] == item2COO.lemma: tripe[2] = item2.lemma elif tripe[0] == item2COO.lemma: tripe[0] = item2.lemma return allTripes2 def dsfn5and6COO(self, sentence, item1, item2): if item1.dependency == "COO": item1COO = item1.head_word if item2.dependency == "COO": item2COO = item2.head_word allTripe = self.dsfn1_2_3_4COO(sentence, 
item1COO, item2COO) for tripe in allTripe: if tripe[0] == item1COO.lemma and tripe[ 2] == item2COO.lemma: tripe[0] = item1.lemma tripe[2] = item2.lemma if tripe[2] == item1COO.lemma and tripe[ 0] == item2COO.lemma: tripe[2] = item1.lemma tripe[0] = item2.lemma return allTripe def dsfnStartCOO3(self, rawSentence, entity1, entity2): nounRelatedWithPosition = ['主席', '总理', '教授', '校长'] resultList = [] lemmas = dsfn.segment(rawSentence) words = dsfn.postag(lemmas) words_netag = dsfn.netag(words) sentence = dsfn.parse(words_netag) print(sentence.to_string()) for item in sentence.words: if (item.lemma == entity1): item1 = item if (item.lemma == entity2): item2 = item if item1.ID > item2.ID: c = item1 item1 = item2 item2 = c itemCopy1 = item1 itemCopy2 = item2 allTripes = self.dsfnStartCOO2(sentence, item1, item2) if allTripes != None and len(allTripes) == 0: if item1.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni' ] and item1.dependency == "ATT": item1 = item1.head_word while item1.dependency == "ATT": item1 = item1.head_word if 'n' in item1.postag and item1.postag not in [ 'nh', 'ns', 'nz', 'ni' ]: pass else: item1 = itemCopy1 if item2.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni' ] and item2.dependency == "ATT": item2 = item2.head_word while item2.dependency == "ATT": item2 = item2.head_word if ('n' in item2.postag or 'q' in item2.postag) and item2.postag not in [ 'nh', 'ns', 'nz', 'ni' ]: pass else: item2 = itemCopy2 allTripes = self.dsfnStartCOO2(sentence, item1, item2) print("注意") print(allTripes) if len(allTripes) != 0: for tripe in allTripes: if tripe[0] == item1.lemma: tripe[0] = itemCopy1.lemma elif tripe[2] == item1.lemma: tripe[2] = itemCopy1.lemma if tripe[0] == item2.lemma: tripe[0] = itemCopy2.lemma elif tripe[2] == item2.lemma: tripe[2] = itemCopy2.lemma print("12345") resultList.append(tripe) print("最终结果") print(np.array(set([tuple(t) for t in resultList]))) else: print("最终结果") print(np.array(set([tuple(t) for t in allTripes]))) def dsfnStartCOO2(self, sentence, item1, item2): nounRelatedWithPosition = ['主席', '总理', '教授', '校长'] resultList = [] itemCopy1 = item1 itemCopy2 = item2 """ 来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV] """ print(item1.lemma) print(item2.lemma) allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2) if len(allTripes) == 0: print("11111111") allTripes = self.dsfn5COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: print("2222222") allTripes = self.dsfn6COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: print("3333333") allTripes = self.dsfn5and6COO(sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # print("44444444444") # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第一次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) print("第二次") pred1 = None subForCoo = None for item in sentence.words: if item.postag == "v" and item.dependency == "COO": pred1 = item.head_word for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred1.ID: for phrase in sentence.words: if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID: subForCoo = phrase if subForCoo == None or ( subForCoo != None and subForCoo.ID == word.ID): # 处理动词COO的情况,必须要保证此并列动词没有额外主语。 # 考虑到:习近平主席视察厦门,李克强总理访问香港 word.head_word = item print(sentence.to_string()) allTripes = self.dsfn1_2_3_4COO( sentence, item1, item2) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO( sentence, item1, item2) if allTripes == None or len(allTripes) == 
0: # print("2222222") allTripes = self.dsfn6COO( sentence, item1, item2) if allTripes == None or len( allTripes) == 0: print("3333333") allTripes = self.dsfn5and6COO( sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第二次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: # if tripe[0] == item1.lemma: # tripe[0] = itemCopy1.lemma # elif tripe[2] == item1.lemma: # tripe[2] = itemCopy1.lemma # # if tripe[0] == item2.lemma: # tripe[0] = itemCopy2.lemma # elif tripe[2] == item2.lemma: # tripe[2] = itemCopy2.lemma resultList.append(tripe) print(np.array(set([tuple(t) for t in resultList]))) return resultList
import os
import re
from pyltp import Segmentor
from tqdm import tqdm

INPUT_PATH = '/home/brooksj/PycharmProjects/NLP12345/input'
LTPMODEL_PATH = os.path.join(INPUT_PATH, 'ltp_data_v3.4.0')
cws_model_path = os.path.join(LTPMODEL_PATH, 'cws.model')

seg = Segmentor()
seg.load_with_lexicon(cws_model_path, os.path.join(INPUT_PATH, 'lexicon_ex.txt'))

with open('./wiki.zh.txt.jian', 'r') as rf, open('./wiki.zh.segs.txt', 'w') as wf:
    wiki = tqdm(iter(rf.readlines()), desc=u'已分词0篇文章')
    i = 0
    for line in wiki:
        # str.split does not take a regex, so split on whitespace with re.split
        for sent in re.split(r'\s+', line):
            words = list(seg.segment(sent))
            wf.write(' '.join(words) + ' ')
        wf.write('\n')
        i += 1
        if i % 100 == 0:
            wiki.set_description(u'已分词%d篇文章' % i)
import os
from pyltp import Segmentor

LTP_DATA_DIR = r'D:\python\ltp_data_v3.4.0'
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')

segmentor = Segmentor()
segmentor.load(cws_model_path)
words = segmentor.segment('2019年,我国船舶工业以供给侧结构性改革为主线,不断推动行业向高质量发展转变。')
print(type(words))
print('\t'.join(words))
segmentor.release()

pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS-tagging model path, file name `pos.model`
from pyltp import Postagger
postagger = Postagger()               # initialize the instance
postagger.load(pos_model_path)        # load the model
words = ['元芳', '你', '怎么', '看']   # segmentation result
postags = postagger.postag(words)     # POS tagging
print('\t'.join(postags))
postagger.release()                   # release the model
class ZHProcessor: """ This class is for processing xml for non-English languages (currently Chinese). dataset_unproc => dataset_whole. """ def __init__(self): self.dp_dataset_unproc = path_parser.dataset_unproc self.dp_dataset_whole = path_parser.dataset_whole self.fp_top_unproc = path_parser.dataset_top_unproc self.fp_des_unproc = path_parser.dataset_des_unproc self.fp_top = path_parser.dataset_top self.fp_des = path_parser.dataset_des self.segmentor = Segmentor() self.segmentor.load(path_parser.cws) self.SIDE_PATTERN = '(?<=#s-{0}\n)[\s\S]*?(?=\n#e-{0})' # logger.info('CWS model fp: {0}'.format(path_parser.cws)) @deprecated def stanford_stuff(self): # from stanfordcorenlp import StanfordCoreNLP # from nltk.parse.corenlp import CoreNLPTokenizer # import corenlp # self.nlp = StanfordCoreNLP('http://localhost', # port=9000, # timeout=30000) # self.nlp = StanfordCoreNLP(os.environ["CORENLP_HOME"], lang='zh', memory='4g') # self.props = { # 'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation', # 'pipelineLanguage': 'zh', # 'outputFormat': 'json' # } # seg = StanfordSegmenter() # seg.default_config('zh') # sent = u'这是斯坦福中文分词器测试' # print(seg.segment(sent)) # with self.nlp as nlp: # for sent in sents: # print(sent) # print(nlp.word_tokenize(sent)) # with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client: # ann = client.annotate(text) # # sentence = ann.sentence[0] # assert corenlp.to_text(sentence) == text # print(sentence.text) # token = sentence.token[0] # print(token.lemma) pass @deprecated def sent2words(self, segmentor, sent): words = segmentor.segment(sent) logger.info('|'.join(words)) def para2sents(self, paragraph): for sent in re.findall('[^!?。\.\!\?]+[!?。\.\!\?]?', paragraph, flags=re.U): yield sent def proc_content(self, content, is_headline, use_sent_seg=True, convert2simple=False): if use_sent_seg: content = self.para2sents(content) proc_lines = list() for sent in content: proc_lines.append('#s-sent') if convert2simple: sent = Converter('zh-hans').convert(sent) words = self.segmentor.segment(sent) # logger.info('words: {0}'.format(words)) # logger.info('|'.join(words)) proc_lines.append('\n'.join(words)) proc_lines.append('#e-sent') if is_headline: proc_lines.insert(0, '#s-headline') proc_lines.append('#e-headline') else: proc_lines.insert(0, '#s-para') proc_lines.append('#e-para') return proc_lines def release_seg(self): self.segmentor.release() def get_xml_elements(self, xml_fp): def _get_para_info(): para_matches = list(re.finditer(re.compile(para_pattern), text)) if not para_matches: logger.error('No para in {0}'.format(xml_fp)) raise AssertionError # logger.info('para_matches {0}'.format(para_matches)) paras = list() para_spans = list() for para_m in para_matches: # if para_m.group() != '\n': paras.append(para_m.group()) para_spans.append(para_m.span()) # logger.info('paras: {0}'.format(paras)) # logger.info('para_spans: {0}'.format(para_spans)) para_info = list(zip(paras, para_spans)) # logger.info('para_info {0}'.format(para_info)) return para_info def _get_ind_headline_info(): ind_headline_info = list() headline_matches = list( re.finditer(re.compile(headline_pattern), text)) if headline_matches: headlines = list() headline_spans = list() for headline_m in headline_matches: # if headline_m.group() != '\n': headlines.append(headline_m.group()) headline_spans.append(headline_m.span()) headline_info = list(zip(headlines, headline_spans)) # logger.info('headline_info {0}'.format(headline_info)) for h_info in headline_info: h_start, 
h_end = h_info[1] in_para = False for p_info in para_info: p_start, p_end = p_info[1] if p_start <= h_start and h_end <= p_end: in_para = True # logger.info('headline in para ...') if not in_para: ind_headline_info.append(h_info) # logger.info('ind_headline_info {0}'.format(ind_headline_info)) return ind_headline_info def _sort_paras_and_headlines(): sorted_items = deepcopy(list(para_info)) if ind_headline_info: for ind_h_info in ind_headline_info: ind_h_start = ind_h_info[1][0] p_span_starts = [p_info[1][0] for p_info in para_info] # logger.info('p_span_starts: {0}'.format(p_span_starts)) insert_idx = None for idx, p_span_start in enumerate(p_span_starts): if ind_h_start < p_span_start: insert_idx = idx break item_dict = {'content': ind_h_info[0], 'is_headline': True} sorted_items.insert(insert_idx, item_dict) # deal with all paras left for idx, item in enumerate(sorted_items): if type(item) != dict: item_dict = {'content': item[0], 'is_headline': False} sorted_items[idx] = item_dict return sorted_items def _handle_nested_paras(): for idx, item in enumerate(sorted_items): if item['is_headline']: continue headline_matches = list( re.finditer(re.compile(headline_pattern), item['content'])) # headline_match = re.search(re.compile(headline_pattern), item['content']) if not headline_matches: continue new_items = list() for headline_m in headline_matches: inner_headline_item = { 'content': headline_m.group(), 'is_headline': True, } new_items.insert(0, inner_headline_item) rest_pattern = '(?<=\</h>\n)[\s\S]*' rest_match = re.search(re.compile(rest_pattern), item['content']) if rest_match: # logger.error('No rest in para: {0} of {1}'.format(item['content'], xml_fp)) # raise AssertionError rest_para_item = { 'content': rest_match.group(), 'is_headline': False, } new_items.insert(0, rest_para_item) del sorted_items[idx] for new_item in new_items: sorted_items.insert(idx, new_item) root_pattern = '(?<=\<{0}>\n)[\s\S]*?(?=\n</{0}>)' para_pattern = root_pattern.format('p') headline_pattern = root_pattern.format('h') with io.open(xml_fp, encoding='utf-8', errors='ignore') as f: text = f.read() # logger.info('text: {0}'.format(text)) para_info = _get_para_info() ind_headline_info = _get_ind_headline_info() sorted_items = _sort_paras_and_headlines() _handle_nested_paras() return sorted_items def dump_files(self, fp, text): with io.open(fp, mode='a', encoding='utf-8') as f: f.write(text) def proc_xml(self, xml_fp, out_fp): sorted_elements = self.get_xml_elements(xml_fp) proc_lines = list() # logger.info('sorted_elements: {0}'.format(sorted_elements)) for element in sorted_elements: element_lines = self.proc_content(**element) # logger.info('element_lines: {0}'.format(element_lines)) proc_lines.extend(element_lines) proc_lines.insert(0, '#s-doc') proc_lines.append('#e-doc') out_text = '\n'.join(proc_lines) self.dump_files(fp=out_fp, text=out_text) def proc_all_docs(self): xml_root = self.dp_dataset_unproc fns = [fn for fn in listdir(xml_root) if isfile(join(xml_root, fn))] for fn in tqdm(fns): xml_fp = join(xml_root, fn) out_fp = join(self.dp_dataset_whole, fn) self.proc_xml(xml_fp=xml_fp, out_fp=out_fp) def proc_side_top(self): proc_lines = list() with io.open(self.fp_top_unproc, encoding='utf-8') as f: for dom in doms_final: pattern = re.compile(self.SIDE_PATTERN.format(dom)) topics = re.findall(pattern, f.read())[0].split('\n') logger.info('topics: {0}'.format(topics)) top_proc_lines = self.proc_content(topics, is_headline=False, use_sent_seg=False) top_proc_lines.insert(0, '#s-{0}'.format(dom)) 
top_proc_lines.append('#e-{0}'.format(dom)) proc_lines.extend(top_proc_lines) f.seek(0, 0) with io.open(self.fp_top, mode='a', encoding='utf-8') as f: f.write('\n'.join(proc_lines)) def proc_side_des(self): proc_lines = list() with io.open(self.fp_des_unproc, encoding='utf-8') as f: for dom in doms_final: pattern = re.compile(self.SIDE_PATTERN.format(dom)) des_sents = re.findall(pattern, f.read())[0].split('\n') des_proc_lines = self.proc_content(des_sents, is_headline=False, use_sent_seg=False) des_proc_lines.insert(0, '#s-{0}'.format(dom)) des_proc_lines.append('#e-{0}'.format(dom)) proc_lines.extend(des_proc_lines) f.seek(0, 0) with io.open(self.fp_des, mode='a', encoding='utf-8') as f: f.write('\n'.join(proc_lines))
def main(argv): parser = argparse.ArgumentParser(description='...') parser.add_argument('-d','--domain',default='AISpeech',action='store',help='which domain: AISpeech or SpeechLab') parser.add_argument('-w','--weight',default=-1,action='store',metavar='number',type=float,help='weight number') parser.add_argument('--test',action='store_true') args = parser.parse_args() lex_file = open(os.path.join(PATH_TO_DATA[args.domain],'rules.txt'), 'r') weight = args.weight if not args.test: out_lex_file = open(os.path.join(PATH_TO_DATA[args.domain],'rules.release.txt'), 'w') else: out_lex_file = open(os.path.join(PATH_TO_DATA[args.domain],'rules.test.release.txt'), 'w') cws_model_path = PATH_TO_SPLIT_WORDS # 分词模型路径,模型名称为`cws.model` dict_path = os.path.join(PATH_TO_DATA[args.domain], 'dict.txt') # 领域相关的词典,用于帮助分词 segmentor = Segmentor() # 初始化实例 segmentor.load_with_lexicon(cws_model_path,dict_path) # 加载模型 if concept_fst_dict!={}: concept_fst_dict.clear() if constraints_names!={}: constraints_names.clear() macro_patterns = {} all_patterns = [] for line in lex_file: line=line.strip() if line == '' or line.startswith('%'): continue if '=>' not in line: #规则宏 pat_name, pat = line.strip(';').split('=') macro_patterns['${'+pat_name+'}'] = extract_simple_rules(pat.strip(), macro_patterns) else: #正常规则 pattern, node_info = line.split('=>') chunk_list = extract_simple_rules(pattern.strip(), macro_patterns) all_patterns.append((chunk_list, node_info)) isyms = ["<eps>"] label_voc = {} osyms = ["<eps>", "<unk>"] word_voc = {} #["<unk>"] #<unk> should be defined manually for chunk_list,_ in all_patterns: for word in chunk_list: if word[0] not in ['(', ')', '|']: word = word.strip('?') word_voc[word] = 1 osyms = osyms + list(word_voc) osyms_table = fst.SymbolTable() for idx,val in enumerate(osyms): osyms_table[val] = idx isyms_table = fst.SymbolTable() for idx,val in enumerate(isyms): isyms_table[val] = idx for pattern_idx, (pattern_chunk_list, node_info) in enumerate(all_patterns): # unique_rules = set() replace_mapping_dict = {} concept_fst = fst.StdTransducer(isyms=isyms_table, osyms=osyms_table) segment_stack = [{'start_of_this_segment':0, 'end_of_this_segment':0}] segment_stack[0]['value'] = '<eps>' cursor_head, cursor_tail = 0, 1 argument_count = 0 # print('Processing rule',pattern_chunk_list,'=>',node_info) for word in pattern_chunk_list: if word == '(': argument_count += 1 segment_stack.append({'start_of_this_segment':cursor_tail, 'end_of_this_segment':0, 'value':segment_stack[-1]['value']}) segment_stack[-1]['head_arc'] = [cursor_head, cursor_tail] cursor_tail += 1 cursor_head = cursor_tail - 1 elif word[0] == ')': if segment_stack[-1]['end_of_this_segment'] == 0: segment_stack[-1]['end_of_this_segment'] = cursor_head else: concept_fst.add_arc(cursor_head, segment_stack[-1]['end_of_this_segment'], '<eps>', '<eps>') cursor_head = segment_stack[-1]['end_of_this_segment'] if word == ')?': concept_fst.add_arc(segment_stack[-1]['head_arc'][0], segment_stack[-1]['head_arc'][1], '<eps>', '<eps>') concept_fst.add_arc(segment_stack[-1]['start_of_this_segment'], segment_stack[-1]['end_of_this_segment'], '<eps>', '<eps>') else: concept_fst.add_arc(segment_stack[-1]['head_arc'][0], segment_stack[-1]['head_arc'][1], '<eps>', '<eps>') segment_stack.pop() elif word == '|': if segment_stack[-1]['end_of_this_segment'] == 0: segment_stack[-1]['end_of_this_segment'] = cursor_head else: concept_fst.add_arc(cursor_head, segment_stack[-1]['end_of_this_segment'], '<eps>', '<eps>') cursor_head = 
segment_stack[-1]['start_of_this_segment'] else: if word[-1] == '?': concept_fst.add_arc(cursor_head, cursor_tail, '<eps>', '<eps>') word = word[:-1] else: pass next_state = add_arc(concept_fst, cursor_head, cursor_tail, word, segment_stack[-1]['value']) cursor_head = cursor_tail cursor_tail = next_state if segment_stack[-1]['end_of_this_segment'] == 0: segment_stack[-1]['end_of_this_segment'] = cursor_head else: concept_fst.add_arc(cursor_head, segment_stack[-1]['end_of_this_segment'], '<eps>', '<eps>') final_state_idx = segment_stack[-1]['end_of_this_segment'] concept_fst[final_state_idx].final = True concept_fst = concept_fst.inverse() concept_fst = concept_fst.determinize() concept_fst.minimize() concept_fst = concept_fst.inverse() t = concept_fst paths=list(t.paths()) random.shuffle(paths) if not args.test: if extract_proper_num(len(paths))>len(paths): paths=paths*(extract_proper_num(len(paths))//len(paths))+paths[:extract_proper_num(len(paths))%len(paths)] else: paths=paths[:extract_proper_num(len(paths))] else: paths=paths[:2] if len(paths)>=2 else paths for output in paths: raw_path = [] for arc in output: raw_path.append((t.osyms.find(arc.olabel), t.isyms.find(arc.ilabel))) path = raw_path input_seq = [] output_seq = [] for word, label in path: if word not in ['<eps>', u"ε"]: input_seq.append(word) if label not in ['<eps>', u"ε"]: if label == '_' and word not in ['<eps>', u"ε"]: output_seq.append(word) elif label != '_': output_seq.append(label) pattern = input_seq sentence = [item if item[0] != '$' else ',' for item in pattern] tags = [item for item in pattern if item[0] == '$'] sentence = ''.join(sentence) words = segmentor.segment(sentence) new_words = [] tag_idx = 0 for word in words: word = word if word == ',': word = tags[tag_idx] tag_idx += 1 new_words.append(word) new_rule_simple = ' '.join(new_words)+' => '+node_info out_lex_file.write(new_rule_simple+'\n')
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model path, file name `parser.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # segmentation model path, file name `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS-tagging model path, file name `pos.model`
srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')   # assumed SRL model path; its definition is missing from the original excerpt

segmentor = Segmentor()            # initialize the instance
segmentor.load(cws_model_path)     # load the model
labeller = SementicRoleLabeller()  # initialize the instance
labeller.load(srl_model_path)      # load the model
postagger = Postagger()            # initialize the instance
postagger.load(pos_model_path)     # load the model

words = segmentor.segment(
    '威尔士柯基犬是一种小型犬,它们的胆子很大,也相当机警,能高度警惕地守护家园,是最受欢迎的小型护卫犬之一。')  # segmentation
print('\t'.join(words))
postags = postagger.postag(words)  # POS tagging
print('\t'.join(postags))

parser = Parser()            # initialize the instance
parser.load(par_model_path)  # load the model
# words = ['元芳', '你', '怎么', '看']
# postags = ['nh', 'r', 'r', 'v']
arcs = parser.parse(words, postags)  # dependency parsing
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
# words = ['元芳', '你', '怎么', '看']
# postags = ['nh', 'r', 'r', 'v']
# arcs holds the dependency parsing result
roles = labeller.label(words, postags, arcs)  # semantic role labelling
# print the results
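# A possible completion of the "print the results" step above, using the role.index,
# arg.name and arg.range fields that pyltp SRL results expose (the print format is an
# assumption, not from the source), followed by releasing the models.
for role in roles:
    print(role.index, " ".join(
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments))

segmentor.release()
postagger.release()
parser.release()
labeller.release()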
# The segmentation here is poor: after segmenting, the target entities still have to be re-assembled.
news = open('./cleanNews.txt', 'r', encoding='utf-8').readlines()
segednews = open('./segedNews.txt', 'w', encoding='utf-8')

# The auxiliary lexicon is critical -- 沙曼维亚济 must not be split into 沙曼 / 维亚济.
import os
LTP_DIR = 'D:\\Python-dev\\ltp_data_v3.4.0'
cws_model_path = os.path.join(LTP_DIR, 'cws.model')  # segmentation model path
from pyltp import Segmentor
segmentor = Segmentor()  # initialize
segmentor.load_with_lexicon(cws_model_path, './NE.txt')  # segmentation aided by an external lexicon

for idx, sent in enumerate(news):
    segSent = list(segmentor.segment(sent))  # ['国家主席', '江泽民', '访问', '了', '美国']
    news[idx] = segSent  # ['国家主席江泽民访问了美国'] -> [['国家主席', '江泽民', '访问', '了', '美国']]
for n in news:
    for word in n:
        segednews.write(word + ' ')
    segednews.write('\n')
segednews.close()

# Even with the external dictionary, the segmenter still splits words it should not.
import re
news = open('./segedNews.txt', 'r', encoding='utf-8').readlines()
# e.g. 为 企业 改革 发展 建功立业 本报 北京 讯 中华 全国 总工会 今 发出 致 全国 各族 职工 慰问信 更加 紧密 地 团结 在 以 江泽民 同志 为 核心 的 党中央 周围
# 中华全国总工会 got split apart; it can be rescued with a regex that allows "none or once" of ' ' between its characters.
relations = open('./relation_pos_neg.txt', 'r', encoding='utf-8').readlines()
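A hedged sketch of the regex rescue the comment above describes, assuming NE.txt holds one entity per line (the sample entities are illustrative): for each entity, build a pattern that tolerates an optional space between its characters and substitute the unsplit form back in.

import re

def rejoin_entities(line, entities):
    for ent in entities:
        # '中 ?华 ?全 ?国 ?总 ?工 ?会' matches '中华 全国 总工会' as well as the unsplit form
        pattern = ' ?'.join(ent)
        line = re.sub(pattern, ent, line)
    return line

entities = [e.strip() for e in open('./NE.txt', 'r', encoding='utf-8') if e.strip()]
fixed_news = [rejoin_entities(line, entities) for line in news]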
from utils import inout import index if __name__ == '__main__': segmentor = Segmentor() segmentor.load_with_lexicon(inout.getLTPPath(index.CWS), inout.getResourcePath('userDic.txt')) postagger = Postagger() postagger.load(inout.getLTPPath(index.POS)) infoList = inout.readListFromTxt('./dn_test.txt') for sentence in infoList: # segmentor.load(inout.getLTPPath(index.CWS)) words = segmentor.segment(sentence) postags = postagger.postag(words) # result = zip(words,postags) # inout.printEscapeStr(result) segmentor.release() postagger.release() # recognizer = NamedEntityRecognizer() # recognizer.load(inout.getLTPPath(index.NER)) # netags = recognizer.recognize(words, postags) # recognizer.release() # result = zip(words,postags) # inout.printEscapeStr(result)
with open('../data/cont.txt', 'rb') as f: encoding = chardet.detect(f.readline()) print(encoding) with open('../data/cont.txt','r',encoding='utf8') as f: content = f.read() print(content) for line in content.split('\n'): print(line) print('----') seg = Segmentor() seg.load(model_path) words = seg.segment(content) seg.release() pos = Postagger() pos.load(pos_path) postag = pos.postag(words) pos.release() union = list(zip(list(words),list(postag))) union_list = [x+' :'+y for x,y in union] ner_path = os.path.abspath('./coach/ltp_data_v3.4.0/ner.model') recognizer = NamedEntityRecognizer() recognizer.load(ner_path) # print(list(words))
class LtpParser: def __init__(self): # ltp 模型路径 LTP_DATA_DIR = './ltp_data' # 分词模型 self.segmentor = Segmentor() self.segmentor.load_with_lexicon( os.path.join(LTP_DATA_DIR, 'cws.model'), 'ltp_data/lexicon.txt') # self.segmentor.load(os.path.join(LTP_DATA_DIR,'cws.model')) # 词性标注模型 self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model')) # 依存句法分析 self.parser = Parser() self.parser.load(os.path.join(LTP_DATA_DIR, 'parser.model')) # 命名实体识别 self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model')) # 语义角色标注 self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DATA_DIR, 'pisrl_win.model')) def format_label_role(self, words, postags): """ 语义角色标注 :param self: :param words: :param postags: :return: """ arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict def build_parse_child_dict(self, words, postags, arcs): """ 句法分析---为句子的每个词语维护一个保存语法依存儿子节点的字典 :param words: :param postags: :param arcs: :return: """ child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: # arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] for i in range(len(words)): a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_label_role(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
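A hypothetical usage sketch for the class above (the sentence and the printed shapes are illustrative; the model paths are the ones assumed in __init__):

if __name__ == '__main__':
    ltp = LtpParser()
    sentence = '李克强总理今天来我家了。'
    words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main(sentence)
    print(words)              # segmented words
    print(postags)            # one POS tag per word
    print(format_parse_list)  # per word: [relation, word, idx, postag, head_word, head_idx, head_postag]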
#coding:utf-8 import sys, os import json import re import pynlpir pynlpir.open() from pyltp import Segmentor, Postagger, Parser,NamedEntityRecognizer, SementicRoleLabeller ROOTDIR =os.path.join(os.path.dirname(__file__),os.pardir) sys.path.append(os.path.join(ROOTDIR, "lib")) #设置模型文件的路径 MODELDIR=os.path.join(ROOTDIR, "ltp_data") path = os.path.abspath(os.path.dirname(sys.argv[0])) path_in = path+'/car_review_split.txt' content_in = open(path_in,'r') path_out = path+'/test_word_list2.txt' content_out = open(path_out,'w') segmentor = Segmentor() segmentor.load_with_lexicon(os.path.join(MODELDIR,"cws.model"),"/data0/dm/dict/dict.txt") for line in content_in.readlines()[5000:10000]: print line line = re.sub("[\.\!\/_,$%^*(+\"\' ]+|[+——!,。?、~@#¥%……&*()]+".decode('utf-8'),"".decode('utf-8'),line.decode('utf-8')) line = line.encode('utf-8').strip() words = segmentor.segment(line) for j in words: content_out.write(j+' ') content_out.close()
class Extractor(): def __init__(self): self.__clause_list = [] self.__subclause_dict = {} self.__triple_list = [] self.__segmentor = Segmentor() self.__postagger = Postagger() self.__recognizer = NamedEntityRecognizer() self.__parser = Parser() self.__labeller = SementicRoleLabeller() self.__words_full_list = [] self.__netags_full_list = [] @property def clause_list(self): return self.__clause_list @property def triple_list(self): return self.__triple_list def split(self, words, postags): start = 0 for j, w in enumerate(words): if w == ',' or w == ',' or w == '。': clause = Clause(start, j - 1) self.__clause_list.append(clause) start = j + 1 for clause in self.__clause_list: clause.split(postags) for subclause in clause.sub_clause_list: self.add_inverted_idx(subclause) def add_inverted_idx(self, subclause): for i in range(subclause.start_idx, subclause.end_idx): self.__subclause_dict[i] = subclause def load(self): PATH = '' self.__segmentor.load(PATH + 'cws.model') self.__postagger.load(PATH + 'pos.model') self.__recognizer.load(PATH + 'ner.model') self.__parser.load(PATH + 'parser.model') self.__labeller.load(PATH + 'pisrl.model') def release(self): self.__segmentor.release() self.__postagger.release() self.__recognizer.release() self.__parser.release() self.__labeller.release() def clear(self): self.__triple_list = [] self.__words_full_list = [] self.__netags_full_list = [] def resolve_conference(self, entity): try: e_str = entity.get_content_as_str() except Exception: return '?' ref = e_str if e_str == '他' or e_str == '她': for i in range(entity.loc, -1, -1): if self.__netags_full_list[i].lower().endswith('nh'): ref = self.__words_full_list[i] break return ref def resolve_all_conference(self): for t in self.triple_list: e_str = self.resolve_conference(t.entity_1) try: t.entity_1.content = e_str.split() except Exception: pass def chunk_str(self, data): sents = SentenceSplitter.split(data) offset = 0 for sent in sents: try: words = self.__segmentor.segment(sent) postags = self.__postagger.postag(words) netags = self.__recognizer.recognize(words, postags) arcs = self.__parser.parse(words, postags) roles = self.__labeller.label(words, postags, netags, arcs) self.chunk_sent(list(words), list(postags), list(arcs), offset) offset += len(list(words)) self.__words_full_list.extend(list(words)) self.__netags_full_list.extend(list(netags)) except Exception as e: print(str(e)) pass def chunk_sent(self, words, postags, arcs, offset): root = [i for i, x in enumerate(arcs) if x.relation == 'HED'] if len(root) > 1: raise Exception('More than 1 HEAD arc is detected!') root = root[0] relations = [ i for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO' ] relations.insert(0, root) prev_e1 = None e1 = None for rel in relations: left_arc = [ i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV' ] if len(left_arc) > 1: pass # raise Exception('More than 1 left arc is detected!') elif len(left_arc) == 0: e1 = prev_e1 elif len(left_arc) == 1: left_arc = left_arc[0] leftmost = find_farthest_att(arcs, left_arc) e1 = Entity(1, [words[i] for i in range(leftmost, left_arc + 1)], offset + leftmost) prev_e1 = e1 right_arc = [ i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB' ] e2_list = [] if not right_arc: e2 = Entity(2, None) e2_list.append(e2) else: right_ext = find_farthest_vob(arcs, right_arc[0]) items = [ i for i, x in enumerate(arcs) if x.head == right_ext and x.relation == 'COO' ] items = right_arc + items count = 0 for item in items: leftmost = 
find_farthest_att(arcs, item) e2 = None if count == 0: e2 = Entity( 2, [words[i] for i in range(leftmost, right_ext + 1)], offset + leftmost) else: p1 = range(leftmost, right_arc[0]) p2 = range(item, find_farthest_vob(arcs, item) + 1) e2 = Entity( 2, [words[i] for i in itertools.chain(p1, p2)]) e2_list.append(e2) r = Relation(words[rel]) t = Triple(e1, e2, r) self.__triple_list.append(t) count += 1
class LtpParser(): def __init__(self): LTP_DIR = "./ltp_data" self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) '''长句切分''' def seg_long_sents(self, content): return [ sentence for sentence in re.split( r'[??!!。\n\r]', content.replace(' ', '').replace('\u3000', '').replace( '——', '')) if sentence ] '''ltp基本操作''' def basic_parser(self, words): postags = list(self.postagger.postag(words)) netags = self.recognizer.recognize(words, postags) return postags, netags '''基于实体识别结果,整理输出实体列表''' def format_entity(self, words, netags): name_entity_list = [] place_entity_list = [] organization_entity_list = [] ntag_E_Nh = "" ntag_E_Ni = "" ntag_E_Ns = "" index = 0 for item in zip(words, netags): word = item[0] ntag = item[1] if ntag[0] != "O": if ntag[0] == "S": if ntag[-2:] == "Nh": name_entity_list.append(word) elif ntag[-2:] == "Ni": organization_entity_list.append(word) else: place_entity_list.append(word) elif ntag[0] == "B": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word else: ntag_E_Ns = ntag_E_Ns + word elif ntag[0] == "I": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word else: ntag_E_Ns = ntag_E_Ns + word else: if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word name_entity_list.append(ntag_E_Nh) ntag_E_Nh = "" elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word organization_entity_list.append(ntag_E_Ni) ntag_E_Ni = "" else: ntag_E_Ns = ntag_E_Ns + word place_entity_list.append(ntag_E_Ns) ntag_E_Ns = "" index += 1 return place_entity_list '''获取地点''' def collect_locations(self, content): locations = [] sents = self.seg_long_sents(content) for i in sents: words = list(self.segmentor.segment(i)) postags, netags = self.basic_parser(words) locations += self.format_entity(words, netags) return locations
class LTP:
    def __init__(
            self,
            ltp_data_path=None,
            seg_lexicon=None,
            pos_lexicon=None,
    ):
        if not ltp_data_path:
            raise ValueError('请指定ltp用到的模型所在路径!!!')
        self.ltp_data_path = ltp_data_path  # ltp模型目录的路径
        self._cws_model_path = os.path.join(
            self.ltp_data_path, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
        self._pos_model_path = os.path.join(
            self.ltp_data_path, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
        self._ner_model_path = os.path.join(
            self.ltp_data_path, 'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`
        self._segmentor = Segmentor()  # 初始化实例
        if seg_lexicon:
            self._segmentor.load_with_lexicon(
                self._cws_model_path, seg_lexicon)  # 加载模型,第二个参数是您的外部词典文件路径
        else:
            self._segmentor.load(self._cws_model_path)
        self._postagger = Postagger()  # 初始化实例
        if pos_lexicon:
            self._postagger.load_with_lexicon(
                self._pos_model_path, pos_lexicon)  # 加载模型,第二个参数是您的外部词典文件路径
        else:
            self._postagger.load(self._pos_model_path)
        self._recognizer = NamedEntityRecognizer()  # 初始化实例
        self._recognizer.load(self._ner_model_path)  # 加载模型

    def cut(self, text):
        return self._segmentor.segment(text)

    def pos(self, text):
        words = self.cut(text)
        postags = self._postagger.postag(words)
        return zip(words, postags)

    def ner(self, text):
        """
        命名实体识别,提供三种命名识别,PER人名、LOC地名、ORG机构名
        :param text:
        :return:
        """
        # Nh代表人名, Ni代表机构名,Ns代表地点名字
        ner_dict = {'Nh': [], 'Ni': [], 'Ns': []}
        words = self.cut(text)
        postags = self._postagger.postag(words)
        nertags = self._recognizer.recognize(words, postags)
        ner_tmp = []
        for i, tag in enumerate(nertags):
            if tag == 'O':
                continue
            if tag.startswith('S'):
                tag = tag.split('-')[-1]
                ner_dict[tag].append(words[i])
            elif tag.startswith('B') or tag.startswith('I'):
                ner_tmp.append(words[i])
                continue
            elif tag.startswith('E'):
                ner_tmp.append(words[i])
                tag = tag.split('-')[-1]
                ner_dict[tag].append(''.join(ner_tmp))
                ner_tmp = []
        if ner_tmp:
            tag = list(nertags)[-1]
            tag = tag.split('-')[-1]
            ner_dict[tag].append(''.join(ner_tmp))
        ner_map = dict()
        ner_map['PER'] = ner_dict['Nh']
        ner_map['ORG'] = ner_dict['Ni']
        ner_map['LOC'] = ner_dict['Ns']
        return ner_map

    def release(self):
        self._segmentor.release()
        self._recognizer.release()
        self._postagger.release()
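A hedged usage sketch of this wrapper; the model directory path and the printed values are assumptions, only the output shape follows from the code above:

ltp = LTP(ltp_data_path='./ltp_data_v3.4.0')
print(ltp.ner('李克强总理今天访问了北京大学。'))
# expected shape (values illustrative): {'PER': ['李克强'], 'ORG': ['北京大学'], 'LOC': []}
ltp.release()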
# -*- coding: utf-8 -*- from pyltp import SentenceSplitter from pyltp import Segmentor from pyltp import Postagger from pyltp import NamedEntityRecognizer ldir = 'AgriKG\\ltp\\cws.model' #分词模型 dicdir = 'word' #外部字典 text = "贵州财经大学要举办大数据比赛吗?那让欧几里得去问问看吧!其实是在贵阳花溪区吧。" #中文分词 segmentor = Segmentor() #初始化实例 segmentor.load_with_lexicon(ldir, 'word') #加载模型 words = segmentor.segment(text) #分词 print(' '.join(words)) #分词拼接 words = list(words) #转换list print(u"分词:", words) segmentor.release()
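None of the snippets show what the external lexicon passed to load_with_lexicon looks like. It is expected to be a plain UTF-8 text file with one entry per line; the file named 'word' above might contain something like the assumed example below, matching the sample sentence:

贵州财经大学
欧几里得
花溪区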
f.close() #统计有多少篇新闻 newscnt = 0 for i in range(0, 100): if len(new[99-i]) != 0: newscnt = 100-i break ''' # 读入所有的标题,计算其向量置于title中 title = [] f = open(unicode('../Sentence/sentence/' + news + '/title.txt', 'utf8'), 'r') for line in f: words = segmentor.segment(line.strip()) word_vec_list = [] for word in words: if word not in stoplist and word in model: word_vec_list.append(model[word]) title.append(mean_vec(word_vec_list)) f.close() print '标题数:', len(title) # 读入所有标签 f = open(unicode('../Sentence/label/' + news + '/label.txt', 'utf8'), 'r') labels = [line.strip().replace('+', '') for line in f] f.close() for label in labels:
class LTPFunction: def __init__(self): self.segmentor = Segmentor() self.segmentor.load("model/cws.model") # self.segmentor.load_with_lexicon("model/cws.model", 'dict/segdict.txt') # 加载模型,第二个参数是您的外部词典文件路径 self.postagger = Postagger() # 初始化实例 self.postagger.load('model/pos.model') # 加载模型 self.parser = Parser() # 初始化实例 self.parser.load('model/parser.model') # 加载模型 self.recognizer = NamedEntityRecognizer() # 初始化实例 self.recognizer.load('model/ner.model') def __new__(cls, *args, **kwargs): if not hasattr(cls, 'instance'): cls.instance = super(LTPFunction, cls).__new__(cls) return cls.instance def ltp_seg(self, sentence): words = self.segmentor.segment(sentence) return [i for i in words] # 词性标注, 输入为分词后的列表, 输出为词性标注列表 def ltp_pos(self, word_list): # print(type(word_list)) words_postags = self.postagger.postag(word_list) # 词性标注 # postagger.release() return [i for i in words_postags] # 实体抽取, 输入为分词列表、词性标注列表, 输出为人名集合、地名集合、机构名集合 def ltp_ner(self, word_list, words_postags): netags = self.recognizer.recognize(word_list, words_postags) # print(" ".join(netags)) entity = '' tag = '' person_set = set() location_set = set() organization_set = set() for i in range(len(netags)): ner = netags[i].split('-') if ner[0] == 'O': if entity != '': if tag == 'Nh': person_set.add(entity) if tag == 'Ns': location_set.add(entity) if tag == 'Ni': organization_set.add(entity) entity = '' tag = '' elif ner[0] == 'S': if ner[1] == 'Nh': person_set.add(word_list[i]) if ner[1] == 'Ns': location_set.add(word_list[i]) if ner[1] == 'Ni': organization_set.add(word_list[i]) elif ner[0] == 'B': entity = entity + word_list[i] tag = ner[1] elif ner[0] == 'I': entity = entity + word_list[i] tag = ner[1] else: entity = entity + word_list[i] tag = ner[1] if tag == 'Nh': person_set.add(entity) if tag == 'Ns': location_set.add(entity) if tag == 'Ni': organization_set.add(entity) entity = '' tag = '' return person_set, location_set, organization_set # 句法分析, 输入为分词列表、词性标注列表, 输出为关系(Relation)列表、父节点(Head)列表 def ltp_parser(self, word_list, pos_list): relation_list = [] head_list = [] arcs = self.parser.parse(word_list, pos_list) # 句法分析 for arc in arcs: relation_list.append(arc.relation) head_list.append(arc.head) return relation_list, head_list # 外部 def ner_extract(self, title, content): # 最终得到的去重后的结果 person_set = set() location_set = set() organization_set = set() # 根据title获取对应的信息 title_words = self.ltp_seg(title) title_pos = self.ltp_pos(title_words) p_set, l_set, o_set = self.ltp_ner(title_words, title_pos) # 将获取的集合放入结果中 person_set = person_set | p_set location_set = location_set | l_set organization_set = organization_set | o_set ''' # 根据content获取对应的信息(分句后再处理) for sentence in re.split(r'[??!!。;;::\n\r]', content): if sentence: # print(sentence) sen_words = self.ltp_seg(sentence) sen_pos = self.ltp_pos(sen_words) p_set, l_set, o_set = self.ltp_ner(sen_words, sen_pos) # 将获取的集合放入结果中 person_set = person_set | p_set location_set = location_set | l_set organization_set = organization_set | o_set ''' return list(person_set), list(location_set), list(organization_set)
class LTP(object): def __init__(self): cws_model_path = os.path.join('../data/ltp_data_v3.4.0', 'cws.model') # 分词模型路径,模型名称为`cws.model` pos_model_path = os.path.join('../data/ltp_data_v3.4.0', 'pos.model') # 词性标注模型路径,模型名称为`pos.model` ner_model_path = os.path.join( '../data/ltp_data_v3.4.0', 'ner.model') # 命名实体识别模型路径,模型名称为`pos.model` self.segmentor = Segmentor() # 初始化实例 self.segmentor.load(cws_model_path) # 加载模型 self.postagger = Postagger() # 初始化实例 self.postagger.load(pos_model_path) # 加载模型 self.recognizer = NamedEntityRecognizer() # 初始化实例 self.recognizer.load(ner_model_path) # 加载模型 # 分词 def segment(self, text): words = list(self.segmentor.segment(text)) return words # 词性标注 def postag(self, words): postags = list(self.postagger.postag(words)) return postags # 获取文本中的时间 def get_time(self, text): # 开始分词及词性标注 words = self.segment(text) #print(words) postags = self.postag(words) #print(postags) time_lst = [] i = 0 for tag, word in zip(postags, words): if tag == 'nt': j = i while postags[j] == 'nt' or words[j] in ['至', '到']: j += 1 time_lst.append(''.join(words[i:j])) i += 1 # 去重子字符串的情形 remove_lst = [] for i in time_lst: for j in time_lst: if i != j and i in j: remove_lst.append(i) text_time_lst = [] for item in time_lst: if item not in remove_lst: text_time_lst.append(item) # print(text_time_lst) return text_time_lst #提取人名地名组织名 def get_name(self, text): persons, places, orgs = set(), set(), set() words = self.segment(text) #print("words333333333333") postags = self.postag(words) #print(postags) netags = list(self.recognizer.recognize(words, postags)) # 命名实体识别 #print(netags) # print(netags) i = 0 for tag, word in zip(netags, words): j = i # 人名 if 'Nh' in tag: if str(tag).startswith('S'): persons.add(word) elif str(tag).startswith('B'): union_person = word while netags[j] != 'E-Nh': j += 1 if j < len(words): union_person += words[j] persons.add(union_person) # 地名 if 'Ns' in tag: if str(tag).startswith('S'): places.add(word) elif str(tag).startswith('B'): union_place = word while netags[j] != 'E-Ns': j += 1 if j < len(words): union_place += words[j] places.add(union_place) # 机构名 if 'Ni' in tag: if str(tag).startswith('S'): orgs.add(word) elif str(tag).startswith('B'): union_org = word while netags[j] != 'E-Ni': j += 1 if j < len(words): union_org += words[j] orgs.add(union_org) i += 1 # print('人名:', ','.join(persons)) # print('地名:', ','.join(places)) # print('组织机构:', ','.join(orgs)) return persons, places, orgs # 释放模型 def free_ltp(self): self.segmentor.release() self.postagger.release()
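An illustrative call for get_time above; the printed result is what the nt-merging heuristic is meant to produce, not a verified run:

ltp = LTP()
print(ltp.get_time('会议于2021年3月1日至3月3日在北京举行。'))
# intended result: ['2021年3月1日至3月3日'] -- consecutive 'nt' tokens plus 至/到 are merged,
# then any extracted string that is a substring of another is dropped
ltp.free_ltp()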
class LtpParser(): def __init__(self): LTP_DIR = os.path.join(pwd_path, "../ltp_data_v3.4.0") self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) '''ltp基本操作''' def basic_parser(self, words): postags = list(self.postagger.postag(words)) netags = self.recognizer.recognize(words, postags) return postags, netags '''ltp获取词性''' def get_postag(self, words): return list(self.postagger.postag(words)) '''基于实体识别结果,整理输出实体列表''' def format_entity(self, words, netags, postags): name_entity_dist = {} name_entity_list = [] place_entity_list = [] organization_entity_list = [] ntag_E_Nh = "" ntag_E_Ni = "" ntag_E_Ns = "" index = 0 for item in zip(words, netags): word = item[0] ntag = item[1] if ntag[0] != "O": if ntag[0] == "S": if ntag[-2:] == "Nh": name_entity_list.append(word + '_%s ' % index) elif ntag[-2:] == "Ni": organization_entity_list.append(word + '_%s ' % index) else: place_entity_list.append(word + '_%s ' % index) elif ntag[0] == "B": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index elif ntag[0] == "I": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index else: if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index name_entity_list.append(ntag_E_Nh) ntag_E_Nh = "" elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index organization_entity_list.append(ntag_E_Ni) ntag_E_Ni = "" else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index place_entity_list.append(ntag_E_Ns) ntag_E_Ns = "" index += 1 name_entity_dist['nhs'] = self.modify_entity(name_entity_list, words, postags, 'nh') name_entity_dist['nis'] = self.modify_entity(organization_entity_list, words, postags, 'ni') name_entity_dist['nss'] = self.modify_entity(place_entity_list, words, postags, 'ns') return name_entity_dist '''entity修正,为rebuild_wordspostags做准备''' def modify_entity(self, entity_list, words, postags, tag): entity_modify = [] if entity_list: for entity in entity_list: entity_dict = {} subs = entity.split(' ')[:-1] start_index = subs[0].split('_')[1] end_index = subs[-1].split('_')[1] entity_dict['stat_index'] = start_index entity_dict['end_index'] = end_index if start_index == entity_dict['end_index']: consist = [ words[int(start_index)] + '/' + postags[int(start_index)] ] else: consist = [ words[index] + '/' + postags[index] for index in range(int(start_index), int(end_index) + 1) ] entity_dict['consist'] = consist entity_dict['name'] = ''.join( tmp.split('_')[0] for tmp in subs) + '/' + tag entity_modify.append(entity_dict) return entity_modify '''基于命名实体识别,修正words,postags''' def rebuild_wordspostags(self, name_entity_dist, words, postags): pre = ' '.join( [item[0] + '/' + item[1] for item in zip(words, postags)]) post = pre for et, infos in name_entity_dist.items(): if infos: for info in infos: post = post.replace(' '.join(info['consist']), info['name']) post = [ word for word in post.split(' ') if len(word.split('/')) == 2 and word.split('/')[0] ] words = [tmp.split('/')[0] for tmp in post] postags = [tmp.split('/')[1] for tmp in post] return 
words, postags '''依存关系格式化''' def syntax_parser(self, words, postags): arcs = self.parser.parse(words, postags) words = ['Root'] + words postags = ['w'] + postags tuples = list() for index in range(len(words) - 1): arc_index = arcs[index].head arc_relation = arcs[index].relation tuples.append([ index + 1, words[index + 1], postags[index + 1], words[arc_index], postags[arc_index], arc_index, arc_relation ]) return tuples '''为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, tuples): child_dict_list = list() for index, word in enumerate(words): child_dict = dict() for arc in tuples: if arc[3] == word: if arc[-1] in child_dict: child_dict[arc[-1]].append(arc) else: child_dict[arc[-1]] = [] child_dict[arc[-1]].append(arc) child_dict_list.append([word, postags[index], index, child_dict]) return child_dict_list '''parser主函数''' def parser_main(self, words, postags): tuples = self.syntax_parser(words, postags) child_dict_list = self.build_parse_child_dict(words, postags, tuples) return tuples, child_dict_list '''基础语言分析''' def basic_process(self, sentence): words = list(self.segmentor.segment(sentence)) postags, netags = self.basic_parser(words) name_entity_dist = self.format_entity(words, netags, postags) words, postags = self.rebuild_wordspostags(name_entity_dist, words, postags) return words, postags
def ws_data(self): f = open("pnn_annotated.txt", 'r') total_line = 0 orgin_attr = [0, 0, 0] judge_attr = [0, 0, 0] right = [0, 0, 0] segmentor = Segmentor() segmentor.load("cws.model") for line in f: total_line += 1 # print 'line has been read' value_num = [0, 0] result = line.split('\t') ws_lst = segmentor.segment(result[1]) # print 'this line is %s' % (line) for i in ws_lst: classify = '' try: value = self.setiment_words[i] except: pass else: if value == 1: print 'positive word:%s' % i value_num[0] += 1 elif value == -1: print 'negative word:%s' % i value_num[1] += 1 if value_num[0] == 0 and value_num[1] == 0: classify = 'neutral' judge_attr[0] += 1 elif value_num[0] == value_num[1] != 0: classify = 'neutral' judge_attr[0] += 1 elif value_num[0] > value_num[1]: classify = 'positive' judge_attr[1] += 1 else: classify = 'negative' judge_attr[2] += 1 print value_num print 'classfiy result:%s' % classify # the count of original'emotion if result[0] == '0': orgin_attr[0] += 1 elif result[0] == '1': orgin_attr[1] += 1 else: orgin_attr[2] += 1 if (int(result[0]) == 0 and value_num[0] == 0 and value_num[1] == 0): # print 'neutral' right[0] += 1 elif (int(result[0]) == 0 and value_num[0] == value_num[1] != 0): # print 'neutral' right[0] += 1 elif (int(result[0]) > 0 and value_num[0] >= value_num[1] and value_num[0] != 0): # print 'positive' right[1] += 1 elif (int(result[0]) < 0 and value_num[0] < value_num[1] and value_num[1] != 0): # print 'negative' right[2] += 1 # print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line)) print 'orgin\'s neutral, positive, negative' print orgin_attr print 'judge_attr neutral, positive, negative' print judge_attr print 'neutral, positive, negative' print right print (right[0] + right[1] + right[2]) print 'total_line %f\n' % total_line print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line)) segmentor.release()
class LanguageProcessor(object): def __init__(self, configure): self.system_logger = logging.getLogger("system_log") self._sentence_splitter = SentenceSplitter self._segmentor = Segmentor() self._segmentor.load_with_lexicon( configure.nlp_data_root + "/cws.model", configure.nlp_data_root + "/cws.tsv") self._segmentor_without_dictionary = Segmentor() self._segmentor_without_dictionary.load(configure.nlp_data_root + "/cws.model") self._postagger = Postagger() self._postagger.load(configure.nlp_data_root + "/pos.model") self._ner_recognizer = NamedEntityRecognizer() self._ner_recognizer.load(configure.nlp_data_root + "/ner.model") self._dependency_parser = Parser() self._dependency_parser.load(configure.nlp_data_root + "/parser.model") self._srl = SementicRoleLabeller() self._srl.load(configure.nlp_data_root + "/pisrl.model") self._stopwords_file = configure.nlp_data_root + "/stopwords.txt" self._stopwords_set = set([ tk.strip() for tk in codecs.open(self._stopwords_file, 'r', 'utf-8').read().splitlines() if tk.strip() != "" ]) self.entity_type_mapping_file = configure.entity_type_mapping_file self.entity_type_mapping = defaultdict() for line in codecs.open(self.entity_type_mapping_file, 'r', 'utf-8').read().splitlines(): elems = line.split("\t") if len(elems) != 2: log_str = "Format error in file [%s] !!!\n" % self.entity_type_mapping_file self.system_logger.error(log_str) sys.stderr.write(log_str) continue self.entity_type_mapping[int( elems[0])] = "<" + str(elems[0]) + "_" + elems[1].strip() + ">" self.all_entity_replacements = list(self.entity_type_mapping.values()) self.entity_type_exclusion_file = configure.entity_type_exclusion_file self.entity_type_exclusion_mapping = defaultdict() for line in codecs.open(self.entity_type_exclusion_file, 'r', 'utf-8').read().splitlines(): elems = line.split("\t") if len(elems) != 2: log_str = "Format error in file [%s] !!!\n" % self.entity_type_exclusion_file self.system_logger.error(log_str) sys.stderr.write(log_str) continue self.entity_type_exclusion_mapping[int( elems[0])] = "<" + str(elems[0]) + "_" + elems[1].strip() + ">" self.entity_type_exclusion_set = set( self.entity_type_exclusion_mapping.keys()) trie_tree, lexicon = generate_trie_tree(configure.nlp_data_root + "trust_list.tsv") self._lexicon = lexicon self._trie_tree = trie_tree self.entity_linker = EntityLinker() self.dialog_act_classifier = DialogActClassifier( configure.dialog_act_classifier_configure) self.emotion_classifier = EmotionClassifier( configure.emotion_classifier_configure) self.yes_no_classifier = YesNoClassifier( configure.attitude_classifier_configure) self.like_dislike_classifier = LikeDislikeClassifier( configure.attitude_classifier_configure) self.question_classifier = QuestionClassifier( configure.question_classifier_configure) self.question_response = "" self.noun_phrase_generator = noun_phrase_generator self.segmentor_plus = segmentor_plus self.turn_on = configure.turn_on def segment_chinese_sentence_without_dictionary(self, sentence): return list(self._segmentor_without_dictionary.segment(sentence)) def generate_query(self, raw_sentence): # LTP cannot handle whitespace, it will remove whitespace automatically. 
# Therefore, we have to replace whitespace with some 'safe' tokens # e.g., comma original_raw_sentence = raw_sentence spaces = {} raw_sentence = list(raw_sentence) for s in re.finditer(' ', original_raw_sentence): spaces[s.start()] = s assert raw_sentence[s.start()] == ' ' raw_sentence[s.start()] = ',' raw_sentence = ''.join(raw_sentence) splitted_sentences = list(SentenceSplitter.split(raw_sentence)) structured_sentences = [] sent_pos = 0 sent_index = 0 for one_sentence in splitted_sentences: sent_start = raw_sentence.index(one_sentence, sent_pos) sent_end = sent_start + len(one_sentence) sent_pos = sent_end tokens = list(self._segmentor.segment(one_sentence)) tokens = list( self._resegment(tokens, lexicon=self._lexicon, trie_tree=self._trie_tree)) postags = [None] * len(tokens) if "POS" in self.turn_on: postags = list(self._postagger.postag(tokens)) ners = [None] * len(tokens) if "POS" in self.turn_on and "NER" in self.turn_on: ners = list(self._ner_recognizer.recognize(tokens, postags)) arcs = [None] * len(tokens) if "POS" in self.turn_on and "DEP" in self.turn_on: arcs = self._dependency_parser.parse(tokens, postags) roles = [None] * len(tokens) if "POS" in self.turn_on and "DEP" in self.turn_on and "SRL" in self.turn_on: roles = list(self._srl.label(tokens, postags, arcs)) arcs = list(arcs) token_list = [] word_pos = 0 sentence_length = 0 for index, tk in enumerate(tokens): word_start = one_sentence.index(tk[0], word_pos) word_end = word_start + len(tk) word_pos = word_end # Recover token if tk == ',' and word_start + sent_start in spaces: tk = ' ' token = Token( index, tk, tk, word_start, word_end, postags[index], ners[index], arcs[index].head if arcs[index] is not None else None, arcs[index].relation if arcs[index] is not None else None, word_start + sent_start, word_end + sent_start, [], self._detect_stop_words(tk)) token_list.append(token) if token.pos == "wp" or len(tk.strip()) == 0: continue sentence_length += len(tk) if len(token_list) == 0: continue if roles != [None] * len(tokens): for role in roles: token = token_list[role.index] for arg in role.arguments: token.semantic_roles.append( (arg.name, arg.range.start, arg.range.end)) np_chunks = self._generate_np_chunks(token_list) # Recover sentence one_sentence = list(one_sentence) for s in spaces: if s >= sent_start and s < sent_end: n = s - sent_start assert one_sentence[n] == ',' one_sentence[n] = ' ' one_sentence = ''.join(one_sentence) sentence = Sentence(one_sentence, sent_start, sent_index, sentence_length, token_list, np_chunks) sent_index += 1 structured_sentences.append(sentence) return_query = Query(raw_sentence, splitted_sentences, structured_sentences) if "POS" in self.turn_on and "ATT" in self.turn_on: self._detect_attitude(return_query) if "POS" in self.turn_on and "QTY" in self.turn_on: self._detect_question_type(return_query) if "ACT" in self.turn_on: self._dialog_act_detector(return_query) if "EMO" in self.turn_on: self._detect_emotion_type(return_query) if "ENL" in self.turn_on: for sentence_index, sentence in enumerate( return_query.sentence_list): self._link_entity(sentence_index, return_query.sentence_list) return_query.full_entity_ids.extend(sentence.full_entity_ids) return_query.topic_entity_ids.extend(sentence.topic_entity_ids) return_query.normalized_text += sentence.normalized_text for entity_type, entity_list in sentence.type2entity.items(): return_query.type2entity[entity_type].extend(entity_list) if len(return_query.sentence_list) > 1: return_query.full_entity_ids = list( 
set(return_query.full_entity_ids)) return_query.topic_entity_ids = list( set(return_query.topic_entity_ids)) for entity_type in return_query.type2entity: return_query.type2entity[entity_type] = list( set(return_query.type2entity[entity_type])) if return_query.normalized_text in self.all_entity_replacements: return_query.single_entity = True return return_query def _detect_stop_words(self, word): return word.strip() in self._stopwords_set def _generate_np_chunks(self, token_list): return list(noun_phrase_generator(token_list)) def _dialog_act_detector(self, query): self.dialog_act_classifier.classify(query) query.map_dialog_act_to_sentence_index() def _detect_question_type(self, query): self.question_classifier.classify(query) def _detect_emotion_type(self, query): self.emotion_classifier.classify(query) def _detect_attitude(self, query): self.yes_no_classifier.classify(query) self.like_dislike_classifier.classify(query) def _link_entity(self, sentence_index, sentence_list): entity_mention = self.entity_linker.linking(sentence_index, sentence_list) sentence_list[sentence_index].update_entity( entity_mention, self.entity_type_mapping, self.entity_type_exclusion_set) sentence_list[sentence_index].normalize(self.entity_type_mapping) def _ploarity_detector(self, query): return [] def _resegment(self, tokens, lexicon=None, trie_tree=None): return self.segmentor_plus(tokens, lexicon=lexicon, trie_tree=trie_tree)
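The comment at the top of generate_query explains the whitespace workaround (LTP drops spaces, so they are swapped for commas and restored afterwards). A stripped-down sketch of just that trick, outside the class, with an illustrative input string:

import re

raw = '深度 学习 deep learning'
space_pos = [m.start() for m in re.finditer(' ', raw)]

chars = list(raw)
for p in space_pos:
    chars[p] = ','                 # a comma survives LTP tokenisation
safe_text = ''.join(chars)

# ... segment / tag `safe_text` here ...

chars = list(safe_text)
for p in space_pos:
    chars[p] = ' '                 # restore the original spaces afterwards
restored = ''.join(chars)
assert restored == raw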
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`

from pyltp import Segmentor
segmentor = Segmentor()  # 初始化实例
segmentor.load_with_lexicon(cws_model_path, 'dict1.txt')  # 加载模型

words = list(segmentor.segment(text))  # 分词
print(words, '分词结果')

## 词性标注
from pyltp import Postagger
postagger = Postagger()  # 初始化实例
postagger.load(pos_model_path)  # 加载模型
postags = postagger.postag(words)  # 词性标注
tags = list(postags)
print(tags, "词性标注")

## 依存句法分析
from pyltp import Parser
parser = Parser()  # 初始化实例
parser.load(par_model_path)  # 加载模型
class LtpParser: def __init__(self): LTP_DIR = "/Users/benkangchen/pyltp/model" self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model')) '''语义角色标注''' def format_labelrole(self, words, postags): arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: #arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
trDemo = data.TextRank.TextRank()
# 分析过程
i = 0
for rawline in f.readlines():  # 按行分析
    rawline_json = json.loads(rawline)
    # 获取标题行
    titleline = rawline_json['title']
    # 获取实体
    entity = set()
    eec = rawline_json["coreEntityEmotions"]
    for key in eec:
        entity.add(key["entity"])
    # 获取标题分词
    titleWords = segmentor.segment(titleline)
    # 创建标题集合(不重集合)
    titleWordsSet = set()
    # 标题行输出
    titleCut = "TitleCut="
    for w in titleWords:
        # 跳过含分隔符的词,只读入长度大于1的词
        if any(sep in w for sep in kickout):
            continue
        if len(str(w)) > 1:
            titleWordsSet.add(w)
            titleCut += w + " "
    # 获取内容行
    SC = rawline_json["content"].strip()
    content = rawline_json["title"].strip() + ' ' + rawline_json["content"].strip()
def cut_words_ltp(sentence): segmentor = Segmentor() segmentor.load(cws_model_path) words = segmentor.segment(sentence) segmentor.release() return list(words)
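cut_words_ltp above reloads cws.model on every call, which dominates the runtime when many sentences are segmented. A sketch of the usual alternative, keeping one loaded instance around (same cws_model_path and Segmentor as above):

_SEGMENTOR = Segmentor()
_SEGMENTOR.load(cws_model_path)

def cut_words_ltp_cached(sentence):
    # reuses the already-loaded model; call _SEGMENTOR.release() once at shutdown
    return list(_SEGMENTOR.segment(sentence))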
class LtpParser: def __init__(self): LTP_DIR = "ltp_data_v3.4.0" self.segmentor = Segmentor( model_path=os.path.join(LTP_DIR, "cws.model")) # self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger( model_path=os.path.join(LTP_DIR, "pos.model")) # self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser(os.path.join(LTP_DIR, "parser.model")) # self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer( os.path.join(LTP_DIR, "ner.model")) # self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) self.labeller = SementicRoleLabeller( os.path.join(LTP_DIR, 'pisrl_win.model')) # self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model')) '''语义角色标注''' def format_labelrole(self, words, postags): arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for index, role in roles: roles_dict[index] = { name: [name, arg[0], arg[1]] for name, arg in role } # for role in roles: # roles_dict[role.index] = {arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments} return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for idx, (head, relation) in enumerate(arcs): if head == index + 1: #arcs的索引从1开始,head 表示依存弧的父节点的索引。root节点的索引是0,从第一个词开始索引依次为1,2,3,。。。relation表示依存弧的关系。 if relation in child_dict: child_dict[relation].append(idx) else: child_dict[relation] = [] child_dict[relation].append(idx) # for arc_index in range(len(arcs)): # if arcs[arc_index].head == index+1: #arcs的索引从1开始 arc. head 表示依存弧的父结点的索引。 ROOT 节点的索引是 0 ,第一个词开始的索引依次为1,2,3,···arc. relation 表示依存弧的关系。 # if arcs[arc_index].relation in child_dict: # child_dict[arcs[arc_index].relation].append(arc_index)#添加 # else: # child_dict[arcs[arc_index].relation] = []#新建 # child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) # 每个词对应的依存关系父节点和其关系 rely_id = [head for head, relation in arcs] # 提取依存父节点id relation = [relation for head, relation in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
def bayes(self): segmentor = Segmentor() segmentor.load("cws.model") f = open('data/a_4.txt', 'r') # f = open('pnn_annotated.txt', 'r') # neutral, positive, negative class_freq = [0,0,0] # neutral, positive, negative word_total_count_freq = [0, 0, 0] each_word_count = [{}, {}, {}] accu = [0, 0] print 'train_set' for line in f: result = line.split('\t') ws_lst = segmentor.segment(result[1]) # print line # neutral if result[0] == '0': class_freq[0] += 1 for word in ws_lst: word_total_count_freq[0] += 1 if each_word_count[0].get(word) is not None: # print 'Not none' each_word_count[0][word] += 1 else: # print 'None' each_word_count[0][word] = 1 # positive elif result[0] == '1': class_freq[1] += 1 for word in ws_lst: word_total_count_freq[1] += 1 if each_word_count[1].get(word) is not None: # print 'Not none' each_word_count[1][word] += 1 else: # print 'None' each_word_count[1][word] = 1 # negative elif result[0] == '-1': class_freq[2] += 1 for word in ws_lst: word_total_count_freq[2] += 1 if each_word_count[2].get(word) is not None: # print 'Not none' each_word_count[2][word] += 1 else: # print 'None' each_word_count[2][word] = 1 # print class_freq # print word_total_count_freq # print each_word_count print 'total' total_class_count = class_freq[0] + class_freq[1] + class_freq[2] total_word_count = word_total_count_freq[0] + word_total_count_freq[1] + word_total_count_freq[2] print total_class_count # print total_word_count f.close() f1 = open('a_1.txt', 'r') # 中性 积极, , 消极 # neutral, positive, negative orgin = [0, 0, 0] # 本来有多少积极消极 judge = [0, 0, 0] # 判断出来了多少积极消极 judge_right = [0, 0, 0] print 'test_set_now' for line in f1: result = line.split('\t') # print result[1] ws_lst = segmentor.segment(result[1]) # print test_line[test_count] max = 0 tmp_result = 0 for test_iter in range(3): processed_wst = [] prob_this_class = 1 for test_word in ws_lst: if test_word not in processed_wst: prob_this_class *= (each_word_count[test_iter].get(test_word, 0) + 1.0) / float(word_total_count_freq[test_iter] + total_word_count) processed_wst.append(test_word) prob_this_class *= (float(class_freq[test_iter]) / float(total_class_count)) if prob_this_class > max: max = prob_this_class tmp_result = test_iter if tmp_result == 0: test_result = '0' judge[0] += 1 elif tmp_result == 1: test_result = '1' judge[1] += 1 elif tmp_result == 2: test_result = '-1' judge[2] += 1 if result[0] == test_result: accu[0] += 1 else: accu[1] += 1 if result[0] == '0': orgin[0] += 1 elif result[0] == '1': orgin[1] += 1 elif result[0] == '-1': orgin[2] += 1 if result[0] == '0' == test_result: judge_right[0] += 1 elif result[0] == '1' == test_result: judge_right[1] += 1 elif result[0] == '-1' == test_result: judge_right[2] += 1 # print 'result is %s'%test_result # print 'count are %d, %d'%(accu[0], accu[1]) # print 'accuracy so far: %f'%(float(accu[0]) / float(accu[0] + accu[1])) f1.close() print 'orgin' print orgin print 'judge' print judge print 'judge_right' print judge_right print 'total' print accu print 'accuracy this time is %f'%((float(accu[0]) / float(accu[0] + accu[1])))
def word_vec_case_set(cls, word_model_file, with_name=False, merge_by='mosaic'): """ 获取词向量特征集,认为词条最多10个词 如果以mosaic方式,每个词条被表示为50*10=500维 如果以sum方式,每个词条被表示为50维 :param word_model_file: 词向量模型文件 :param with_name: 正样例是否包含人名 :param merge_by: 词条中词项量的结合方式,mosaic或sum :return: 一个字典{pos_case:{正例},neg:{负例}} """ segmentor = Segmentor() segmentor.load("../word2vec_process/model/cws.model") word_vec_model = word2vec.Word2Vec.load('../word2vec_process/model/' + word_model_file) case_dict = cls.load_case_set(with_name) word_vec_case_dict = {} if merge_by == 'mosaic': # 以词向量拼接的方式构建词条表示,500维 pos_case_list = case_dict['pos_case'] pos_case_vec_dict = {} for pos_case in pos_case_list: case_words = segmentor.segment(pos_case) case_vec = [] is_useful = 0 for word in case_words: try: # 拼接 case_vec.extend(word_vec_model[unicode(word)].tolist()) is_useful = 1 except Exception, e: with open("./data/not_in_vocabulary.txt", 'a') as out_file: # 记录缺失词汇 out_file.write(word + '\n') # 多退少补 if len(case_vec) > 500: case_vec = case_vec[0:500] else: while (len(case_vec) < 500): case_vec.append(0) if is_useful: pos_case_vec_dict[pos_case] = case_vec # 负样本 neg_case_list = case_dict['neg'] neg_case_vec_dict = {} for neg_case in neg_case_list: case_words = segmentor.segment(neg_case) case_vec = [] is_useful = 0 for word in case_words: try: # 拼接 case_vec.extend(word_vec_model[unicode(word)].tolist()) is_useful = 1 except Exception, e: with open("./data/not_in_vocabulary.txt", 'a') as out_file: # 记录缺失词汇 out_file.write(word + '\n') # 多退少补 if len(case_vec) > 500: case_vec = case_vec[0:500] else: while (len(case_vec) < 500): case_vec.append(0) if is_useful: neg_case_vec_dict[neg_case] = case_vec
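The docstring above also describes a merge_by='sum' mode (one 50-dimensional vector per entry), but that branch is missing from this fragment. A minimal sketch of the idea under the same Python 2 / gensim assumptions, reusing the segmentor and word_vec_model loaded above:

import numpy as np

def case_to_sum_vec(case, segmentor, word_vec_model, dim=50):
    case_vec = np.zeros(dim)
    is_useful = 0
    for word in segmentor.segment(case):
        try:
            case_vec += word_vec_model[unicode(word)]  # sum instead of concatenating
            is_useful = 1
        except KeyError:
            pass  # out-of-vocabulary word, skipped
    return case_vec.tolist() if is_useful else None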
class PreProcessor(object) : def __init__(self , cws_model_path=CWS_MODEL_PATH , stop_words_dir=STOP_WORDS_DIR) : self.raw_data = None self.processed_data = None self.words_dict = None self.STOP_WORDS = self._load_stop_words(stop_words_dir) self.segmentor = Segmentor() self.segmentor.load(cws_model_path) def _load_stop_words(self , dir_name) : stop_words = set() cur_abs_dir_path = os.path.split(os.path.abspath(__file__))[0] dir_path = os.path.join(cur_abs_dir_path , dir_name) for file_name in os.listdir(dir_path) : file_path = os.path.join(dir_path , file_name) with open(file_path) as f : for line in f : word = line.strip() stop_words.add(word) for symbol in SENT_SPLIT_SYMBOLS : stop_words.add(symbol) return stop_words def load_raw_data(self , path) : with open(path) as f : self.raw_data = json.load(f) def _split_sentence(self , content) : ''' split content to sentence ''' sents = [] paras = content.split("\n") for paragraph in paras : split_rst = re.split(ur"[%s]+" %(SENT_SPLIT_SYMBOLS) , paragraph) # has space sents.extend(split_rst) return sents def _segment(self , unicode_line) : ''' return : list of words ''' utf8_line = unicode_line.strip().encode("utf8") words = list(self.segmentor.segment(utf8_line)) return words def _make_doc_data(self , url , title_seged , sents_seged) : return { 'url' : url , 'title' : title_seged , 'content' : sents_seged } def _add_word2words_dict(self , words) : for word in words : if word not in self.STOP_WORDS : word = word.lower() self.words_dict.add(word) def do_preprocessing(self) : logging.info("do preprocessing ...") self.processed_data = dict() self.words_dict = set() for page_id , page_data in self.raw_data.items() : url = page_data['url'] title = page_data["title"] content = page_data["content"] sents = self._split_sentence(content) # segment title_words = self._segment(title) content_words = [] for sent in sents : content_words.extend(self._segment(sent)) content_words.append(" ") # another space to avoid that they become one line when merging at output snippet self.processed_data[page_id] = self._make_doc_data(url , title_words , content_words) self._add_word2words_dict(title_words + content_words) logging.info('done.') def save_doc_data(self , to_path) : logging.info("saving doc data to ` %s `" %(to_path) ) with open(to_path , 'w') as of: json.dump(self.processed_data , of ) logging.info("done.") def save_words_dict(self , to_path) : logging.info("saving words dict to ` %s `" %(to_path)) words_list = list(self.words_dict) words_dict = {word : word_id for word_id , word in enumerate(words_list) } with open(to_path , 'w') as of : json.dump(words_dict , of , ensure_ascii=False) # json not support `set` logging.info("done.")
from pyltp import Segmentor
from pyltp import Postagger

seg = Segmentor()
seg.load("../ltp_model/cws.model")
words = seg.segment("你是那人间的四月天。")
print("| ".join(words))

pos = Postagger()
pos.load("../ltp_model/pos.model")
# postag expects the segmented word list, not a whitespace-joined string
postags = pos.postag(words)
for word, postag in zip(words, postags):
    print(word + '/' + postag, end=' ')
class Ltp(object): """docstring for Ltp""" def __init__(self): print 'Ltp:: __init__' self.segmentor = Segmentor() self.segmentor.load('model/cws.model') self.lexicon = None # end def load_lexicon(self, path): """ 加载专有名词词典 @params <String> path 词典列表 """ print 'Ltp:: load_lexicon' self.lexicon = Lexicon(path) # end def cut_to_word(self, data, tab_index=0): """ 把文本数据切分成词,返回词列表 @params <list> file模块读取的数据 @params <int> 按制表符分割后,需要分词的文本的索引 @return <list> 词列表 """ print 'Ltp:: cut_to_word' content = list() for line in data: line = line.strip() if line == "": continue text = line.split('\t') if len(text) < tab_index + 1: continue text = text[tab_index] proper_noun_list = list() if self.lexicon: text = self.lexicon.filter(text) words_list = self.segmentor.segment(text) for word in words_list: content.append(word) print 'Ltp:: cut_to_word done' return content # end def sentence_cut_to_words(self, sentence, tab_index=0): """ 把一句话分词,先不考虑专有名词 """ print 'Ltp:: sentence_cut_to_words' sentence = sentence.strip() if sentence == "": return False text = sentence.split('\t') if len(text) < tab_index + 1: return False text = text[tab_index] words_line = "" words_list = self.segmentor.segment(text) for word in words_list: words_line += word + " " return words_line # end def article_cut_to_words(self, article, tab_index=0): """ 把一个文本文件分词 @params article 是file读取的文本数据 """ print 'Ltp:: article_cut_to_words' content = "" for line in article: sentence = self.sentence_cut_to_words(line, tab_index) if sentence is False: continue content += sentence + "\n" return content # end def get_word_freq(self, words_list): """ 统计词频 @params <list> words_list 词列表 @return <dict> word_freq_dict 词频字典 """ print 'Ltp:: get_word_freq' word_freq_dict = dict() for word in words_list: if not word_freq_dict.has_key(word): word_freq_dict[word] = 0 word_freq_dict[word] += 1 print 'Ltp:: get_word_freq done' return word_freq_dict
def segment(self, sent): segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) words = segmentor.segment(sent) seg_sent = " ".join(words) return seg_sent
ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir) sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path # Set your own model path MODELDIR=os.path.join(ROOTDIR, "ltp_data") from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!' sentence = SentenceSplitter.split(paragraph)[0] segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) words = segmentor.segment(sentence) print "\t".join(words) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) postags = postagger.postag(words) # list-of-string parameter is support in 0.1.5 # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"]) print "\t".join(postags) parser = Parser() parser.load(os.path.join(MODELDIR, "parser.model")) arcs = parser.parse(words, postags) print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
class pnn_count(): def __init__(self): self.mydict = {} self.lines = [] self.lines_num = 3000 self.c = [0,0,0] #PNN self.w_c = [{},{},{}] self.segmentor = Segmentor() self.segmentor.load('cws.model') self.read_file() self.train() self.test() def read_file(self): f = open('pnn_annotated.txt','r') self.lines = f.readlines() f.close() def train(self): for i in range(0,self.lines_num/5*4): line = self.lines[i] line.strip('\n') line_array = line.split('\t') line = line_array[1] words = self.segmentor.segment(line) if line_array[0] == '1': pos = 0 elif line_array[0] =='0': pos = 1 else: pos = 2 for i in words: #calculate frequency if self.w_c[pos].has_key(i): self.w_c[pos][i] += 1 else: for a in range(0,3): self.w_c[a][i] = 0 self.w_c[pos][i] += 1 self.c[pos] += 1 def test(self): count = 0 v = len(self.mydict.keys()) for a in range(self.lines_num / 5 * 4, len(self.lines)-1): wholeline = self.lines[a] print wholeline result = [0.0,0.0,0.0] line_array = wholeline.split('\t') line = line_array[1] words = self.segmentor.segment(line) for i in range(0,3): pci = 1.0 * self.c[i] / (self.lines_num/5 *4) pwci = 1.0 sum_i = 0 for q in self.w_c[i].keys(): sum_i += self.w_c[i][q] for k in words: if self.w_c[i].has_key(k): pwci = pwci * (self.w_c[i][k] + 1) / (sum_i + v) result[i] = pci * pwci maxi = 0 for i in range(0,3): if result[i]>result[maxi]: maxi = i if maxi ==0: if line_array[0] == '1': count += 1 print "my guess is positive" elif maxi==1: if line_array[0] == '0': count += 1 print "my guess is neuter" else: if line_array[0] == '-1': count += 1 print "my guess is negative" print count * 1.0 /(self.lines_num/5)